ART: Single-frame deopt

Add deoptimization of a single frame. This works by removing the
managed code frame, jumping into the quick-to-interpreter bridge,
and having the bridge pick up a stored ShadowFrame.

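In outline, the new path looks like this (condensed from the diff
below; a sketch of the entry point, not the complete code):

    // Sketch of artDeoptimizeFromCompiledCode (see
    // quick_deoptimization_entrypoints.cc in this change).
    extern "C" NO_RETURN void artDeoptimizeFromCompiledCode(Thread* self) {
      // Save the (empty) invoke result and any pending exception; the
      // bridge restores them before interpreting the frame.
      JValue return_value;
      return_value.SetJ(0);
      self->PushDeoptimizationContext(return_value, false, self->GetException());

      // Build a ShadowFrame for the top managed frame only, point the
      // long-jump PC at the quick-to-interpreter bridge, and put the
      // deoptimized ArtMethod* into the first argument register.
      QuickExceptionHandler exception_handler(self, /* is_deoptimization */ true);
      exception_handler.DeoptimizeSingleFrame();
      exception_handler.UpdateInstrumentationStack();
      exception_handler.DeoptimizeSingleFrameArchDependentFixup();
      exception_handler.DoLongJump(/* smash_caller_saves */ false);
    }

The bridge then pops the ShadowFrame (stacked with the new
kSingleFrameDeoptimizationShadowFrame type) and finishes the method
in the interpreter.
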
An architecture-dependent fixup pass is needed after the stack walk.
On x86 and x86-64, the return address is left on the stack so we do
not need to push it there.

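The generic half of the plumbing is the new Context::SetArg0(), which
every architecture maps onto its first argument register so that the
bridge receives the ArtMethod* like a regular call; the ARM variant
from this change, as a sketch:

    void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
      SetGPR(R0, new_arg0_value);  // r0 carries arg0 (the ArtMethod*)
    }

On x86/x86-64, DeoptimizeSingleFrameArchDependentFixup() then rewinds
handler_quick_frame_ by one word so the return address that is already
on the stack gets reused.
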
Bug: 21611912
Change-Id: I06625685ced8b054244f8685ab50b238a705b9d2
diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc
index 403d348..8f6b1ff 100644
--- a/runtime/arch/arm/context_arm.cc
+++ b/runtime/arch/arm/context_arm.cc
@@ -30,9 +30,11 @@
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[SP] = &sp_;
   gprs_[PC] = &pc_;
+  gprs_[R0] = &arg0_;
   // Initialize registers with easy to spot debug values.
   sp_ = ArmContext::kBadGprBase + SP;
   pc_ = ArmContext::kBadGprBase + PC;
+  arg0_ = 0;
 }
 
 void ArmContext::FillCalleeSaves(const StackVisitor& fr) {
diff --git a/runtime/arch/arm/context_arm.h b/runtime/arch/arm/context_arm.h
index 77bb5c8..ea31055 100644
--- a/runtime/arch/arm/context_arm.h
+++ b/runtime/arch/arm/context_arm.h
@@ -45,6 +45,10 @@
     SetGPR(PC, new_pc);
   }
 
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(R0, new_arg0_value);
+  }
+
   bool IsAccessibleGPR(uint32_t reg) OVERRIDE {
     DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfCoreRegisters));
     return gprs_[reg] != nullptr;
@@ -84,7 +88,7 @@
   uintptr_t* gprs_[kNumberOfCoreRegisters];
   uint32_t* fprs_[kNumberOfSRegisters];
   // Hold values for sp and pc if they are not located within a stack frame.
-  uintptr_t sp_, pc_;
+  uintptr_t sp_, pc_, arg0_;
 };
 
 }  // namespace arm
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index e45d828..dc1cf8a 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -437,8 +437,8 @@
     ldr  r14, [r0, #56]   @ (LR from gprs_ 56=4*14)
     add  r0, r0, #12      @ increment r0 to skip gprs_[0..2] 12=4*3
     ldm  r0, {r3-r13}     @ load remaining gprs from argument gprs_
-    mov  r0, #0           @ clear result registers r0 and r1
-    mov  r1, #0
+    ldr  r0, [r0, #-12]   @ load r0 value from gprs_[0]
+    mov  r1, #0           @ clear result register r1
     bx   r2               @ do long jump
 END art_quick_do_long_jump
 
@@ -1142,7 +1142,7 @@
 
     /*
      * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
-     * will long jump to the upcall with a special exception of -1.
+     * will long jump to the interpreter bridge.
      */
     .extern artDeoptimizeFromCompiledCode
 ENTRY art_quick_deoptimize_from_compiled_code
diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc
index 60becc6..4477631 100644
--- a/runtime/arch/arm64/context_arm64.cc
+++ b/runtime/arch/arm64/context_arm64.cc
@@ -31,10 +31,12 @@
   std::fill_n(gprs_, arraysize(gprs_), nullptr);
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[SP] = &sp_;
-  gprs_[LR] = &pc_;
+  gprs_[kPC] = &pc_;
+  gprs_[X0] = &arg0_;
   // Initialize registers with easy to spot debug values.
   sp_ = Arm64Context::kBadGprBase + SP;
-  pc_ = Arm64Context::kBadGprBase + LR;
+  pc_ = Arm64Context::kBadGprBase + kPC;
+  arg0_ = 0;
 }
 
 void Arm64Context::FillCalleeSaves(const StackVisitor& fr) {
@@ -58,8 +60,8 @@
 }
 
 void Arm64Context::SetGPR(uint32_t reg, uintptr_t value) {
-  DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfXRegisters));
-  DCHECK_NE(reg, static_cast<uint32_t>(XZR));
+  DCHECK_LT(reg, arraysize(gprs_));
+  // Note: kPC lies past the real registers (beyond XZR), so do not require reg != XZR here.
   DCHECK(IsAccessibleGPR(reg));
   DCHECK_NE(gprs_[reg], &gZero);  // Can't overwrite this static value since they are never reset.
   *gprs_[reg] = value;
@@ -124,13 +126,13 @@
 extern "C" NO_RETURN void art_quick_do_long_jump(uint64_t*, uint64_t*);
 
 void Arm64Context::DoLongJump() {
-  uint64_t gprs[kNumberOfXRegisters];
+  uint64_t gprs[arraysize(gprs_)];
   uint64_t fprs[kNumberOfDRegisters];
 
   // The long jump routine called below expects to find the value for SP at index 31.
   DCHECK_EQ(SP, 31);
 
-  for (size_t i = 0; i < kNumberOfXRegisters; ++i) {
+  for (size_t i = 0; i < arraysize(gprs_); ++i) {
     gprs[i] = gprs_[i] != nullptr ? *gprs_[i] : Arm64Context::kBadGprBase + i;
   }
   for (size_t i = 0; i < kNumberOfDRegisters; ++i) {
diff --git a/runtime/arch/arm64/context_arm64.h b/runtime/arch/arm64/context_arm64.h
index 1c99f3c..11314e0 100644
--- a/runtime/arch/arm64/context_arm64.h
+++ b/runtime/arch/arm64/context_arm64.h
@@ -42,20 +42,25 @@
   }
 
   void SetPC(uintptr_t new_lr) OVERRIDE {
-    SetGPR(LR, new_lr);
+    SetGPR(kPC, new_lr);
+  }
+
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(X0, new_arg0_value);
   }
 
   bool IsAccessibleGPR(uint32_t reg) OVERRIDE {
-    DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfXRegisters));
+    DCHECK_LT(reg, arraysize(gprs_));
     return gprs_[reg] != nullptr;
   }
 
   uintptr_t* GetGPRAddress(uint32_t reg) OVERRIDE {
-    DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfXRegisters));
+    DCHECK_LT(reg, arraysize(gprs_));
     return gprs_[reg];
   }
 
   uintptr_t GetGPR(uint32_t reg) OVERRIDE {
+    // Note: PC isn't an available GPR (outside of internals), so don't allow retrieving the value.
     DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfXRegisters));
     DCHECK(IsAccessibleGPR(reg));
     return *gprs_[reg];
@@ -79,12 +84,15 @@
   void SmashCallerSaves() OVERRIDE;
   NO_RETURN void DoLongJump() OVERRIDE;
 
+  static constexpr size_t kPC = kNumberOfXRegisters;
+
  private:
-  // Pointers to register locations, initialized to null or the specific registers below.
-  uintptr_t* gprs_[kNumberOfXRegisters];
+  // Pointers to register locations, initialized to null or the specific registers below. We need
+  // an additional one for the PC.
+  uintptr_t* gprs_[kNumberOfXRegisters + 1];
   uint64_t * fprs_[kNumberOfDRegisters];
-  // Hold values for sp and pc if they are not located within a stack frame.
-  uintptr_t sp_, pc_;
+  // Hold values for sp, pc and arg0 if they are not located within a stack frame.
+  uintptr_t sp_, pc_, arg0_;
 };
 
 }  // namespace arm64
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 169bc38..6812178 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -941,7 +941,7 @@
     // Load GPRs
     // TODO: lots of those are smashed, could optimize.
     add x0, x0, #30*8
-    ldp x30, x1, [x0], #-16
+    ldp x30, x1, [x0], #-16          // LR & SP
     ldp x28, x29, [x0], #-16
     ldp x26, x27, [x0], #-16
     ldp x24, x25, [x0], #-16
@@ -958,10 +958,12 @@
     ldp x2, x3, [x0], #-16
     mov sp, x1
 
-    // TODO: Is it really OK to use LR for the target PC?
-    mov x0, #0
-    mov x1, #0
-    br  xLR
+    // Load the PC; it is at the end of the array (after the slot for the unused XZR). Use x1.
+    ldr x1, [x0, #33*8]
+    // And the value of x0.
+    ldr x0, [x0]
+
+    br  x1
 END art_quick_do_long_jump
 
     /*
diff --git a/runtime/arch/context.h b/runtime/arch/context.h
index 9ef761e..9af7c04 100644
--- a/runtime/arch/context.h
+++ b/runtime/arch/context.h
@@ -50,6 +50,9 @@
   // Sets the program counter value.
   virtual void SetPC(uintptr_t new_pc) = 0;
 
+  // Sets the first argument register.
+  virtual void SetArg0(uintptr_t new_arg0_value) = 0;
+
   // Returns whether the given GPR is accessible (read or write).
   virtual bool IsAccessibleGPR(uint32_t reg) = 0;
 
diff --git a/runtime/arch/mips/context_mips.cc b/runtime/arch/mips/context_mips.cc
index bc2bf68..08ab356 100644
--- a/runtime/arch/mips/context_mips.cc
+++ b/runtime/arch/mips/context_mips.cc
@@ -30,9 +30,11 @@
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[SP] = &sp_;
   gprs_[RA] = &ra_;
+  gprs_[A0] = &arg0_;
   // Initialize registers with easy to spot debug values.
   sp_ = MipsContext::kBadGprBase + SP;
   ra_ = MipsContext::kBadGprBase + RA;
+  arg0_ = 0;
 }
 
 void MipsContext::FillCalleeSaves(const StackVisitor& fr) {
diff --git a/runtime/arch/mips/context_mips.h b/runtime/arch/mips/context_mips.h
index 38cf29a..0affe53 100644
--- a/runtime/arch/mips/context_mips.h
+++ b/runtime/arch/mips/context_mips.h
@@ -78,12 +78,17 @@
   void SmashCallerSaves() OVERRIDE;
   NO_RETURN void DoLongJump() OVERRIDE;
 
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(A0, new_arg0_value);
+  }
+
  private:
   // Pointers to registers in the stack, initialized to null except for the special cases below.
   uintptr_t* gprs_[kNumberOfCoreRegisters];
   uint32_t* fprs_[kNumberOfFRegisters];
-  // Hold values for sp and ra (return address) if they are not located within a stack frame.
-  uintptr_t sp_, ra_;
+  // Hold values for sp and ra (return address) if they are not located within a stack frame, as
+  // well as the first argument.
+  uintptr_t sp_, ra_, arg0_;
 };
 }  // namespace mips
 }  // namespace art
diff --git a/runtime/arch/mips64/context_mips64.cc b/runtime/arch/mips64/context_mips64.cc
index cc6dc7e..2c17f1c 100644
--- a/runtime/arch/mips64/context_mips64.cc
+++ b/runtime/arch/mips64/context_mips64.cc
@@ -30,9 +30,11 @@
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[SP] = &sp_;
   gprs_[T9] = &t9_;
+  gprs_[A0] = &arg0_;
   // Initialize registers with easy to spot debug values.
   sp_ = Mips64Context::kBadGprBase + SP;
   t9_ = Mips64Context::kBadGprBase + T9;
+  arg0_ = 0;
 }
 
 void Mips64Context::FillCalleeSaves(const StackVisitor& fr) {
diff --git a/runtime/arch/mips64/context_mips64.h b/runtime/arch/mips64/context_mips64.h
index 26fbcfe..84b1c9b 100644
--- a/runtime/arch/mips64/context_mips64.h
+++ b/runtime/arch/mips64/context_mips64.h
@@ -78,14 +78,20 @@
   void SmashCallerSaves() OVERRIDE;
   NO_RETURN void DoLongJump() OVERRIDE;
 
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(A0, new_arg0_value);
+  }
+
  private:
   // Pointers to registers in the stack, initialized to null except for the special cases below.
   uintptr_t* gprs_[kNumberOfGpuRegisters];
   uint64_t* fprs_[kNumberOfFpuRegisters];
   // Hold values for sp and t9 if they are not located within a stack frame. We use t9 for the
-  // PC (as ra is required to be valid for single-frame deopt and must not be clobbered).
-  uintptr_t sp_, t9_;
+  // PC (as ra is required to be valid for single-frame deopt and must not be clobbered). We
+  // also need the first argument for single-frame deopt.
+  uintptr_t sp_, t9_, arg0_;
 };
+
 }  // namespace mips64
 }  // namespace art
 
diff --git a/runtime/arch/x86/context_x86.cc b/runtime/arch/x86/context_x86.cc
index 7096c82..987ad60 100644
--- a/runtime/arch/x86/context_x86.cc
+++ b/runtime/arch/x86/context_x86.cc
@@ -29,9 +29,11 @@
   std::fill_n(gprs_, arraysize(gprs_), nullptr);
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[ESP] = &esp_;
+  gprs_[EAX] = &arg0_;
   // Initialize registers with easy to spot debug values.
   esp_ = X86Context::kBadGprBase + ESP;
   eip_ = X86Context::kBadGprBase + kNumberOfCpuRegisters;
+  arg0_ = 0;
 }
 
 void X86Context::FillCalleeSaves(const StackVisitor& fr) {
diff --git a/runtime/arch/x86/context_x86.h b/runtime/arch/x86/context_x86.h
index c4a11d8..59beb12 100644
--- a/runtime/arch/x86/context_x86.h
+++ b/runtime/arch/x86/context_x86.h
@@ -44,6 +44,10 @@
     eip_ = new_pc;
   }
 
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(EAX, new_arg0_value);
+  }
+
   bool IsAccessibleGPR(uint32_t reg) OVERRIDE {
     DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfCpuRegisters));
     return gprs_[reg] != nullptr;
@@ -95,10 +99,10 @@
   // Pointers to register locations. Values are initialized to null or the special registers below.
   uintptr_t* gprs_[kNumberOfCpuRegisters];
   uint32_t* fprs_[kNumberOfFloatRegisters];
-  // Hold values for esp and eip if they are not located within a stack frame. EIP is somewhat
+  // Hold values for esp, eip and arg0 if they are not located within a stack frame. EIP is somewhat
   // special in that it cannot be encoded normally as a register operand to an instruction (except
   // in 64bit addressing modes).
-  uintptr_t esp_, eip_;
+  uintptr_t esp_, eip_, arg0_;
 };
 }  // namespace x86
 }  // namespace art
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 029a296..f3b15c9 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1695,7 +1695,7 @@
 
     /*
      * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
-     * will long jump to the upcall with a special exception of -1.
+     * will long jump to the interpreter bridge.
      */
 DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME ebx, ebx
diff --git a/runtime/arch/x86_64/context_x86_64.cc b/runtime/arch/x86_64/context_x86_64.cc
index 1fe2ef8..3dc7d71 100644
--- a/runtime/arch/x86_64/context_x86_64.cc
+++ b/runtime/arch/x86_64/context_x86_64.cc
@@ -29,9 +29,11 @@
   std::fill_n(gprs_, arraysize(gprs_), nullptr);
   std::fill_n(fprs_, arraysize(fprs_), nullptr);
   gprs_[RSP] = &rsp_;
+  gprs_[RDI] = &arg0_;
   // Initialize registers with easy to spot debug values.
   rsp_ = X86_64Context::kBadGprBase + RSP;
   rip_ = X86_64Context::kBadGprBase + kNumberOfCpuRegisters;
+  arg0_ = 0;
 }
 
 void X86_64Context::FillCalleeSaves(const StackVisitor& fr) {
diff --git a/runtime/arch/x86_64/context_x86_64.h b/runtime/arch/x86_64/context_x86_64.h
index 30bb9ec..f05b7f0 100644
--- a/runtime/arch/x86_64/context_x86_64.h
+++ b/runtime/arch/x86_64/context_x86_64.h
@@ -44,6 +44,10 @@
     rip_ = new_pc;
   }
 
+  void SetArg0(uintptr_t new_arg0_value) OVERRIDE {
+    SetGPR(RDI, new_arg0_value);
+  }
+
   bool IsAccessibleGPR(uint32_t reg) OVERRIDE {
     DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfCpuRegisters));
     return gprs_[reg] != nullptr;
@@ -82,10 +86,10 @@
   // Pointers to register locations. Values are initialized to null or the special registers below.
   uintptr_t* gprs_[kNumberOfCpuRegisters];
   uint64_t* fprs_[kNumberOfFloatRegisters];
-  // Hold values for rsp and rip if they are not located within a stack frame. RIP is somewhat
+  // Hold values for rsp, rip and arg0 if they are not located within a stack frame. RIP is somewhat
   // special in that it cannot be encoded normally as a register operand to an instruction (except
   // in 64bit addressing modes).
-  uintptr_t rsp_, rip_;
+  uintptr_t rsp_, rip_, arg0_;
 };
 }  // namespace x86_64
 }  // namespace art
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 861f802..2f438a3 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1724,18 +1724,18 @@
      * will long jump to the upcall with a special exception of -1.
      */
 DEFINE_FUNCTION art_quick_deoptimize
-    pushq %rsi                     // Entry point for a jump. Fake that we were called.
-                                   // Use hidden arg.
+    pushq %rsi                         // Entry point for a jump. Fake that we were called.
+                                       // Use hidden arg.
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
-                                   // Stack should be aligned now.
-    movq %gs:THREAD_SELF_OFFSET, %rdi         // Pass Thread.
-    call SYMBOL(artDeoptimize) // artDeoptimize(Thread*)
+                                       // Stack should be aligned now.
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // Pass Thread.
+    call SYMBOL(artDeoptimize)         // artDeoptimize(Thread*)
     UNREACHABLE
 END_FUNCTION art_quick_deoptimize
 
     /*
      * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
-     * will long jump to the upcall with a special exception of -1.
+     * will long jump to the interpreter bridge.
      */
 DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index d749664..dfd9fcd 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -22,13 +22,16 @@
 #include "mirror/class-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/object-inl.h"
+#include "quick_exception_handler.h"
 #include "stack.h"
 #include "thread.h"
 #include "verifier/method_verifier.h"
 
 namespace art {
 
-NO_RETURN static void artDeoptimizeImpl(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
+extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
+  ScopedQuickEntrypointChecks sqec(self);
+
   if (VLOG_IS_ON(deopt)) {
     LOG(INFO) << "Deopting:";
     self->Dump(LOG(INFO));
@@ -39,19 +42,26 @@
   self->QuickDeliverException();
 }
 
-extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
-  ScopedQuickEntrypointChecks sqec(self);
-  artDeoptimizeImpl(self);
-}
-
 extern "C" NO_RETURN void artDeoptimizeFromCompiledCode(Thread* self)
     SHARED_REQUIRES(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
+
+  // Deopt logging will be in DeoptimizeSingleFrame. It is there to take advantage of the
+  // specialized visitor that will show whether a method is Quick or Shadow.
+
   // Before deoptimizing to interpreter, we must push the deoptimization context.
   JValue return_value;
   return_value.SetJ(0);  // we never deoptimize from compiled code with an invoke result.
   self->PushDeoptimizationContext(return_value, false, self->GetException());
-  artDeoptimizeImpl(self);
+
+  QuickExceptionHandler exception_handler(self, true);
+  exception_handler.DeoptimizeSingleFrame();
+  exception_handler.UpdateInstrumentationStack();
+  exception_handler.DeoptimizeSingleFrameArchDependentFixup();
+  // We cannot smash the caller-saves, as we need the ArtMethod in a parameter register that would
+  // be caller-saved. This has the downside that we cannot track incorrect register usage down the
+  // line.
+  exception_handler.DoLongJump(false);
 }
 
 }  // namespace art
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 1302c5f..c2488cc 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -29,6 +29,7 @@
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "quick_exception_handler.h"
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
 #include "debugger.h"
@@ -646,27 +647,85 @@
   if (method->IsAbstract()) {
     ThrowAbstractMethodError(method);
     return 0;
+  }
+
+  JValue tmp_value;
+  ShadowFrame* deopt_frame = self->PopStackedShadowFrame(
+      StackedShadowFrameType::kSingleFrameDeoptimizationShadowFrame, false);
+  const DexFile::CodeItem* code_item = method->GetCodeItem();
+  DCHECK(code_item != nullptr) << PrettyMethod(method);
+  ManagedStack fragment;
+
+  DCHECK(!method->IsNative()) << PrettyMethod(method);
+  uint32_t shorty_len = 0;
+  auto* non_proxy_method = method->GetInterfaceMethodIfProxy(sizeof(void*));
+  const char* shorty = non_proxy_method->GetShorty(&shorty_len);
+
+  JValue result;
+
+  if (deopt_frame != nullptr) {
+    // Coming from single-frame deopt.
+
+    if (kIsDebugBuild) {
+      // Sanity-check: are the methods as expected? We check that the last shadow frame (the bottom
+      // of the call-stack) corresponds to the called method.
+      ShadowFrame* linked = deopt_frame;
+      while (linked->GetLink() != nullptr) {
+        linked = linked->GetLink();
+      }
+      CHECK_EQ(method, linked->GetMethod()) << PrettyMethod(method) << " "
+          << PrettyMethod(linked->GetMethod());
+    }
+
+    if (VLOG_IS_ON(deopt)) {
+      // Print out the stack to verify that it was a single-frame deopt.
+      LOG(INFO) << "Continue-ing from deopt. Stack is:";
+      QuickExceptionHandler::DumpFramesWithType(self, true);
+    }
+
+    mirror::Throwable* pending_exception = nullptr;
+    self->PopDeoptimizationContext(&result, &pending_exception);
+
+    // Push a transition back into managed code onto the linked list in thread.
+    self->PushManagedStackFragment(&fragment);
+
+    // Ensure that the stack is still in order.
+    if (kIsDebugBuild) {
+      class DummyStackVisitor : public StackVisitor {
+       public:
+        explicit DummyStackVisitor(Thread* self_in) SHARED_REQUIRES(Locks::mutator_lock_)
+            : StackVisitor(self_in, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames) {}
+
+        bool VisitFrame() OVERRIDE SHARED_REQUIRES(Locks::mutator_lock_) {
+          // Nothing to do here. In a debug build, SanityCheckFrame will do the work in the walking
+          // logic. Just always say we want to continue.
+          return true;
+        }
+      };
+      DummyStackVisitor dsv(self);
+      dsv.WalkStack();
+    }
+
+    // Restore the exception that was pending before deoptimization, then interpret the
+    // deoptimized frames.
+    if (pending_exception != nullptr) {
+      self->SetException(pending_exception);
+    }
+    interpreter::EnterInterpreterFromDeoptimize(self, deopt_frame, &result);
   } else {
-    DCHECK(!method->IsNative()) << PrettyMethod(method);
     const char* old_cause = self->StartAssertNoThreadSuspension(
         "Building interpreter shadow frame");
-    const DexFile::CodeItem* code_item = method->GetCodeItem();
-    DCHECK(code_item != nullptr) << PrettyMethod(method);
     uint16_t num_regs = code_item->registers_size_;
     void* memory = alloca(ShadowFrame::ComputeSize(num_regs));
     // No last shadow coming from quick.
     ShadowFrame* shadow_frame(ShadowFrame::Create(num_regs, nullptr, method, 0, memory));
     size_t first_arg_reg = code_item->registers_size_ - code_item->ins_size_;
-    uint32_t shorty_len = 0;
-    auto* non_proxy_method = method->GetInterfaceMethodIfProxy(sizeof(void*));
-    const char* shorty = non_proxy_method->GetShorty(&shorty_len);
     BuildQuickShadowFrameVisitor shadow_frame_builder(sp, method->IsStatic(), shorty, shorty_len,
                                                       shadow_frame, first_arg_reg);
     shadow_frame_builder.VisitArguments();
     const bool needs_initialization =
         method->IsStatic() && !method->GetDeclaringClass()->IsInitialized();
     // Push a transition back into managed code onto the linked list in thread.
-    ManagedStack fragment;
     self->PushManagedStackFragment(&fragment);
     self->PushShadowFrame(shadow_frame);
     self->EndAssertNoThreadSuspension(old_cause);
@@ -681,24 +740,26 @@
         return 0;
       }
     }
-    JValue result = interpreter::EnterInterpreterFromEntryPoint(self, code_item, shadow_frame);
-    // Pop transition.
-    self->PopManagedStackFragment(fragment);
 
-    // Request a stack deoptimization if needed
-    ArtMethod* caller = QuickArgumentVisitor::GetCallingMethod(sp);
-    if (UNLIKELY(Dbg::IsForcedInterpreterNeededForUpcall(self, caller))) {
-      // Push the context of the deoptimization stack so we can restore the return value and the
-      // exception before executing the deoptimized frames.
-      self->PushDeoptimizationContext(result, shorty[0] == 'L', self->GetException());
-
-      // Set special exception to cause deoptimization.
-      self->SetException(Thread::GetDeoptimizationException());
-    }
-
-    // No need to restore the args since the method has already been run by the interpreter.
-    return result.GetJ();
+    result = interpreter::EnterInterpreterFromEntryPoint(self, code_item, shadow_frame);
   }
+
+  // Pop transition.
+  self->PopManagedStackFragment(fragment);
+
+  // Request a stack deoptimization if needed
+  ArtMethod* caller = QuickArgumentVisitor::GetCallingMethod(sp);
+  if (UNLIKELY(Dbg::IsForcedInterpreterNeededForUpcall(self, caller))) {
+    // Push the context of the deoptimization stack so we can restore the return value and the
+    // exception before executing the deoptimized frames.
+    self->PushDeoptimizationContext(result, shorty[0] == 'L', self->GetException());
+
+    // Set special exception to cause deoptimization.
+    self->SetException(Thread::GetDeoptimizationException());
+  }
+
+  // No need to restore the args since the method has already been run by the interpreter.
+  return result.GetJ();
 }
 
 // Visits arguments on the stack placing them into the args vector, Object* arguments are converted
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index 5c13e13..63f43cf 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -20,6 +20,7 @@
 #include "art_method-inl.h"
 #include "dex_instruction.h"
 #include "entrypoints/entrypoint_utils.h"
+#include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "handle_scope-inl.h"
 #include "mirror/class-inl.h"
@@ -36,8 +37,9 @@
   : self_(self), context_(self->GetLongJumpContext()), is_deoptimization_(is_deoptimization),
     method_tracing_active_(is_deoptimization ||
                            Runtime::Current()->GetInstrumentation()->AreExitStubsInstalled()),
-    handler_quick_frame_(nullptr), handler_quick_frame_pc_(0), handler_method_(nullptr),
-    handler_dex_pc_(0), clear_exception_(false), handler_frame_depth_(kInvalidFrameDepth) {
+    handler_quick_frame_(nullptr), handler_quick_frame_pc_(0), handler_quick_arg0_(0),
+    handler_method_(nullptr), handler_dex_pc_(0), clear_exception_(false),
+    handler_frame_depth_(kInvalidFrameDepth) {
 }
 
 // Finds catch handler.
@@ -260,19 +262,25 @@
 // Prepares deoptimization.
 class DeoptimizeStackVisitor FINAL : public StackVisitor {
  public:
-  DeoptimizeStackVisitor(Thread* self, Context* context, QuickExceptionHandler* exception_handler)
+  DeoptimizeStackVisitor(Thread* self,
+                         Context* context,
+                         QuickExceptionHandler* exception_handler,
+                         bool single_frame)
       SHARED_REQUIRES(Locks::mutator_lock_)
       : StackVisitor(self, context, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
         exception_handler_(exception_handler),
         prev_shadow_frame_(nullptr),
-        stacked_shadow_frame_pushed_(false) {
+        stacked_shadow_frame_pushed_(false),
+        single_frame_deopt_(single_frame),
+        single_frame_done_(false) {
   }
 
   bool VisitFrame() OVERRIDE SHARED_REQUIRES(Locks::mutator_lock_) {
     exception_handler_->SetHandlerFrameDepth(GetFrameDepth());
     ArtMethod* method = GetMethod();
-    if (method == nullptr) {
-      // This is the upcall, we remember the frame and last pc so that we may long jump to them.
+    if (method == nullptr || single_frame_done_) {
+      // This is the upcall (or the next full frame in single-frame deopt); we remember the frame
+      // and last pc so that we may long jump to them.
       exception_handler_->SetHandlerQuickFramePc(GetCurrentQuickFramePc());
       exception_handler_->SetHandlerQuickFrame(GetCurrentQuickFrame());
       if (!stacked_shadow_frame_pushed_) {
@@ -295,7 +303,13 @@
       CHECK_EQ(GetFrameDepth(), 1U);
       return true;
     } else {
-      return HandleDeoptimization(method);
+      HandleDeoptimization(method);
+      if (single_frame_deopt_ && !IsInInlinedFrame()) {
+        // Single-frame deopt ends at the first non-inlined frame and needs to store that method.
+        exception_handler_->SetHandlerQuickArg0(reinterpret_cast<uintptr_t>(method));
+        single_frame_done_ = true;
+      }
+      return true;
     }
   }
 
@@ -304,7 +318,7 @@
     return static_cast<VRegKind>(kinds.at(reg * 2));
   }
 
-  bool HandleDeoptimization(ArtMethod* m) SHARED_REQUIRES(Locks::mutator_lock_) {
+  void HandleDeoptimization(ArtMethod* m) SHARED_REQUIRES(Locks::mutator_lock_) {
     const DexFile::CodeItem* code_item = m->GetCodeItem();
     CHECK(code_item != nullptr) << "No code item for " << PrettyMethod(m);
     uint16_t num_regs = code_item->registers_size_;
@@ -448,16 +462,20 @@
       // Will be popped after the long jump after DeoptimizeStack(),
       // right before interpreter::EnterInterpreterFromDeoptimize().
       stacked_shadow_frame_pushed_ = true;
-      GetThread()->PushStackedShadowFrame(new_frame,
-                                          StackedShadowFrameType::kDeoptimizationShadowFrame);
+      GetThread()->PushStackedShadowFrame(
+          new_frame,
+          single_frame_deopt_
+              ? StackedShadowFrameType::kSingleFrameDeoptimizationShadowFrame
+              : StackedShadowFrameType::kDeoptimizationShadowFrame);
     }
     prev_shadow_frame_ = new_frame;
-    return true;
   }
 
   QuickExceptionHandler* const exception_handler_;
   ShadowFrame* prev_shadow_frame_;
   bool stacked_shadow_frame_pushed_;
+  const bool single_frame_deopt_;
+  bool single_frame_done_;
 
   DISALLOW_COPY_AND_ASSIGN(DeoptimizeStackVisitor);
 };
@@ -468,13 +486,46 @@
     self_->DumpStack(LOG(INFO) << "Deoptimizing: ");
   }
 
-  DeoptimizeStackVisitor visitor(self_, context_, this);
+  DeoptimizeStackVisitor visitor(self_, context_, this, false);
   visitor.WalkStack(true);
 
   // Restore deoptimization exception
   self_->SetException(Thread::GetDeoptimizationException());
 }
 
+void QuickExceptionHandler::DeoptimizeSingleFrame() {
+  DCHECK(is_deoptimization_);
+
+  if (VLOG_IS_ON(deopt) || kDebugExceptionDelivery) {
+    LOG(INFO) << "Single-frame deopting:";
+    DumpFramesWithType(self_, true);
+  }
+
+  DeoptimizeStackVisitor visitor(self_, context_, this, true);
+  visitor.WalkStack(true);
+
+  // The PC needs to point to the quick-to-interpreter bridge.
+  int32_t offset;
+#ifdef __LP64__
+  offset = GetThreadOffset<8>(kQuickQuickToInterpreterBridge).Int32Value();
+#else
+  offset = GetThreadOffset<4>(kQuickQuickToInterpreterBridge).Int32Value();
+#endif
+  handler_quick_frame_pc_ = *reinterpret_cast<uintptr_t*>(
+      reinterpret_cast<uint8_t*>(self_) + offset);
+}
+
+void QuickExceptionHandler::DeoptimizeSingleFrameArchDependentFixup() {
+  // Architecture-dependent work. This is to get the return address right for x86 and x86-64.
+
+  if (kRuntimeISA == InstructionSet::kX86 || kRuntimeISA == InstructionSet::kX86_64) {
+    // On x86, the return address is on the stack, so just reuse it. Otherwise we would have to
+    // change how longjump works.
+    handler_quick_frame_ = reinterpret_cast<ArtMethod**>(
+        reinterpret_cast<uintptr_t>(handler_quick_frame_) - sizeof(void*));
+  }
+}
+
 // Unwinds all instrumentation stack frame prior to catch handler or upcall.
 class InstrumentationStackVisitor : public StackVisitor {
  public:
@@ -529,15 +580,67 @@
   }
 }
 
-void QuickExceptionHandler::DoLongJump() {
+void QuickExceptionHandler::DoLongJump(bool smash_caller_saves) {
   // Place context back on thread so it will be available when we continue.
   self_->ReleaseLongJumpContext(context_);
   context_->SetSP(reinterpret_cast<uintptr_t>(handler_quick_frame_));
   CHECK_NE(handler_quick_frame_pc_, 0u);
   context_->SetPC(handler_quick_frame_pc_);
-  context_->SmashCallerSaves();
+  context_->SetArg0(handler_quick_arg0_);
+  if (smash_caller_saves) {
+    context_->SmashCallerSaves();
+  }
   context_->DoLongJump();
   UNREACHABLE();
 }
 
+// Prints out methods with their type of frame.
+class DumpFramesWithTypeStackVisitor FINAL : public StackVisitor {
+ public:
+  explicit DumpFramesWithTypeStackVisitor(Thread* self, bool show_details = false)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      : StackVisitor(self, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        show_details_(show_details) {}
+
+  bool VisitFrame() OVERRIDE SHARED_REQUIRES(Locks::mutator_lock_) {
+    ArtMethod* method = GetMethod();
+    if (show_details_) {
+      LOG(INFO) << "|> pc   = " << std::hex << GetCurrentQuickFramePc();
+      LOG(INFO) << "|> addr = " << std::hex << reinterpret_cast<uintptr_t>(GetCurrentQuickFrame());
+      if (GetCurrentQuickFrame() != nullptr && method != nullptr) {
+        LOG(INFO) << "|> ret  = " << std::hex << GetReturnPc();
+      }
+    }
+    if (method == nullptr) {
+      // Transition; keep going, since we want to unwind over bridges all the way.
+      if (show_details_) {
+        LOG(INFO) << "N  <transition>";
+      }
+      return true;
+    } else if (method->IsRuntimeMethod()) {
+      if (show_details_) {
+        LOG(INFO) << "R  " << PrettyMethod(method, true);
+      }
+      return true;
+    } else {
+      bool is_shadow = GetCurrentShadowFrame() != nullptr;
+      LOG(INFO) << (is_shadow ? "S" : "Q")
+                << ((!is_shadow && IsInInlinedFrame()) ? "i" : " ")
+                << " "
+                << PrettyMethod(method, true);
+      return true;  // Go on.
+    }
+  }
+
+ private:
+  const bool show_details_;
+
+  DISALLOW_COPY_AND_ASSIGN(DumpFramesWithTypeStackVisitor);
+};
+
+void QuickExceptionHandler::DumpFramesWithType(Thread* self, bool details) {
+  DumpFramesWithTypeStackVisitor visitor(self, details);
+  visitor.WalkStack(true);
+}
+
 }  // namespace art
diff --git a/runtime/quick_exception_handler.h b/runtime/quick_exception_handler.h
index 2e05c7e..89d6a25 100644
--- a/runtime/quick_exception_handler.h
+++ b/runtime/quick_exception_handler.h
@@ -49,6 +49,9 @@
   // Deoptimize the stack to the upcall. For every compiled frame, we create a "copy"
   // shadow frame that will be executed with the interpreter.
   void DeoptimizeStack() SHARED_REQUIRES(Locks::mutator_lock_);
+  void DeoptimizeSingleFrame() SHARED_REQUIRES(Locks::mutator_lock_);
+  void DeoptimizeSingleFrameArchDependentFixup() SHARED_REQUIRES(Locks::mutator_lock_);
+
   // Update the instrumentation stack by removing all methods that will be unwound
   // by the exception being thrown.
   void UpdateInstrumentationStack() SHARED_REQUIRES(Locks::mutator_lock_);
@@ -58,7 +61,7 @@
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Long jump either to a catch handler or to the upcall.
-  NO_RETURN void DoLongJump() SHARED_REQUIRES(Locks::mutator_lock_);
+  NO_RETURN void DoLongJump(bool smash_caller_saves = true) SHARED_REQUIRES(Locks::mutator_lock_);
 
   void SetHandlerQuickFrame(ArtMethod** handler_quick_frame) {
     handler_quick_frame_ = handler_quick_frame;
@@ -68,6 +71,10 @@
     handler_quick_frame_pc_ = handler_quick_frame_pc;
   }
 
+  void SetHandlerQuickArg0(uintptr_t handler_quick_arg0) {
+    handler_quick_arg0_ = handler_quick_arg0;
+  }
+
   ArtMethod* GetHandlerMethod() const {
     return handler_method_;
   }
@@ -92,6 +99,11 @@
     handler_frame_depth_ = frame_depth;
   }
 
+  // Walk the stack frames of the given thread, printing out non-runtime methods with their types
+  // of frames. Helps to verify that single-frame deopt really only deopted one frame.
+  static void DumpFramesWithType(Thread* self, bool details = false)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   Thread* const self_;
   Context* const context_;
@@ -103,6 +115,8 @@
   ArtMethod** handler_quick_frame_;
   // PC to branch to for the handler.
   uintptr_t handler_quick_frame_pc_;
+  // The value for argument 0.
+  uintptr_t handler_quick_arg0_;
   // The handler method to report to the debugger.
   ArtMethod* handler_method_;
   // The handler's dex PC, zero implies an uncaught exception.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 5bf895e..82e6fb0 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -250,10 +250,16 @@
   tlsPtr_.stacked_shadow_frame_record = record;
 }
 
-ShadowFrame* Thread::PopStackedShadowFrame(StackedShadowFrameType type) {
+ShadowFrame* Thread::PopStackedShadowFrame(StackedShadowFrameType type, bool must_be_present) {
   StackedShadowFrameRecord* record = tlsPtr_.stacked_shadow_frame_record;
-  DCHECK(record != nullptr);
-  DCHECK_EQ(record->GetType(), type);
+  if (must_be_present) {
+    DCHECK(record != nullptr);
+    DCHECK_EQ(record->GetType(), type);
+  } else {
+    if (record == nullptr || record->GetType() != type) {
+      return nullptr;
+    }
+  }
   tlsPtr_.stacked_shadow_frame_record = record->GetLink();
   ShadowFrame* shadow_frame = record->GetShadowFrame();
   delete record;
diff --git a/runtime/thread.h b/runtime/thread.h
index 11f2e28..d21644d 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -108,7 +108,8 @@
 
 enum class StackedShadowFrameType {
   kShadowFrameUnderConstruction,
-  kDeoptimizationShadowFrame
+  kDeoptimizationShadowFrame,
+  kSingleFrameDeoptimizationShadowFrame
 };
 
 static constexpr size_t kNumRosAllocThreadLocalSizeBrackets = 34;
@@ -843,7 +844,7 @@
   void AssertHasDeoptimizationContext()
       SHARED_REQUIRES(Locks::mutator_lock_);
   void PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type);
-  ShadowFrame* PopStackedShadowFrame(StackedShadowFrameType type);
+  ShadowFrame* PopStackedShadowFrame(StackedShadowFrameType type, bool must_be_present = true);
 
   // For debugger, find the shadow frame that corresponds to a frame id.
   // Or return null if there is none.
diff --git a/test/449-checker-bce/src/Main.java b/test/449-checker-bce/src/Main.java
index a746664..f06c250 100644
--- a/test/449-checker-bce/src/Main.java
+++ b/test/449-checker-bce/src/Main.java
@@ -249,6 +249,25 @@
     array[Integer.MAX_VALUE - 998] = 1;
   }
 
+  /// CHECK-START: void Main.constantIndexing6(int[]) BCE (before)
+  /// CHECK: BoundsCheck
+  /// CHECK: ArraySet
+  /// CHECK: BoundsCheck
+  /// CHECK: ArraySet
+
+  /// CHECK-START: void Main.constantIndexing6(int[]) BCE (after)
+  /// CHECK: Deoptimize
+
+  static void constantIndexing6(int[] array) {
+    array[3] = 1;
+    array[4] = 1;
+  }
+
+  // A helper into which the actual throwing function should be inlined.
+  static void constantIndexingForward6(int[] array) {
+    constantIndexing6(array);
+  }
+
   /// CHECK-START: void Main.loopPattern1(int[]) BCE (before)
   /// CHECK: BoundsCheck
   /// CHECK: ArraySet
@@ -602,7 +621,12 @@
       // This will cause AIOOBE.
       constantIndexing2(new int[3]);
     } catch (ArrayIndexOutOfBoundsException e) {
-      return 99;
+      try {
+        // This will cause AIOOBE.
+        constantIndexingForward6(new int[3]);
+      } catch (ArrayIndexOutOfBoundsException e2) {
+        return 99;
+      }
     }
     return 0;
   }