ART: Fix Quick-style LR vs PC core spill mask bug

It's always been a bug that Quick marked PC as spilled instead of
LR. The root cause was a mutation of the spill mask at frame exit,
when LR is being restored into PC to return. A local should have
been used to keep the actual spill mask safe and sound.

This has only worked because nobody ever uses LR, even after long
jumps for exception dispatch. However, single-frame deoptimization
needs this to work, and I'd rather fix this than be forced to
have machine-specific fixups.

Also fix this in the optimizing compiler, and bump the oat version.

Change-Id: Ib032a533408bf464097fc96dcbfc5b6a68bf59a1
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index eb8730c..868d9a4 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -547,27 +547,28 @@
     cfi_.RestoreMany(DwarfFpReg(0), fp_spill_mask_);
   }
   bool unspill_LR_to_PC = (core_spill_mask_ & (1 << rs_rARM_LR.GetRegNum())) != 0;
+  uint32_t core_unspill_mask = core_spill_mask_;
   if (unspill_LR_to_PC) {
-    core_spill_mask_ &= ~(1 << rs_rARM_LR.GetRegNum());
-    core_spill_mask_ |= (1 << rs_rARM_PC.GetRegNum());
+    core_unspill_mask &= ~(1 << rs_rARM_LR.GetRegNum());
+    core_unspill_mask |= (1 << rs_rARM_PC.GetRegNum());
   }
-  if (core_spill_mask_ != 0u) {
-    if ((core_spill_mask_ & ~(0xffu | (1u << rs_rARM_PC.GetRegNum()))) == 0u) {
+  if (core_unspill_mask != 0u) {
+    if ((core_unspill_mask & ~(0xffu | (1u << rs_rARM_PC.GetRegNum()))) == 0u) {
       // Unspilling only low regs and/or PC, use 16-bit POP.
       constexpr int pc_bit_shift = rs_rARM_PC.GetRegNum() - 8;
       NewLIR1(kThumbPop,
-              (core_spill_mask_ & ~(1u << rs_rARM_PC.GetRegNum())) |
-              ((core_spill_mask_ & (1u << rs_rARM_PC.GetRegNum())) >> pc_bit_shift));
-    } else if (IsPowerOfTwo(core_spill_mask_)) {
+              (core_unspill_mask & ~(1u << rs_rARM_PC.GetRegNum())) |
+              ((core_unspill_mask & (1u << rs_rARM_PC.GetRegNum())) >> pc_bit_shift));
+    } else if (IsPowerOfTwo(core_unspill_mask)) {
       // kThumb2Pop cannot be used to unspill a single register.
-      NewLIR1(kThumb2Pop1, CTZ(core_spill_mask_));
+      NewLIR1(kThumb2Pop1, CTZ(core_unspill_mask));
     } else {
-      NewLIR1(kThumb2Pop, core_spill_mask_);
+      NewLIR1(kThumb2Pop, core_unspill_mask);
     }
     // If we pop to PC, there is no further epilogue code.
     if (!unspill_LR_to_PC) {
       cfi_.AdjustCFAOffset(-num_core_spills_ * kArmPointerSize);
-      cfi_.RestoreMany(DwarfCoreReg(0), core_spill_mask_);
+      cfi_.RestoreMany(DwarfCoreReg(0), core_unspill_mask);
       DCHECK_EQ(cfi_.GetCurrentCFAOffset(), 0);  // empty stack.
     }
   }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 438ef69..a4c58b0 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -48,7 +48,7 @@
 // with baseline.
 static constexpr Register kCoreSavedRegisterForBaseline = R5;
 static constexpr Register kCoreCalleeSaves[] =
-    { R5, R6, R7, R8, R10, R11, PC };
+    { R5, R6, R7, R8, R10, R11, LR };
 static constexpr SRegister kFpuCalleeSaves[] =
     { S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31 };
 
@@ -409,8 +409,8 @@
       method_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
       call_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
       relative_call_patches_(graph->GetArena()->Adapter()) {
-  // Save the PC register to mimic Quick.
-  AddAllocatedRegister(Location::RegisterLocation(PC));
+  // Always save the LR register to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(LR));
 }
 
 void CodeGeneratorARM::Finalize(CodeAllocator* allocator) {
@@ -599,12 +599,9 @@
     RecordPcInfo(nullptr, 0);
   }
 
-  // PC is in the list of callee-save to mimic Quick, but we need to push
-  // LR at entry instead.
-  uint32_t push_mask = (core_spill_mask_ & (~(1 << PC))) | 1 << LR;
-  __ PushList(push_mask);
-  __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(push_mask));
-  __ cfi().RelOffsetForMany(DWARFReg(kMethodRegisterArgument), 0, push_mask, kArmWordSize);
+  __ PushList(core_spill_mask_);
+  __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(core_spill_mask_));
+  __ cfi().RelOffsetForMany(DWARFReg(kMethodRegisterArgument), 0, core_spill_mask_, kArmWordSize);
   if (fpu_spill_mask_ != 0) {
     SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
     __ vpushs(start_register, POPCOUNT(fpu_spill_mask_));
@@ -632,7 +629,10 @@
     __ cfi().AdjustCFAOffset(-kArmPointerSize * POPCOUNT(fpu_spill_mask_));
     __ cfi().RestoreMany(DWARFReg(SRegister(0)), fpu_spill_mask_);
   }
-  __ PopList(core_spill_mask_);
+  // Pop LR into PC to return.
+  DCHECK_NE(core_spill_mask_ & (1 << LR), 0U);
+  uint32_t pop_mask = (core_spill_mask_ & (~(1 << LR))) | 1 << PC;
+  __ PopList(pop_mask);
   __ cfi().RestoreState();
   __ cfi().DefCFAOffset(GetFrameSize());
 }
diff --git a/runtime/oat.h b/runtime/oat.h
index 1520a9b..b8b8d30 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '9', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '7', '0', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";