47 files changed, 814 insertions, 301 deletions
diff --git a/compiler/dex/bb_optimizations.h b/compiler/dex/bb_optimizations.h index eb897f00e3..d1d5ad9715 100644 --- a/compiler/dex/bb_optimizations.h +++ b/compiler/dex/bb_optimizations.h @@ -71,26 +71,28 @@ class CacheMethodLoweringInfo : public PassME { }; /** - * @class CallInlining - * @brief Perform method inlining pass. + * @class SpecialMethodInliner + * @brief Performs method inlining pass on special kinds of methods. + * @details Special methods are methods that fall in one of the following categories: + * empty, instance getter, instance setter, argument return, and constant return. */ -class CallInlining : public PassME { +class SpecialMethodInliner : public PassME { public: - CallInlining() : PassME("CallInlining") { + SpecialMethodInliner() : PassME("SpecialMethodInliner") { } bool Gate(const PassDataHolder* data) const { DCHECK(data != nullptr); CompilationUnit* cUnit = down_cast<const PassMEDataHolder*>(data)->c_unit; DCHECK(cUnit != nullptr); - return cUnit->mir_graph->InlineCallsGate(); + return cUnit->mir_graph->InlineSpecialMethodsGate(); } void Start(PassDataHolder* data) const { DCHECK(data != nullptr); CompilationUnit* cUnit = down_cast<PassMEDataHolder*>(data)->c_unit; DCHECK(cUnit != nullptr); - cUnit->mir_graph->InlineCallsStart(); + cUnit->mir_graph->InlineSpecialMethodsStart(); } bool Worker(const PassDataHolder* data) const { @@ -100,7 +102,7 @@ class CallInlining : public PassME { DCHECK(cUnit != nullptr); BasicBlock* bb = pass_me_data_holder->bb; DCHECK(bb != nullptr); - cUnit->mir_graph->InlineCalls(bb); + cUnit->mir_graph->InlineSpecialMethods(bb); // No need of repeating, so just return false. return false; } @@ -109,7 +111,7 @@ class CallInlining : public PassME { DCHECK(data != nullptr); CompilationUnit* cUnit = down_cast<PassMEDataHolder*>(data)->c_unit; DCHECK(cUnit != nullptr); - cUnit->mir_graph->InlineCallsEnd(); + cUnit->mir_graph->InlineSpecialMethodsEnd(); } }; diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc index b16cf14b02..711743d69b 100644 --- a/compiler/dex/frontend.cc +++ b/compiler/dex/frontend.cc @@ -97,14 +97,6 @@ static constexpr uint32_t kDisabledOptimizationsPerISA[] = { // 2 = kArm64. TODO(Arm64): enable optimizations once backend is mature enough. (1 << kLoadStoreElimination) | (1 << kLoadHoisting) | - (1 << kSuppressLoads) | - (1 << kClassInitCheckElimination) | - (1 << kTrackLiveTemps) | - (1 << kSafeOptimizations) | - (1 << kBBOpt) | - (1 << kMatch) | - (1 << kPromoteCompilerTemps) | - (1 << kSuppressExceptionEdges) | 0, // 3 = kThumb2. 0, @@ -582,7 +574,7 @@ static bool CanCompileMethod(uint32_t method_idx, const DexFile& dex_file, // Check if we support the byte code. if (std::find(unsupport_list, unsupport_list + unsupport_list_size, opcode) != unsupport_list + unsupport_list_size) { - if (!cu.mir_graph->IsPseudoMirOp(opcode)) { + if (!MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { VLOG(compiler) << "Unsupported dalvik byte code : " << mir->dalvikInsn.opcode; } else { diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc index e372206228..3de448344a 100644 --- a/compiler/dex/mir_analysis.cc +++ b/compiler/dex/mir_analysis.cc @@ -902,7 +902,7 @@ void MIRGraph::AnalyzeBlock(BasicBlock* bb, MethodStats* stats) { while (!done) { tbb->visited = true; for (MIR* mir = tbb->first_mir_insn; mir != NULL; mir = mir->next) { - if (IsPseudoMirOp(mir->dalvikInsn.opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode)) { // Skip any MIR pseudo-op. 
continue; } diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc index bc99a272a6..b82c5c7f00 100644 --- a/compiler/dex/mir_dataflow.cc +++ b/compiler/dex/mir_dataflow.cc @@ -909,6 +909,16 @@ void MIRGraph::HandleDef(ArenaBitVector* def_v, int dalvik_reg_id) { def_v->SetBit(dalvik_reg_id); } +void MIRGraph::HandleExtended(ArenaBitVector* use_v, ArenaBitVector* def_v, + ArenaBitVector* live_in_v, + const MIR::DecodedInstruction& d_insn) { + switch (static_cast<int>(d_insn.opcode)) { + default: + LOG(ERROR) << "Unexpected Extended Opcode " << d_insn.opcode; + break; + } +} + /* * Find out live-in variables for natural loops. Variables that are live-in in * the main loop body are considered to be defined in the entry block. @@ -966,6 +976,9 @@ bool MIRGraph::FindLocalLiveIn(BasicBlock* bb) { HandleDef(def_v, d_insn->vA+1); } } + if (df_attributes & DF_FORMAT_EXTENDED) { + HandleExtended(use_v, def_v, live_in_v, mir->dalvikInsn); + } } return true; } @@ -1048,6 +1061,14 @@ void MIRGraph::DataFlowSSAFormat3RC(MIR* mir) { } } +void MIRGraph::DataFlowSSAFormatExtended(MIR* mir) { + switch (static_cast<int>(mir->dalvikInsn.opcode)) { + default: + LOG(ERROR) << "Missing case for extended MIR: " << mir->dalvikInsn.opcode; + break; + } +} + /* Entry function to convert a block into SSA representation */ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { MIR* mir; @@ -1063,7 +1084,7 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { uint64_t df_attributes = GetDataFlowAttributes(mir); // If not a pseudo-op, note non-leaf or can throw - if (!IsPseudoMirOp(mir->dalvikInsn.opcode)) { + if (!MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode)) { int flags = Instruction::FlagsOf(mir->dalvikInsn.opcode); if ((flags & Instruction::kInvoke) != 0 && (mir->optimization_flags & MIR_INLINED) == 0) { @@ -1083,6 +1104,11 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { continue; } + if (df_attributes & DF_FORMAT_EXTENDED) { + DataFlowSSAFormatExtended(mir); + continue; + } + if (df_attributes & DF_HAS_USES) { if (df_attributes & DF_UA) { num_uses++; diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index 4fbace26e7..1c8a9b5079 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -193,14 +193,16 @@ BasicBlock* MIRGraph::SplitBlock(DexOffset code_offset, bottom_block->successor_block_list_type = orig_block->successor_block_list_type; bottom_block->successor_blocks = orig_block->successor_blocks; orig_block->successor_block_list_type = kNotUsed; - orig_block->successor_blocks = NULL; + orig_block->successor_blocks = nullptr; GrowableArray<SuccessorBlockInfo*>::Iterator iterator(bottom_block->successor_blocks); while (true) { SuccessorBlockInfo* successor_block_info = iterator.Next(); - if (successor_block_info == NULL) break; + if (successor_block_info == nullptr) break; BasicBlock* bb = GetBasicBlock(successor_block_info->block); - bb->predecessors->Delete(orig_block->id); - bb->predecessors->Insert(bottom_block->id); + if (bb != nullptr) { + bb->predecessors->Delete(orig_block->id); + bb->predecessors->Insert(bottom_block->id); + } } } @@ -222,7 +224,7 @@ BasicBlock* MIRGraph::SplitBlock(DexOffset code_offset, DCHECK(insn == bottom_block->first_mir_insn); DCHECK_EQ(insn->offset, bottom_block->start_offset); DCHECK(static_cast<int>(insn->dalvikInsn.opcode) == kMirOpCheck || - !IsPseudoMirOp(insn->dalvikInsn.opcode)); + !MIR::DecodedInstruction::IsPseudoMirOp(insn->dalvikInsn.opcode)); DCHECK_EQ(dex_pc_to_block_map_.Get(insn->offset), 
orig_block->id); MIR* p = insn; dex_pc_to_block_map_.Put(p->offset, bottom_block->id); @@ -237,7 +239,7 @@ BasicBlock* MIRGraph::SplitBlock(DexOffset code_offset, * CHECK and work portions. Since the 2nd half of a split operation is always * the first in a BasicBlock, we can't hit it here. */ - if ((opcode == kMirOpCheck) || !IsPseudoMirOp(opcode)) { + if ((opcode == kMirOpCheck) || !MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { DCHECK_EQ(dex_pc_to_block_map_.Get(p->offset), orig_block->id); dex_pc_to_block_map_.Put(p->offset, bottom_block->id); } @@ -861,11 +863,17 @@ uint64_t MIRGraph::GetDataFlowAttributes(MIR* mir) { /* Dump the CFG into a DOT graph */ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suffix) { FILE* file; + static AtomicInteger cnt(0); + + // Increment counter to get a unique file number. + cnt++; + std::string fname(PrettyMethod(cu_->method_idx, *cu_->dex_file)); ReplaceSpecialChars(fname); - fname = StringPrintf("%s%s%x%s.dot", dir_prefix, fname.c_str(), + fname = StringPrintf("%s%s%x%s_%d.dot", dir_prefix, fname.c_str(), GetBasicBlock(GetEntryBlock()->fall_through)->start_offset, - suffix == nullptr ? "" : suffix); + suffix == nullptr ? "" : suffix, + cnt.LoadRelaxed()); file = fopen(fname.c_str(), "w"); if (file == NULL) { return; @@ -882,6 +890,7 @@ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suff BasicBlock* bb = GetBasicBlock(block_idx); if (bb == NULL) continue; if (bb->block_type == kDead) continue; + if (bb->hidden) continue; if (bb->block_type == kEntryBlock) { fprintf(file, " entry_%d [shape=Mdiamond];\n", bb->id); } else if (bb->block_type == kExitBlock) { @@ -916,7 +925,8 @@ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suff } else { fprintf(file, " {%04x %s %s %s %s\\l}%s\\\n", mir->offset, mir->ssa_rep ? GetDalvikDisassembly(mir) : - !IsPseudoMirOp(opcode) ? Instruction::Name(mir->dalvikInsn.opcode) : + !MIR::DecodedInstruction::IsPseudoMirOp(opcode) ? + Instruction::Name(mir->dalvikInsn.opcode) : extended_mir_op_names_[opcode - kMirOpFirst], (mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ", (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ", @@ -1222,7 +1232,7 @@ char* MIRGraph::GetDalvikDisassembly(const MIR* mir) { nop = true; } - if (IsPseudoMirOp(opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { str.append(extended_mir_op_names_[opcode - kMirOpFirst]); } else { dalvik_format = Instruction::FormatOf(insn.opcode); @@ -1693,11 +1703,13 @@ BasicBlock* ChildBlockIterator::Next() { // We visited both taken and fallthrough. Now check if we have successors we need to visit. if (have_successors_ == true) { // Get information about next successor block. - SuccessorBlockInfo* successor_block_info = successor_iter_.Next(); - - // If we don't have anymore successors, return nullptr. - if (successor_block_info != nullptr) { - return mir_graph_->GetBasicBlock(successor_block_info->block); + for (SuccessorBlockInfo* successor_block_info = successor_iter_.Next(); + successor_block_info != nullptr; + successor_block_info = successor_iter_.Next()) { + // If block was replaced by zero block, take next one. 
+ if (successor_block_info->block != NullBasicBlockId) { + return mir_graph_->GetBasicBlock(successor_block_info->block); + } } } diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h index d09732891c..1556a19da7 100644 --- a/compiler/dex/mir_graph.h +++ b/compiler/dex/mir_graph.h @@ -80,6 +80,7 @@ enum DataFlowAttributePos { kSetsConst, kFormat35c, kFormat3rc, + kFormatExtended, // Extended format for extended MIRs. kNullCheckSrc0, // Null check of uses[0]. kNullCheckSrc1, // Null check of uses[1]. kNullCheckSrc2, // Null check of uses[2]. @@ -118,6 +119,7 @@ enum DataFlowAttributePos { #define DF_SETS_CONST (UINT64_C(1) << kSetsConst) #define DF_FORMAT_35C (UINT64_C(1) << kFormat35c) #define DF_FORMAT_3RC (UINT64_C(1) << kFormat3rc) +#define DF_FORMAT_EXTENDED (UINT64_C(1) << kFormatExtended) #define DF_NULL_CHK_0 (UINT64_C(1) << kNullCheckSrc0) #define DF_NULL_CHK_1 (UINT64_C(1) << kNullCheckSrc1) #define DF_NULL_CHK_2 (UINT64_C(1) << kNullCheckSrc2) @@ -284,34 +286,46 @@ struct MIR { */ bool GetConstant(int64_t* ptr_value, bool* wide) const; + static bool IsPseudoMirOp(Instruction::Code opcode) { + return static_cast<int>(opcode) >= static_cast<int>(kMirOpFirst); + } + + static bool IsPseudoMirOp(int opcode) { + return opcode >= static_cast<int>(kMirOpFirst); + } + + bool IsInvoke() const { + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kInvoke) == Instruction::kInvoke); + } + bool IsStore() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kStore) == Instruction::kStore); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kStore) == Instruction::kStore); } bool IsLoad() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kLoad) == Instruction::kLoad); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kLoad) == Instruction::kLoad); } bool IsConditionalBranch() const { - return (Instruction::FlagsOf(opcode) == (Instruction::kContinue | Instruction::kBranch)); + return !IsPseudoMirOp(opcode) && (Instruction::FlagsOf(opcode) == (Instruction::kContinue | Instruction::kBranch)); } /** * @brief Is the register C component of the decoded instruction a constant? */ bool IsCFieldOrConstant() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kRegCFieldOrConstant) == Instruction::kRegCFieldOrConstant); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kRegCFieldOrConstant) == Instruction::kRegCFieldOrConstant); } /** * @brief Is the register C component of the decoded instruction a constant? */ bool IsBFieldOrConstant() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kRegBFieldOrConstant) == Instruction::kRegBFieldOrConstant); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kRegBFieldOrConstant) == Instruction::kRegBFieldOrConstant); } bool IsCast() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kCast) == Instruction::kCast); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kCast) == Instruction::kCast); } /** @@ -321,11 +335,11 @@ struct MIR { * when crossing such an instruction. 
*/ bool Clobbers() const { - return ((Instruction::FlagsOf(opcode) & Instruction::kClobber) == Instruction::kClobber); + return !IsPseudoMirOp(opcode) && ((Instruction::FlagsOf(opcode) & Instruction::kClobber) == Instruction::kClobber); } bool IsLinear() const { - return (Instruction::FlagsOf(opcode) & (Instruction::kAdd | Instruction::kSubtract)) != 0; + return !IsPseudoMirOp(opcode) && (Instruction::FlagsOf(opcode) & (Instruction::kAdd | Instruction::kSubtract)) != 0; } } dalvikInsn; @@ -877,14 +891,6 @@ class MIRGraph { return backward_branches_ + forward_branches_; } - static bool IsPseudoMirOp(Instruction::Code opcode) { - return static_cast<int>(opcode) >= static_cast<int>(kMirOpFirst); - } - - static bool IsPseudoMirOp(int opcode) { - return opcode >= static_cast<int>(kMirOpFirst); - } - // Is this vreg in the in set? bool IsInVReg(int vreg) { return (vreg >= cu_->num_regs); @@ -956,10 +962,10 @@ class MIRGraph { void ComputeTopologicalSortOrder(); BasicBlock* CreateNewBB(BBType block_type); - bool InlineCallsGate(); - void InlineCallsStart(); - void InlineCalls(BasicBlock* bb); - void InlineCallsEnd(); + bool InlineSpecialMethodsGate(); + void InlineSpecialMethodsStart(); + void InlineSpecialMethods(BasicBlock* bb); + void InlineSpecialMethodsEnd(); /** * @brief Perform the initial preparation for the Method Uses. @@ -1059,6 +1065,9 @@ class MIRGraph { void HandleLiveInUse(ArenaBitVector* use_v, ArenaBitVector* def_v, ArenaBitVector* live_in_v, int dalvik_reg_id); void HandleDef(ArenaBitVector* def_v, int dalvik_reg_id); + void HandleExtended(ArenaBitVector* use_v, ArenaBitVector* def_v, + ArenaBitVector* live_in_v, + const MIR::DecodedInstruction& d_insn); bool DoSSAConversion(BasicBlock* bb); bool InvokeUsesMethodStar(MIR* mir); int ParseInsn(const uint16_t* code_ptr, MIR::DecodedInstruction* decoded_instruction); @@ -1080,6 +1089,7 @@ class MIRGraph { void HandleSSAUse(int* uses, int dalvik_reg, int reg_index); void DataFlowSSAFormat35C(MIR* mir); void DataFlowSSAFormat3RC(MIR* mir); + void DataFlowSSAFormatExtended(MIR* mir); bool FindLocalLiveIn(BasicBlock* bb); bool VerifyPredInfo(BasicBlock* bb); BasicBlock* NeedsVisit(BasicBlock* bb); diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc index dc1057f277..869c48f66c 100644 --- a/compiler/dex/mir_optimization.cc +++ b/compiler/dex/mir_optimization.cc @@ -137,7 +137,7 @@ MIR* MIRGraph::FindMoveResult(BasicBlock* bb, MIR* mir) { break; } // Keep going if pseudo op, otherwise terminate - if (IsPseudoMirOp(mir->dalvikInsn.opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode)) { mir = AdvanceMIR(&tbb, mir); } else { mir = NULL; @@ -877,7 +877,7 @@ bool MIRGraph::EliminateNullChecksAndInferTypes(BasicBlock* bb) { struct BasicBlock* next_bb = GetBasicBlock(bb->fall_through); for (MIR* tmir = next_bb->first_mir_insn; tmir != NULL; tmir =tmir->next) { - if (IsPseudoMirOp(tmir->dalvikInsn.opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(tmir->dalvikInsn.opcode)) { continue; } // First non-pseudo should be MOVE_RESULT_OBJECT @@ -1220,7 +1220,7 @@ void MIRGraph::ComputeInlineIFieldLoweringInfo(uint16_t field_idx, MIR* invoke, iget_or_iput->meta.ifield_lowering_info = field_info_index; } -bool MIRGraph::InlineCallsGate() { +bool MIRGraph::InlineSpecialMethodsGate() { if ((cu_->disable_opt & (1 << kSuppressMethodInlining)) != 0 || method_lowering_infos_.Size() == 0u) { return false; @@ -1232,7 +1232,7 @@ bool MIRGraph::InlineCallsGate() { return true; } -void 
MIRGraph::InlineCallsStart() { +void MIRGraph::InlineSpecialMethodsStart() { // Prepare for inlining getters/setters. Since we're inlining at most 1 IGET/IPUT from // each INVOKE, we can index the data by the MIR::meta::method_lowering_info index. @@ -1246,12 +1246,12 @@ void MIRGraph::InlineCallsStart() { temp_bit_vector_size_ * sizeof(*temp_insn_data_), kArenaAllocGrowableArray)); } -void MIRGraph::InlineCalls(BasicBlock* bb) { +void MIRGraph::InlineSpecialMethods(BasicBlock* bb) { if (bb->block_type != kDalvikByteCode) { return; } for (MIR* mir = bb->first_mir_insn; mir != NULL; mir = mir->next) { - if (IsPseudoMirOp(mir->dalvikInsn.opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode)) { continue; } if (!(Instruction::FlagsOf(mir->dalvikInsn.opcode) & Instruction::kInvoke)) { @@ -1270,17 +1270,17 @@ void MIRGraph::InlineCalls(BasicBlock* bb) { MethodReference target = method_info.GetTargetMethod(); if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(target.dex_file) ->GenInline(this, bb, mir, target.dex_method_index)) { - if (cu_->verbose) { - LOG(INFO) << "In \"" << PrettyMethod(cu_->method_idx, *cu_->dex_file) - << "\" @0x" << std::hex << mir->offset - << " inlined " << method_info.GetInvokeType() << " (" << sharp_type << ") call to \"" - << PrettyMethod(target.dex_method_index, *target.dex_file) << "\""; + if (cu_->verbose || cu_->print_pass) { + LOG(INFO) << "SpecialMethodInliner: Inlined " << method_info.GetInvokeType() << " (" + << sharp_type << ") call to \"" << PrettyMethod(target.dex_method_index, *target.dex_file) + << "\" from \"" << PrettyMethod(cu_->method_idx, *cu_->dex_file) + << "\" @0x" << std::hex << mir->offset; } } } } -void MIRGraph::InlineCallsEnd() { +void MIRGraph::InlineSpecialMethodsEnd() { DCHECK(temp_insn_data_ != nullptr); temp_insn_data_ = nullptr; DCHECK(temp_bit_vector_ != nullptr); diff --git a/compiler/dex/pass_driver_me_opts.cc b/compiler/dex/pass_driver_me_opts.cc index 4c9bed65dc..c72a4a667e 100644 --- a/compiler/dex/pass_driver_me_opts.cc +++ b/compiler/dex/pass_driver_me_opts.cc @@ -35,7 +35,7 @@ template<> const Pass* const PassDriver<PassDriverMEOpts>::g_passes[] = { GetPassInstance<CacheFieldLoweringInfo>(), GetPassInstance<CacheMethodLoweringInfo>(), - GetPassInstance<CallInlining>(), + GetPassInstance<SpecialMethodInliner>(), GetPassInstance<CodeLayout>(), GetPassInstance<NullCheckEliminationAndTypeInference>(), GetPassInstance<ClassInitCheckElimination>(), diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc index e8f5cb9f09..3ee3e2e61d 100644 --- a/compiler/dex/quick/arm64/int_arm64.cc +++ b/compiler/dex/quick/arm64/int_arm64.cc @@ -91,17 +91,121 @@ void Arm64Mir2Lir::GenSelect(BasicBlock* bb, MIR* mir) { RegLocation rl_dest = mir_graph_->GetDest(mir); RegisterClass src_reg_class = rl_src.ref ? kRefReg : kCoreReg; RegisterClass result_reg_class = rl_dest.ref ? kRefReg : kCoreReg; + rl_src = LoadValue(rl_src, src_reg_class); + // rl_src may be aliased with rl_result/rl_dest, so do compare early. 
+ OpRegImm(kOpCmp, rl_src.reg, 0); + ArmConditionCode code = ArmConditionEncoding(mir->meta.ccode); - RegLocation rl_true = mir_graph_->reg_location_[mir->ssa_rep->uses[1]]; - RegLocation rl_false = mir_graph_->reg_location_[mir->ssa_rep->uses[2]]; - rl_true = LoadValue(rl_true, result_reg_class); - rl_false = LoadValue(rl_false, result_reg_class); - rl_result = EvalLoc(rl_dest, result_reg_class, true); - OpRegImm(kOpCmp, rl_src.reg, 0); - NewLIR4(kA64Csel4rrrc, rl_result.reg.GetReg(), rl_true.reg.GetReg(), - rl_false.reg.GetReg(), code); + // The kMirOpSelect has two variants, one for constants and one for moves. + bool is_wide = rl_dest.ref || rl_dest.wide; + + if (mir->ssa_rep->num_uses == 1) { + uint32_t true_val = mir->dalvikInsn.vB; + uint32_t false_val = mir->dalvikInsn.vC; + + int opcode; // The opcode. + int left_op, right_op; // The operands. + bool rl_result_evaled = false; + + // Check some simple cases. + // TODO: Improve this. + int zero_reg = (is_wide ? rs_xzr : rs_wzr).GetReg(); + + if ((true_val == 0 && false_val == 1) || (true_val == 1 && false_val == 0)) { + // CSInc cheap based on wzr. + if (true_val == 1) { + // Negate. + code = ArmConditionEncoding(NegateComparison(mir->meta.ccode)); + } + + left_op = right_op = zero_reg; + opcode = is_wide ? WIDE(kA64Csinc4rrrc) : kA64Csinc4rrrc; + } else if ((true_val == 0 && false_val == 0xFFFFFFFF) || + (true_val == 0xFFFFFFFF && false_val == 0)) { + // CSneg cheap based on wzr. + if (true_val == 0xFFFFFFFF) { + // Negate. + code = ArmConditionEncoding(NegateComparison(mir->meta.ccode)); + } + + left_op = right_op = zero_reg; + opcode = is_wide ? WIDE(kA64Csneg4rrrc) : kA64Csneg4rrrc; + } else if (true_val == 0 || false_val == 0) { + // Csel half cheap based on wzr. + rl_result = EvalLoc(rl_dest, result_reg_class, true); + rl_result_evaled = true; + if (false_val == 0) { + // Negate. + code = ArmConditionEncoding(NegateComparison(mir->meta.ccode)); + } + LoadConstantNoClobber(rl_result.reg, true_val == 0 ? false_val : true_val); + left_op = zero_reg; + right_op = rl_result.reg.GetReg(); + opcode = is_wide ? WIDE(kA64Csel4rrrc) : kA64Csel4rrrc; + } else if (true_val == 1 || false_val == 1) { + // CSInc half cheap based on wzr. + rl_result = EvalLoc(rl_dest, result_reg_class, true); + rl_result_evaled = true; + if (true_val == 1) { + // Negate. + code = ArmConditionEncoding(NegateComparison(mir->meta.ccode)); + } + LoadConstantNoClobber(rl_result.reg, true_val == 1 ? false_val : true_val); + left_op = rl_result.reg.GetReg(); + right_op = zero_reg; + opcode = is_wide ? WIDE(kA64Csinc4rrrc) : kA64Csinc4rrrc; + } else if (true_val == 0xFFFFFFFF || false_val == 0xFFFFFFFF) { + // CSneg half cheap based on wzr. + rl_result = EvalLoc(rl_dest, result_reg_class, true); + rl_result_evaled = true; + if (true_val == 0xFFFFFFFF) { + // Negate. + code = ArmConditionEncoding(NegateComparison(mir->meta.ccode)); + } + LoadConstantNoClobber(rl_result.reg, true_val == 0xFFFFFFFF ? false_val : true_val); + left_op = rl_result.reg.GetReg(); + right_op = zero_reg; + opcode = is_wide ? WIDE(kA64Csneg4rrrc) : kA64Csneg4rrrc; + } else { + // Csel. The rest. Use rl_result and a temp. + // TODO: To minimize the constants being loaded, check whether one can be inexpensively + // loaded as n - 1 or ~n. 
+ rl_result = EvalLoc(rl_dest, result_reg_class, true); + rl_result_evaled = true; + LoadConstantNoClobber(rl_result.reg, true_val); + RegStorage t_reg2 = AllocTypedTemp(false, result_reg_class); + if (rl_dest.wide) { + if (t_reg2.Is32Bit()) { + t_reg2 = As64BitReg(t_reg2); + } + } + LoadConstantNoClobber(t_reg2, false_val); + + // Use csel. + left_op = rl_result.reg.GetReg(); + right_op = t_reg2.GetReg(); + opcode = is_wide ? WIDE(kA64Csel4rrrc) : kA64Csel4rrrc; + } + + if (!rl_result_evaled) { + rl_result = EvalLoc(rl_dest, result_reg_class, true); + } + + NewLIR4(opcode, rl_result.reg.GetReg(), left_op, right_op, code); + } else { + RegLocation rl_true = mir_graph_->reg_location_[mir->ssa_rep->uses[1]]; + RegLocation rl_false = mir_graph_->reg_location_[mir->ssa_rep->uses[2]]; + + rl_true = LoadValue(rl_true, result_reg_class); + rl_false = LoadValue(rl_false, result_reg_class); + rl_result = EvalLoc(rl_dest, result_reg_class, true); + + int opcode = is_wide ? WIDE(kA64Csel4rrrc) : kA64Csel4rrrc; + NewLIR4(opcode, rl_result.reg.GetReg(), + rl_true.reg.GetReg(), rl_false.reg.GetReg(), code); + } StoreValue(rl_dest, rl_result); } @@ -110,7 +214,6 @@ void Arm64Mir2Lir::GenFusedLongCmpBranch(BasicBlock* bb, MIR* mir) { RegLocation rl_src2 = mir_graph_->GetSrcWide(mir, 2); LIR* taken = &block_label_list_[bb->taken]; LIR* not_taken = &block_label_list_[bb->fall_through]; - rl_src1 = LoadValueWide(rl_src1, kCoreReg); // Normalize such that if either operand is constant, src2 will be constant. ConditionCode ccode = mir->meta.ccode; if (rl_src1.is_const) { @@ -118,16 +221,22 @@ void Arm64Mir2Lir::GenFusedLongCmpBranch(BasicBlock* bb, MIR* mir) { ccode = FlipComparisonOrder(ccode); } + rl_src1 = LoadValueWide(rl_src1, kCoreReg); + if (rl_src2.is_const) { - rl_src2 = UpdateLocWide(rl_src2); + // TODO: Optimize for rl_src1.is_const? (Does happen in the boot image at the moment.) + int64_t val = mir_graph_->ConstantValueWide(rl_src2); // Special handling using cbz & cbnz. if (val == 0 && (ccode == kCondEq || ccode == kCondNe)) { OpCmpImmBranch(ccode, rl_src1.reg, 0, taken); OpCmpImmBranch(NegateComparison(ccode), rl_src1.reg, 0, not_taken); return; + } + // Only handle Imm if src2 is not already in a register. - } else if (rl_src2.location != kLocPhysReg) { + rl_src2 = UpdateLocWide(rl_src2); + if (rl_src2.location != kLocPhysReg) { OpRegImm64(kOpCmp, rl_src1.reg, val); OpCondBranch(ccode, taken); OpCondBranch(NegateComparison(ccode), not_taken); diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc index 5870d22208..048aca3735 100644 --- a/compiler/dex/quick/codegen_util.cc +++ b/compiler/dex/quick/codegen_util.cc @@ -1046,9 +1046,19 @@ CompiledMethod* Mir2Lir::GetCompiledMethod() { } // Push a marker to take place of lr. vmap_encoder.PushBackUnsigned(VmapTable::kAdjustedFpMarker); - // fp regs already sorted. - for (uint32_t i = 0; i < fp_vmap_table_.size(); i++) { - vmap_encoder.PushBackUnsigned(fp_vmap_table_[i] + VmapTable::kEntryAdjustment); + if (cu_->instruction_set == kThumb2) { + // fp regs already sorted. + for (uint32_t i = 0; i < fp_vmap_table_.size(); i++) { + vmap_encoder.PushBackUnsigned(fp_vmap_table_[i] + VmapTable::kEntryAdjustment); + } + } else { + // For other platforms regs may have been inserted out of order - sort first. + std::sort(fp_vmap_table_.begin(), fp_vmap_table_.end()); + for (size_t i = 0 ; i < fp_vmap_table_.size(); ++i) { + // Copy, stripping out the phys register sort key. 
+ vmap_encoder.PushBackUnsigned( + ~(-1 << VREG_NUM_WIDTH) & (fp_vmap_table_[i] + VmapTable::kEntryAdjustment)); + } } } else { DCHECK_EQ(POPCOUNT(core_spill_mask_), 0); diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc index 6191e4b0a1..45dd7f08a6 100644 --- a/compiler/dex/quick/dex_file_method_inliner.cc +++ b/compiler/dex/quick/dex_file_method_inliner.cc @@ -96,7 +96,7 @@ MIR* AllocReplacementMIR(MIRGraph* mir_graph, MIR* invoke, MIR* move_return) { uint32_t GetInvokeReg(MIR* invoke, uint32_t arg) { DCHECK_LT(arg, invoke->dalvikInsn.vA); - DCHECK(!MIRGraph::IsPseudoMirOp(invoke->dalvikInsn.opcode)); + DCHECK(!MIR::DecodedInstruction::IsPseudoMirOp(invoke->dalvikInsn.opcode)); if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) { return invoke->dalvikInsn.vC + arg; // Non-range invoke. } else { @@ -107,7 +107,7 @@ uint32_t GetInvokeReg(MIR* invoke, uint32_t arg) { bool WideArgIsInConsecutiveDalvikRegs(MIR* invoke, uint32_t arg) { DCHECK_LT(arg + 1, invoke->dalvikInsn.vA); - DCHECK(!MIRGraph::IsPseudoMirOp(invoke->dalvikInsn.opcode)); + DCHECK(!MIR::DecodedInstruction::IsPseudoMirOp(invoke->dalvikInsn.opcode)); return Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc || invoke->dalvikInsn.arg[arg + 1u] == invoke->dalvikInsn.arg[arg] + 1u; } diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc index caadc0ad89..07c615f342 100644 --- a/compiler/dex/quick/mir_to_lir.cc +++ b/compiler/dex/quick/mir_to_lir.cc @@ -1185,7 +1185,7 @@ bool Mir2Lir::MethodBlockCodeGen(BasicBlock* bb) { work_half->meta.throw_insn = mir; } - if (MIRGraph::IsPseudoMirOp(opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { HandleExtendedMethodMIR(bb, mir); continue; } diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index 48855012c3..87509b636c 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -531,7 +531,7 @@ class Mir2Lir : public Backend { LIRSlowPath(Mir2Lir* m2l, const DexOffset dexpc, LIR* fromfast, LIR* cont = nullptr) : m2l_(m2l), cu_(m2l->cu_), current_dex_pc_(dexpc), fromfast_(fromfast), cont_(cont) { - m2l->StartSlowPath(cont); + m2l->StartSlowPath(this); } virtual ~LIRSlowPath() {} virtual void Compile() = 0; @@ -705,17 +705,17 @@ class Mir2Lir : public Backend { int AssignLiteralOffset(CodeOffset offset); int AssignSwitchTablesOffset(CodeOffset offset); int AssignFillArrayDataOffset(CodeOffset offset); - LIR* InsertCaseLabel(DexOffset vaddr, int keyVal); + virtual LIR* InsertCaseLabel(DexOffset vaddr, int keyVal); void MarkPackedCaseLabels(Mir2Lir::SwitchTable* tab_rec); void MarkSparseCaseLabels(Mir2Lir::SwitchTable* tab_rec); - virtual void StartSlowPath(LIR *label) {} + virtual void StartSlowPath(LIRSlowPath* slowpath) {} virtual void BeginInvoke(CallInfo* info) {} virtual void EndInvoke(CallInfo* info) {} // Handle bookkeeping to convert a wide RegLocation to a narrow RegLocation. No code generated. 
- RegLocation NarrowRegLoc(RegLocation loc); + virtual RegLocation NarrowRegLoc(RegLocation loc); // Shared by all targets - implemented in local_optimizations.cc void ConvertMemOpIntoMove(LIR* orig_lir, RegStorage dest, RegStorage src); @@ -763,7 +763,7 @@ class Mir2Lir : public Backend { virtual bool IsTemp(RegStorage reg); bool IsPromoted(RegStorage reg); bool IsDirty(RegStorage reg); - void LockTemp(RegStorage reg); + virtual void LockTemp(RegStorage reg); void ResetDef(RegStorage reg); void NullifyRange(RegStorage reg, int s_reg); void MarkDef(RegLocation rl, LIR *start, LIR *finish); diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc index 9000514856..8e2a1e3532 100644 --- a/compiler/dex/quick/x86/call_x86.cc +++ b/compiler/dex/quick/x86/call_x86.cc @@ -234,8 +234,7 @@ void X86Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method) { NewLIR0(kPseudoMethodEntry); /* Spill core callee saves */ SpillCoreRegs(); - /* NOTE: promotion of FP regs currently unsupported, thus no FP spill */ - DCHECK_EQ(num_fp_spills_, 0); + SpillFPRegs(); if (!skip_overflow_check) { class StackOverflowSlowPath : public LIRSlowPath { public: @@ -309,6 +308,7 @@ void X86Mir2Lir::GenExitSequence() { NewLIR0(kPseudoMethodExit); UnSpillCoreRegs(); + UnSpillFPRegs(); /* Remove frame except for return address */ stack_increment_ = OpRegImm(kOpAdd, rs_rX86_SP, frame_size_ - GetInstructionSetPointerSize(cu_->instruction_set)); NewLIR0(kX86Ret); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index ff7b30eeec..b0c54e86e9 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -319,6 +319,8 @@ class X86Mir2Lir : public Mir2Lir { void OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<8> thread_offset); void SpillCoreRegs(); void UnSpillCoreRegs(); + void UnSpillFPRegs(); + void SpillFPRegs(); static const X86EncodingMap EncodingMap[kX86Last]; bool InexpensiveConstantInt(int32_t value); bool InexpensiveConstantFloat(int32_t value); diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index e81f505f2f..1ebbbbd5ee 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -52,6 +52,13 @@ static constexpr RegStorage dp_regs_arr_64[] = { rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7, rs_dr8, rs_dr9, rs_dr10, rs_dr11, rs_dr12, rs_dr13, rs_dr14, rs_dr15 }; +static constexpr RegStorage xp_regs_arr_32[] = { + rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7, +}; +static constexpr RegStorage xp_regs_arr_64[] = { + rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7, + rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15 +}; static constexpr RegStorage reserved_regs_arr_32[] = {rs_rX86_SP_32}; static constexpr RegStorage reserved_regs_arr_64[] = {rs_rX86_SP_32}; static constexpr RegStorage reserved_regs_arr_64q[] = {rs_rX86_SP_64}; @@ -60,6 +67,24 @@ static constexpr RegStorage core_temps_arr_64[] = { rs_rAX, rs_rCX, rs_rDX, rs_rSI, rs_rDI, rs_r8, rs_r9, rs_r10, rs_r11 }; + +// How to add register to be available for promotion: +// 1) Remove register from array defining temp +// 2) Update ClobberCallerSave +// 3) Update JNI compiler ABI: +// 3.1) add reg in JniCallingConvention method +// 3.2) update CoreSpillMask/FpSpillMask +// 4) Update entrypoints +// 4.1) Update constants in asm_support_x86_64.h for new frame size +// 4.2) Remove entry in SmashCallerSaves +// 
4.3) Update jni_entrypoints to spill/unspill new callee save reg +// 4.4) Update quick_entrypoints to spill/unspill new callee save reg +// 5) Update runtime ABI +// 5.1) Update quick_method_frame_info with new required spills +// 5.2) Update QuickArgumentVisitor with new offsets to gprs and xmms +// Note that you cannot use register corresponding to incoming args +// according to ABI and QCG needs one additional XMM temp for +// bulk copy in preparation to call. static constexpr RegStorage core_temps_arr_64q[] = { rs_r0q, rs_r1q, rs_r2q, rs_r6q, rs_r7q, rs_r8q, rs_r9q, rs_r10q, rs_r11q @@ -69,14 +94,14 @@ static constexpr RegStorage sp_temps_arr_32[] = { }; static constexpr RegStorage sp_temps_arr_64[] = { rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7, - rs_fr8, rs_fr9, rs_fr10, rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15 + rs_fr8, rs_fr9, rs_fr10, rs_fr11 }; static constexpr RegStorage dp_temps_arr_32[] = { rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7, }; static constexpr RegStorage dp_temps_arr_64[] = { rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7, - rs_dr8, rs_dr9, rs_dr10, rs_dr11, rs_dr12, rs_dr13, rs_dr14, rs_dr15 + rs_dr8, rs_dr9, rs_dr10, rs_dr11 }; static constexpr RegStorage xp_temps_arr_32[] = { @@ -84,7 +109,7 @@ static constexpr RegStorage xp_temps_arr_32[] = { }; static constexpr RegStorage xp_temps_arr_64[] = { rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7, - rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15 + rs_xr8, rs_xr9, rs_xr10, rs_xr11 }; static constexpr ArrayRef<const RegStorage> empty_pool; @@ -95,6 +120,8 @@ static constexpr ArrayRef<const RegStorage> sp_regs_32(sp_regs_arr_32); static constexpr ArrayRef<const RegStorage> sp_regs_64(sp_regs_arr_64); static constexpr ArrayRef<const RegStorage> dp_regs_32(dp_regs_arr_32); static constexpr ArrayRef<const RegStorage> dp_regs_64(dp_regs_arr_64); +static constexpr ArrayRef<const RegStorage> xp_regs_32(xp_regs_arr_32); +static constexpr ArrayRef<const RegStorage> xp_regs_64(xp_regs_arr_64); static constexpr ArrayRef<const RegStorage> reserved_regs_32(reserved_regs_arr_32); static constexpr ArrayRef<const RegStorage> reserved_regs_64(reserved_regs_arr_64); static constexpr ArrayRef<const RegStorage> reserved_regs_64q(reserved_regs_arr_64q); @@ -437,21 +464,13 @@ bool X86Mir2Lir::IsByteRegister(RegStorage reg) { /* Clobber all regs that might be used by an external C call */ void X86Mir2Lir::ClobberCallerSave() { - Clobber(rs_rAX); - Clobber(rs_rCX); - Clobber(rs_rDX); - Clobber(rs_rBX); - - Clobber(rs_fr0); - Clobber(rs_fr1); - Clobber(rs_fr2); - Clobber(rs_fr3); - Clobber(rs_fr4); - Clobber(rs_fr5); - Clobber(rs_fr6); - Clobber(rs_fr7); - if (cu_->target64) { + Clobber(rs_rAX); + Clobber(rs_rCX); + Clobber(rs_rDX); + Clobber(rs_rSI); + Clobber(rs_rDI); + Clobber(rs_r8); Clobber(rs_r9); Clobber(rs_r10); @@ -461,11 +480,21 @@ void X86Mir2Lir::ClobberCallerSave() { Clobber(rs_fr9); Clobber(rs_fr10); Clobber(rs_fr11); - Clobber(rs_fr12); - Clobber(rs_fr13); - Clobber(rs_fr14); - Clobber(rs_fr15); + } else { + Clobber(rs_rAX); + Clobber(rs_rCX); + Clobber(rs_rDX); + Clobber(rs_rBX); } + + Clobber(rs_fr0); + Clobber(rs_fr1); + Clobber(rs_fr2); + Clobber(rs_fr3); + Clobber(rs_fr4); + Clobber(rs_fr5); + Clobber(rs_fr6); + Clobber(rs_fr7); } RegLocation X86Mir2Lir::GetReturnWideAlt() { @@ -599,11 +628,15 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() { // Target-specific adjustments. // Add in XMM registers. 
- const ArrayRef<const RegStorage> *xp_temps = cu_->target64 ? &xp_temps_64 : &xp_temps_32; - for (RegStorage reg : *xp_temps) { + const ArrayRef<const RegStorage> *xp_regs = cu_->target64 ? &xp_regs_64 : &xp_regs_32; + for (RegStorage reg : *xp_regs) { RegisterInfo* info = new (arena_) RegisterInfo(reg, GetRegMaskCommon(reg)); reginfo_map_.Put(reg.GetReg(), info); - info->SetIsTemp(true); + } + const ArrayRef<const RegStorage> *xp_temps = cu_->target64 ? &xp_temps_64 : &xp_temps_32; + for (RegStorage reg : *xp_temps) { + RegisterInfo* xp_reg_info = GetRegInfo(reg); + xp_reg_info->SetIsTemp(true); } // Alias single precision xmm to double xmms. @@ -665,9 +698,11 @@ void X86Mir2Lir::SpillCoreRegs() { // Spill mask not including fake return address register uint32_t mask = core_spill_mask_ & ~(1 << rs_rRET.GetRegNum()); int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * num_core_spills_); + OpSize size = cu_->target64 ? k64 : k32; for (int reg = 0; mask; mask >>= 1, reg++) { if (mask & 0x1) { - StoreWordDisp(rs_rX86_SP, offset, RegStorage::Solo32(reg)); + StoreBaseDisp(rs_rX86_SP, offset, cu_->target64 ? RegStorage::Solo64(reg) : RegStorage::Solo32(reg), + size, kNotVolatile); offset += GetInstructionSetPointerSize(cu_->instruction_set); } } @@ -680,14 +715,46 @@ void X86Mir2Lir::UnSpillCoreRegs() { // Spill mask not including fake return address register uint32_t mask = core_spill_mask_ & ~(1 << rs_rRET.GetRegNum()); int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * num_core_spills_); + OpSize size = cu_->target64 ? k64 : k32; for (int reg = 0; mask; mask >>= 1, reg++) { if (mask & 0x1) { - LoadWordDisp(rs_rX86_SP, offset, RegStorage::Solo32(reg)); + LoadBaseDisp(rs_rX86_SP, offset, cu_->target64 ? 
RegStorage::Solo64(reg) : RegStorage::Solo32(reg), + size, kNotVolatile); offset += GetInstructionSetPointerSize(cu_->instruction_set); } } } +void X86Mir2Lir::SpillFPRegs() { + if (num_fp_spills_ == 0) { + return; + } + uint32_t mask = fp_spill_mask_; + int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * (num_fp_spills_ + num_core_spills_)); + for (int reg = 0; mask; mask >>= 1, reg++) { + if (mask & 0x1) { + StoreBaseDisp(rs_rX86_SP, offset, RegStorage::FloatSolo64(reg), + k64, kNotVolatile); + offset += sizeof(double); + } + } +} +void X86Mir2Lir::UnSpillFPRegs() { + if (num_fp_spills_ == 0) { + return; + } + uint32_t mask = fp_spill_mask_; + int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * (num_fp_spills_ + num_core_spills_)); + for (int reg = 0; mask; mask >>= 1, reg++) { + if (mask & 0x1) { + LoadBaseDisp(rs_rX86_SP, offset, RegStorage::FloatSolo64(reg), + k64, kNotVolatile); + offset += sizeof(double); + } + } +} + + bool X86Mir2Lir::IsUnconditionalBranch(LIR* lir) { return (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jmp32); } diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc index 657160ffd1..5c7c91b5b5 100644 --- a/compiler/dex/quick/x86/utility_x86.cc +++ b/compiler/dex/quick/x86/utility_x86.cc @@ -917,7 +917,7 @@ void X86Mir2Lir::AnalyzeBB(BasicBlock * bb) { for (MIR *mir = bb->first_mir_insn; mir != NULL; mir = mir->next) { int opcode = mir->dalvikInsn.opcode; - if (MIRGraph::IsPseudoMirOp(opcode)) { + if (MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { AnalyzeExtendedMIR(opcode, bb, mir); } else { AnalyzeMIR(opcode, bb, mir); diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h index 2789923bb9..56573810ca 100644 --- a/compiler/dex/quick/x86/x86_lir.h +++ b/compiler/dex/quick/x86/x86_lir.h @@ -66,7 +66,9 @@ namespace art { * XMM6: caller | caller, arg7 | caller, scratch | caller, arg7, scratch * XMM7: caller | caller, arg8 | caller, scratch | caller, arg8, scratch * --- x86-64/x32 registers - * XMM8 .. 15: caller save available as scratch registers for ART. + * XMM8 .. 11: caller save available as scratch registers for ART. + * XMM12 .. 15: callee save available as promoted registers for ART. + * This change (XMM12..15) is for QCG only, for others they are caller save. * * X87 is a necessary evil outside of ART code for x86: * ST0: x86 float/double native return value, caller save diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc index db383c4d0b..892b30284f 100644 --- a/compiler/dex/vreg_analysis.cc +++ b/compiler/dex/vreg_analysis.cc @@ -251,7 +251,8 @@ bool MIRGraph::InferTypeAndSize(BasicBlock* bb, MIR* mir, bool changed) { // Special-case handling for format 35c/3rc invokes Instruction::Code opcode = mir->dalvikInsn.opcode; - int flags = IsPseudoMirOp(opcode) ? 0 : Instruction::FlagsOf(mir->dalvikInsn.opcode); + int flags = MIR::DecodedInstruction::IsPseudoMirOp(opcode) ? 
+ 0 : Instruction::FlagsOf(mir->dalvikInsn.opcode); if ((flags & Instruction::kInvoke) && (attrs & (DF_FORMAT_35C | DF_FORMAT_3RC))) { DCHECK_EQ(next, 0); diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc index 5febed24fe..525f05c522 100644 --- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc +++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc @@ -130,6 +130,10 @@ X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static, bool is_s callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R13)); callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R14)); callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R15)); + callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM12)); + callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM13)); + callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM14)); + callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM15)); } uint32_t X86_64JniCallingConvention::CoreSpillMask() const { @@ -137,6 +141,10 @@ uint32_t X86_64JniCallingConvention::CoreSpillMask() const { 1 << kNumberOfCpuRegisters; } +uint32_t X86_64JniCallingConvention::FpSpillMask() const { + return 1 << XMM12 | 1 << XMM13 | 1 << XMM14 | 1 << XMM15; +} + size_t X86_64JniCallingConvention::FrameSize() { // Method*, return address and callee save area size, local reference segment state size_t frame_data_size = sizeof(StackReference<mirror::ArtMethod>) + diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h index 1ba5353289..7a90c6e94e 100644 --- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h +++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h @@ -61,9 +61,7 @@ class X86_64JniCallingConvention FINAL : public JniCallingConvention { } ManagedRegister ReturnScratchRegister() const OVERRIDE; uint32_t CoreSpillMask() const OVERRIDE; - uint32_t FpSpillMask() const OVERRIDE { - return 0; - } + uint32_t FpSpillMask() const OVERRIDE; bool IsCurrentParamInRegister() OVERRIDE; bool IsCurrentParamOnStack() OVERRIDE; ManagedRegister CurrentParamRegister() OVERRIDE; diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index 4d5d613015..78738d8934 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -1671,16 +1671,31 @@ void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg, const std::vector<ManagedRegister>& spill_regs, const ManagedRegisterEntrySpills& entry_spills) { CHECK_ALIGNED(frame_size, kStackAlignment); + int gpr_count = 0; for (int i = spill_regs.size() - 1; i >= 0; --i) { - pushq(spill_regs.at(i).AsX86_64().AsCpuRegister()); + x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64(); + if (spill.IsCpuRegister()) { + pushq(spill.AsCpuRegister()); + gpr_count++; + } } // return address then method on stack - addq(CpuRegister(RSP), Immediate(-static_cast<int64_t>(frame_size) + (spill_regs.size() * kFramePointerSize) + - sizeof(StackReference<mirror::ArtMethod>) /*method*/ + - kFramePointerSize /*return address*/)); + int64_t rest_of_frame = static_cast<int64_t>(frame_size) + - (gpr_count * kFramePointerSize) + - kFramePointerSize /*return address*/; + subq(CpuRegister(RSP), Immediate(rest_of_frame)); + // spill xmms + int64_t offset = rest_of_frame; + for (int i = spill_regs.size() - 
1; i >= 0; --i) { + x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64(); + if (spill.IsXmmRegister()) { + offset -= sizeof(double); + movsd(Address(CpuRegister(RSP), offset), spill.AsXmmRegister()); + } + } DCHECK_EQ(4U, sizeof(StackReference<mirror::ArtMethod>)); - subq(CpuRegister(RSP), Immediate(4)); + movl(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister()); for (size_t i = 0; i < entry_spills.size(); ++i) { @@ -1707,9 +1722,24 @@ void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg, void X86_64Assembler::RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& spill_regs) { CHECK_ALIGNED(frame_size, kStackAlignment); - addq(CpuRegister(RSP), Immediate(static_cast<int64_t>(frame_size) - (spill_regs.size() * kFramePointerSize) - kFramePointerSize)); + int gpr_count = 0; + // unspill xmms + int64_t offset = static_cast<int64_t>(frame_size) - (spill_regs.size() * kFramePointerSize) - 2 * kFramePointerSize; for (size_t i = 0; i < spill_regs.size(); ++i) { - popq(spill_regs.at(i).AsX86_64().AsCpuRegister()); + x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64(); + if (spill.IsXmmRegister()) { + offset += sizeof(double); + movsd(spill.AsXmmRegister(), Address(CpuRegister(RSP), offset)); + } else { + gpr_count++; + } + } + addq(CpuRegister(RSP), Immediate(static_cast<int64_t>(frame_size) - (gpr_count * kFramePointerSize) - kFramePointerSize)); + for (size_t i = 0; i < spill_regs.size(); ++i) { + x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64(); + if (spill.IsCpuRegister()) { + popq(spill.AsCpuRegister()); + } } ret(); } diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index f7bad8b057..dc1758ffdf 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -246,11 +246,9 @@ std::string buildframe_test_fn(x86_64::X86_64Assembler* assembler) { str << "pushq %rsi\n"; str << "pushq %r10\n"; // 2) Move down the stack pointer. - ssize_t displacement = -static_cast<ssize_t>(frame_size) + spill_regs.size() * 8 + - sizeof(StackReference<mirror::ArtMethod>) + 8; - str << "addq $" << displacement << ", %rsp\n"; - // 3) Make space for method reference, and store it. - str << "subq $4, %rsp\n"; + ssize_t displacement = static_cast<ssize_t>(frame_size) - (spill_regs.size() * 8 + 8); + str << "subq $" << displacement << ", %rsp\n"; + // 3) Store method reference. str << "movl %edi, (%rsp)\n"; // 4) Entry spills. 
str << "movq %rax, " << frame_size + 0 << "(%rsp)\n"; diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h index bff8501cf2..05d0ef8761 100644 --- a/runtime/arch/x86_64/asm_support_x86_64.h +++ b/runtime/arch/x86_64/asm_support_x86_64.h @@ -35,9 +35,9 @@ // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu #define THREAD_ID_OFFSET 12 -#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 64 -#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 64 -#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 176 +#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 64 + 4*8 +#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 64 + 4*8 +#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 176 + 4*8 // Expected size of a heap reference #define HEAP_REFERENCE_SIZE 4 diff --git a/runtime/arch/x86_64/context_x86_64.cc b/runtime/arch/x86_64/context_x86_64.cc index e1f47ee3d4..7699eaf9d4 100644 --- a/runtime/arch/x86_64/context_x86_64.cc +++ b/runtime/arch/x86_64/context_x86_64.cc @@ -78,6 +78,18 @@ void X86_64Context::SmashCallerSaves() { gprs_[R9] = nullptr; gprs_[R10] = nullptr; gprs_[R11] = nullptr; + fprs_[XMM0] = nullptr; + fprs_[XMM1] = nullptr; + fprs_[XMM2] = nullptr; + fprs_[XMM3] = nullptr; + fprs_[XMM4] = nullptr; + fprs_[XMM5] = nullptr; + fprs_[XMM6] = nullptr; + fprs_[XMM7] = nullptr; + fprs_[XMM8] = nullptr; + fprs_[XMM9] = nullptr; + fprs_[XMM10] = nullptr; + fprs_[XMM11] = nullptr; } bool X86_64Context::SetGPR(uint32_t reg, uintptr_t value) { @@ -102,41 +114,26 @@ bool X86_64Context::SetFPR(uint32_t reg, uintptr_t value) { } } +extern "C" void art_quick_do_long_jump(uintptr_t*, uintptr_t*); + void X86_64Context::DoLongJump() { #if defined(__x86_64__) - // Array of GPR values, filled from the context backward for the long jump pop. We add a slot at - // the top for the stack pointer that doesn't get popped in a pop-all. - volatile uintptr_t gprs[kNumberOfCpuRegisters + 1]; + uintptr_t gprs[kNumberOfCpuRegisters + 1]; + uintptr_t fprs[kNumberOfFloatRegisters]; + for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) { gprs[kNumberOfCpuRegisters - i - 1] = gprs_[i] != nullptr ? *gprs_[i] : X86_64Context::kBadGprBase + i; } + for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) { + fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : X86_64Context::kBadFprBase + i; + } + // We want to load the stack pointer one slot below so that the ret will pop eip. uintptr_t rsp = gprs[kNumberOfCpuRegisters - RSP - 1] - kWordSize; gprs[kNumberOfCpuRegisters] = rsp; *(reinterpret_cast<uintptr_t*>(rsp)) = rip_; - __asm__ __volatile__( - "movq %0, %%rsp\n\t" // RSP points to gprs. - "popq %%r15\n\t" // Load all registers except RSP and RIP with values in gprs. - "popq %%r14\n\t" - "popq %%r13\n\t" - "popq %%r12\n\t" - "popq %%r11\n\t" - "popq %%r10\n\t" - "popq %%r9\n\t" - "popq %%r8\n\t" - "popq %%rdi\n\t" - "popq %%rsi\n\t" - "popq %%rbp\n\t" - "addq $8, %%rsp\n\t" - "popq %%rbx\n\t" - "popq %%rdx\n\t" - "popq %%rcx\n\t" - "popq %%rax\n\t" - "popq %%rsp\n\t" // Load stack pointer. - "ret\n\t" // From higher in the stack pop rip. - : // output. - : "g"(&gprs[0]) // input. - :); // clobber. 
+ + art_quick_do_long_jump(gprs, fprs); #else UNIMPLEMENTED(FATAL); #endif diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc index 609d1c6500..204d52c723 100644 --- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc +++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc @@ -35,7 +35,7 @@ extern "C" void art_portable_resolution_trampoline(mirror::ArtMethod*); extern "C" void art_portable_to_interpreter_bridge(mirror::ArtMethod*); // Cast entrypoints. -extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass, +extern "C" uint32_t art_quick_assignable_from_code(const mirror::Class* klass, const mirror::Class* ref_class); extern "C" void art_quick_check_cast(void*, void*); @@ -129,7 +129,7 @@ void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints, ResetQuickAllocEntryPoints(qpoints); // Cast - qpoints->pInstanceofNonTrivial = artIsAssignableFromCode; + qpoints->pInstanceofNonTrivial = art_quick_assignable_from_code; qpoints->pCheckCast = art_quick_check_cast; // DexCache diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S index d668797ba4..f6736df11f 100644 --- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S @@ -28,8 +28,8 @@ DEFINE_FUNCTION art_jni_dlsym_lookup_stub PUSH rdx // Arg. PUSH rcx // Arg. // Create space for FPR args, plus padding for alignment - subq LITERAL(72), %rsp - CFI_ADJUST_CFA_OFFSET(72) + subq LITERAL(72 + 4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(72 + 4 * 8) // Save FPRs. movq %xmm0, 0(%rsp) movq %xmm1, 8(%rsp) @@ -39,6 +39,10 @@ DEFINE_FUNCTION art_jni_dlsym_lookup_stub movq %xmm5, 40(%rsp) movq %xmm6, 48(%rsp) movq %xmm7, 56(%rsp) + movq %xmm12, 64(%rsp) + movq %xmm13, 72(%rsp) + movq %xmm14, 80(%rsp) + movq %xmm15, 88(%rsp) // prepare call movq %gs:THREAD_SELF_OFFSET, %rdi // RDI := Thread::Current() // call @@ -52,8 +56,12 @@ DEFINE_FUNCTION art_jni_dlsym_lookup_stub movq 40(%rsp), %xmm5 movq 48(%rsp), %xmm6 movq 56(%rsp), %xmm7 - addq LITERAL(72), %rsp - CFI_ADJUST_CFA_OFFSET(-72) + movq 64(%rsp), %xmm12 + movq 72(%rsp), %xmm13 + movq 80(%rsp), %xmm14 + movq 88(%rsp), %xmm15 + addq LITERAL(72 + 4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(-72 - 4 * 8) POP rcx // Arg. POP rdx // Arg. POP rsi // Arg. diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 8fa947c9b3..7f7226c0ad 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -16,6 +16,26 @@ #include "asm_support_x86_64.S" +MACRO0(SETUP_FP_CALLEE_SAVE_FRAME) + // Create space for ART FP callee-saved registers + subq LITERAL(4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(4 * 8) + movq %xmm12, 0(%rsp) + movq %xmm13, 8(%rsp) + movq %xmm14, 16(%rsp) + movq %xmm15, 24(%rsp) +END_MACRO + +MACRO0(RESTORE_FP_CALLEE_SAVE_FRAME) + // Restore ART FP callee-saved registers + movq 0(%rsp), %xmm12 + movq 8(%rsp), %xmm13 + movq 16(%rsp), %xmm14 + movq 24(%rsp), %xmm15 + addq LITERAL(4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(- 4 * 8) +END_MACRO + // For x86, the CFA is esp+4, the address above the pushed return address on the stack. /* @@ -37,6 +57,14 @@ MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME) PUSH r12 // Callee save. PUSH rbp // Callee save. PUSH rbx // Callee save. + // Create space for FPR args, plus padding for alignment + subq LITERAL(4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(4 * 8) + // Save FPRs. 
+ movq %xmm12, 0(%rsp) + movq %xmm13, 8(%rsp) + movq %xmm14, 16(%rsp) + movq %xmm15, 24(%rsp) subq MACRO_LITERAL(8), %rsp // Space for Method* (also aligns the frame). CFI_ADJUST_CFA_OFFSET(8) // R10 := ArtMethod* for save all callee save frame method. @@ -46,7 +74,7 @@ MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME) // Ugly compile-time check, but we only have the preprocessor. // Last +8: implicit return address pushed on stack when caller made call. -#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 6*8 + 8 + 8) +#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 6*8 + 4*8 + 8 + 8) #error "SAVE_ALL_CALLEE_SAVE_FRAME(X86_64) size not as expected." #endif #endif // __APPLE__ @@ -71,8 +99,14 @@ MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME) PUSH r12 // Callee save. PUSH rbp // Callee save. PUSH rbx // Callee save. - subq MACRO_LITERAL(8), %rsp // Space for Method* (also aligns the frame). - CFI_ADJUST_CFA_OFFSET(8) + // Create space for FPR args, plus padding for alignment + subq LITERAL(8 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(8 + 4*8) + // Save FPRs. + movq %xmm12, 8(%rsp) + movq %xmm13, 16(%rsp) + movq %xmm14, 24(%rsp) + movq %xmm15, 32(%rsp) // R10 := ArtMethod* for refs only callee save frame method. movq RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10 // Store ArtMethod* to bottom of stack. @@ -80,15 +114,19 @@ MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME) // Ugly compile-time check, but we only have the preprocessor. // Last +8: implicit return address pushed on stack when caller made call. -#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 6*8 + 8 + 8) +#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 6*8 + 4*8 + 8 + 8) #error "REFS_ONLY_CALLEE_SAVE_FRAME(X86_64) size not as expected." #endif #endif // __APPLE__ END_MACRO MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME) - addq MACRO_LITERAL(8), %rsp - CFI_ADJUST_CFA_OFFSET(-8) + movq 8(%rsp), %xmm12 + movq 16(%rsp), %xmm13 + movq 24(%rsp), %xmm14 + movq 32(%rsp), %xmm15 + addq LITERAL(8 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(-8 - 4*8) // TODO: optimize by not restoring callee-saves restored by the ABI POP rbx POP rbp @@ -123,8 +161,8 @@ MACRO0(SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME) PUSH rdx // Quick arg 2. PUSH rcx // Quick arg 3. // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*. - subq MACRO_LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(80) + subq MACRO_LITERAL(80 + 4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(80 + 4 * 8) // R10 := ArtMethod* for ref and args callee save frame method. movq RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10 // Save FPRs. @@ -136,12 +174,16 @@ MACRO0(SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME) movq %xmm5, 56(%rsp) movq %xmm6, 64(%rsp) movq %xmm7, 72(%rsp) + movq %xmm12, 80(%rsp) + movq %xmm13, 88(%rsp) + movq %xmm14, 96(%rsp) + movq %xmm15, 104(%rsp) // Store ArtMethod* to bottom of stack. movq %r10, 0(%rsp) // Ugly compile-time check, but we only have the preprocessor. // Last +8: implicit return address pushed on stack when caller made call. -#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11*8 + 80 + 8) +#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11*8 + 4*8 + 80 + 8) #error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(X86_64) size not as expected." 
#endif #endif // __APPLE__ @@ -157,8 +199,12 @@ MACRO0(RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME) movq 56(%rsp), %xmm5 movq 64(%rsp), %xmm6 movq 72(%rsp), %xmm7 - addq MACRO_LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(-80) + movq 80(%rsp), %xmm12 + movq 88(%rsp), %xmm13 + movq 96(%rsp), %xmm14 + movq 104(%rsp), %xmm15 + addq MACRO_LITERAL(80 + 4 * 8), %rsp + CFI_ADJUST_CFA_OFFSET(-(80 + 4 * 8)) // Restore callee and GPR args, mixed together to agree with core spills bitmap. POP rcx POP rdx @@ -536,6 +582,58 @@ DEFINE_FUNCTION art_quick_invoke_static_stub #endif // __APPLE__ END_FUNCTION art_quick_invoke_static_stub + /* + * Long jump stub. + * On entry: + * rdi = gprs + * rsi = fprs + */ +DEFINE_FUNCTION art_quick_do_long_jump +#if defined(__APPLE__) + int3 + int3 +#else + // Restore FPRs. + movq 0(%rsi), %xmm0 + movq 8(%rsi), %xmm1 + movq 16(%rsi), %xmm2 + movq 24(%rsi), %xmm3 + movq 32(%rsi), %xmm4 + movq 40(%rsi), %xmm5 + movq 48(%rsi), %xmm6 + movq 56(%rsi), %xmm7 + movq 64(%rsi), %xmm8 + movq 72(%rsi), %xmm9 + movq 80(%rsi), %xmm10 + movq 88(%rsi), %xmm11 + movq 96(%rsi), %xmm12 + movq 104(%rsi), %xmm13 + movq 112(%rsi), %xmm14 + movq 120(%rsi), %xmm15 + // Restore FPRs. + movq %rdi, %rsp // RSP points to gprs. + // Load all registers except RSP and RIP with values in gprs. + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rdi + popq %rsi + popq %rbp + addq LITERAL(8), %rsp // Skip rsp + popq %rbx + popq %rdx + popq %rcx + popq %rax + popq %rsp // Load stack pointer. + ret // From higher in the stack pop rip. +#endif // __APPLE__ +END_FUNCTION art_quick_do_long_jump + MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro) DEFINE_FUNCTION VAR(c_name, 0) SETUP_REF_ONLY_CALLEE_SAVE_FRAME // save ref containing registers for GC @@ -820,13 +918,17 @@ END_FUNCTION art_quick_unlock_object DEFINE_FUNCTION art_quick_check_cast PUSH rdi // Save args for exc PUSH rsi + SETUP_FP_CALLEE_SAVE_FRAME call PLT_SYMBOL(artIsAssignableFromCode) // (Class* klass, Class* ref_klass) testq %rax, %rax jz 1f // jump forward if not assignable + RESTORE_FP_CALLEE_SAVE_FRAME addq LITERAL(16), %rsp // pop arguments CFI_ADJUST_CFA_OFFSET(-16) + ret 1: + RESTORE_FP_CALLEE_SAVE_FRAME POP rsi // Pop arguments POP rdi SETUP_SAVE_ALL_CALLEE_SAVE_FRAME // save all registers as basis for long jump context @@ -907,6 +1009,7 @@ DEFINE_FUNCTION art_quick_aput_obj PUSH rdx subq LITERAL(8), %rsp // Alignment padding. CFI_ADJUST_CFA_OFFSET(8) + SETUP_FP_CALLEE_SAVE_FRAME // "Uncompress" = do nothing, as already zero-extended on load. movl CLASS_OFFSET(%edx), %esi // Pass arg2 = value's class. @@ -918,6 +1021,7 @@ DEFINE_FUNCTION art_quick_aput_obj testq %rax, %rax jz .Lthrow_array_store_exception + RESTORE_FP_CALLEE_SAVE_FRAME // Restore arguments. addq LITERAL(8), %rsp CFI_ADJUST_CFA_OFFSET(-8) @@ -934,6 +1038,7 @@ DEFINE_FUNCTION art_quick_aput_obj // movb %dl, (%rdx, %rdi) ret .Lthrow_array_store_exception: + RESTORE_FP_CALLEE_SAVE_FRAME // Restore arguments. addq LITERAL(8), %rsp CFI_ADJUST_CFA_OFFSET(-8) @@ -1012,8 +1117,8 @@ DEFINE_FUNCTION art_quick_proxy_invoke_handler PUSH rdx // Quick arg 2. PUSH rcx // Quick arg 3. // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*. - subq LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(80) + subq LITERAL(80 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(80 + 4*8) // Save FPRs. 
movq %xmm0, 16(%rsp) movq %xmm1, 24(%rsp) @@ -1023,14 +1128,18 @@ DEFINE_FUNCTION art_quick_proxy_invoke_handler movq %xmm5, 56(%rsp) movq %xmm6, 64(%rsp) movq %xmm7, 72(%rsp) + movq %xmm12, 80(%rsp) + movq %xmm13, 88(%rsp) + movq %xmm14, 96(%rsp) + movq %xmm15, 104(%rsp) // Store proxy method to bottom of stack. movq %rdi, 0(%rsp) movq %gs:THREAD_SELF_OFFSET, %rdx // Pass Thread::Current(). movq %rsp, %rcx // Pass SP. call PLT_SYMBOL(artQuickProxyInvokeHandler) // (proxy method, receiver, Thread*, SP) movq %rax, %xmm0 // Copy return value in case of float returns. - addq LITERAL(168), %rsp // Pop arguments. - CFI_ADJUST_CFA_OFFSET(-168) + addq LITERAL(168 + 4*8), %rsp // Pop arguments. + CFI_ADJUST_CFA_OFFSET(-168 - 4*8) RETURN_OR_DELIVER_PENDING_EXCEPTION END_FUNCTION art_quick_proxy_invoke_handler @@ -1156,8 +1265,8 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline PUSH rdx // Quick arg 2. PUSH rcx // Quick arg 3. // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*. - subq LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(80) + subq LITERAL(80 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(80 + 4*8) // Save FPRs. movq %xmm0, 16(%rsp) movq %xmm1, 24(%rsp) @@ -1167,6 +1276,10 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq %xmm5, 56(%rsp) movq %xmm6, 64(%rsp) movq %xmm7, 72(%rsp) + movq %xmm12, 80(%rsp) + movq %xmm13, 88(%rsp) + movq %xmm14, 96(%rsp) + movq %xmm15, 104(%rsp) movq %rdi, 0(%rsp) // Store native ArtMethod* to bottom of stack. movq %rsp, %rbp // save SP at (old) callee-save frame CFI_DEF_CFA_REGISTER(rbp) @@ -1260,9 +1373,13 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq 56(%rsp), %xmm5 movq 64(%rsp), %xmm6 movq 72(%rsp), %xmm7 + movq 80(%rsp), %xmm12 + movq 88(%rsp), %xmm13 + movq 96(%rsp), %xmm14 + movq 104(%rsp), %xmm15 // was 80 bytes - addq LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(-80) + addq LITERAL(80 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(-80 - 4*8) // Save callee and GPR args, mixed together to agree with core spills bitmap. POP rcx // Arg. POP rdx // Arg. @@ -1292,9 +1409,13 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq 56(%rsp), %xmm5 movq 64(%rsp), %xmm6 movq 72(%rsp), %xmm7 - // was 80 bytes - addq LITERAL(80), %rsp - CFI_ADJUST_CFA_OFFSET(-80) + movq 80(%rsp), %xmm12 + movq 88(%rsp), %xmm13 + movq 96(%rsp), %xmm14 + movq 104(%rsp), %xmm15 + // was 80 + 32 bytes + addq LITERAL(80 + 4*8), %rsp + CFI_ADJUST_CFA_OFFSET(-80 - 4*8) // Save callee and GPR args, mixed together to agree with core spills bitmap. POP rcx // Arg. POP rdx // Arg. 
@@ -1450,3 +1571,10 @@ DEFINE_FUNCTION art_quick_string_compareto END_FUNCTION art_quick_string_compareto UNIMPLEMENTED art_quick_memcmp16 + +DEFINE_FUNCTION art_quick_assignable_from_code + SETUP_FP_CALLEE_SAVE_FRAME + call PLT_SYMBOL(artIsAssignableFromCode) // (const mirror::Class*, const mirror::Class*) + RESTORE_FP_CALLEE_SAVE_FRAME + ret +END_FUNCTION art_quick_assignable_from_code diff --git a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h index 618390903b..53aa212a88 100644 --- a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h +++ b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h @@ -34,6 +34,9 @@ static constexpr uint32_t kX86_64CalleeSaveFpArgSpills = (1 << art::x86_64::XMM0) | (1 << art::x86_64::XMM1) | (1 << art::x86_64::XMM2) | (1 << art::x86_64::XMM3) | (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) | (1 << art::x86_64::XMM6) | (1 << art::x86_64::XMM7); +static constexpr uint32_t kX86_64CalleeSaveFpSpills = + (1 << art::x86_64::XMM12) | (1 << art::x86_64::XMM13) | + (1 << art::x86_64::XMM14) | (1 << art::x86_64::XMM15); constexpr uint32_t X86_64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) { return kX86_64CalleeSaveRefSpills | @@ -42,7 +45,8 @@ constexpr uint32_t X86_64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) { } constexpr uint32_t X86_64CalleeSaveFpSpills(Runtime::CalleeSaveType type) { - return (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0); + return kX86_64CalleeSaveFpSpills | + (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0); } constexpr uint32_t X86_64CalleeSaveFrameSize(Runtime::CalleeSaveType type) { diff --git a/runtime/arch/x86_64/registers_x86_64.cc b/runtime/arch/x86_64/registers_x86_64.cc index 38f3494502..f29c42652b 100644 --- a/runtime/arch/x86_64/registers_x86_64.cc +++ b/runtime/arch/x86_64/registers_x86_64.cc @@ -34,5 +34,14 @@ std::ostream& operator<<(std::ostream& os, const Register& rhs) { return os; } +std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs) { + if (rhs >= XMM0 && rhs <= XMM15) { + os << "xmm" << static_cast<int>(rhs); + } else { + os << "Register[" << static_cast<int>(rhs) << "]"; + } + return os; +} + } // namespace x86_64 } // namespace art diff --git a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc index d161d0b9ed..2edcb78be3 100644 --- a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc @@ -32,10 +32,15 @@ extern "C" const void* artInstrumentationMethodEntryFromCode(mirror::ArtMethod* SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs); instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation(); - const void* result = instrumentation->GetQuickCodeFor(method); + const void* result; + if (instrumentation->IsDeoptimized(method)) { + result = GetQuickToInterpreterBridge(); + } else { + result = instrumentation->GetQuickCodeFor(method); + } DCHECK(result != GetQuickToInterpreterBridgeTrampoline(Runtime::Current()->GetClassLinker())); bool interpreter_entry = (result == GetQuickToInterpreterBridge()); - instrumentation->PushInstrumentationStackFrame(self, method->IsStatic() ? NULL : this_object, + instrumentation->PushInstrumentationStackFrame(self, method->IsStatic() ? 
nullptr : this_object, method, lr, interpreter_entry); CHECK(result != NULL) << PrettyMethod(method); return result; diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 95cb85eefc..2a66f2fe3b 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -175,8 +175,8 @@ class QuickArgumentVisitor { static constexpr size_t kNumQuickGprArgs = 5; // 5 arguments passed in GPRs. static constexpr size_t kNumQuickFprArgs = 8; // 8 arguments passed in FPRs. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16; // Offset of first FPR arg. - static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80; // Offset of first GPR arg. - static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168; // Offset of return address. + static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80 + 4*8; // Offset of first GPR arg. + static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168 + 4*8; // Offset of return address. static size_t GprIndexToGprOffset(uint32_t gpr_index) { switch (gpr_index) { case 0: return (4 * GetBytesPerGprSpillLocation(kRuntimeISA)); diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc index 722576f164..c66e80d2f1 100644 --- a/runtime/gc/allocator/rosalloc.cc +++ b/runtime/gc/allocator/rosalloc.cc @@ -2112,30 +2112,40 @@ size_t RosAlloc::ReleasePages() { // result in occasionally not releasing pages which we could release. byte pm = page_map_[i]; switch (pm) { + case kPageMapReleased: + // Fall through. case kPageMapEmpty: { - // Only lock if we have an empty page since we want to prevent other threads racing in. + // This is currently the start of a free page run. + // Acquire the lock to prevent other threads racing in and modifying the page map. MutexLock mu(self, lock_); // Check that it's still empty after we acquired the lock since another thread could have // raced in and placed an allocation here. - pm = page_map_[i]; - if (LIKELY(pm == kPageMapEmpty)) { - // The start of a free page run. Release pages. + if (IsFreePage(i)) { + // Free page runs can start with a released page if we coalesced a released page free + // page run with an empty page run. FreePageRun* fpr = reinterpret_cast<FreePageRun*>(base_ + i * kPageSize); - DCHECK(free_page_runs_.find(fpr) != free_page_runs_.end()); - size_t fpr_size = fpr->ByteSize(this); - DCHECK(IsAligned<kPageSize>(fpr_size)); - byte* start = reinterpret_cast<byte*>(fpr); - reclaimed_bytes += ReleasePageRange(start, start + fpr_size); - i += fpr_size / kPageSize; - DCHECK_LE(i, page_map_size_); + // There is a race condition where FreePage can coalesce fpr with the previous + // free page run before we acquire lock_. In that case free_page_runs_.find will not find + // a run starting at fpr. To handle this race, we skip reclaiming the page range and go + // to the next page. + if (free_page_runs_.find(fpr) != free_page_runs_.end()) { + size_t fpr_size = fpr->ByteSize(this); + DCHECK(IsAligned<kPageSize>(fpr_size)); + byte* start = reinterpret_cast<byte*>(fpr); + reclaimed_bytes += ReleasePageRange(start, start + fpr_size); + size_t pages = fpr_size / kPageSize; + CHECK_GT(pages, 0U) << "Infinite loop probable"; + i += pages; + DCHECK_LE(i, page_map_size_); + break; + } } - break; + // Fall through. } case kPageMapLargeObject: // Fall through. 
case kPageMapLargeObjectPart: // Fall through. case kPageMapRun: // Fall through. case kPageMapRunPart: // Fall through. - case kPageMapReleased: // Fall through since it is already released. ++i; break; // Skip. default: @@ -2175,6 +2185,34 @@ size_t RosAlloc::ReleasePageRange(byte* start, byte* end) { return reclaimed_bytes; } +void RosAlloc::LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) { + Thread* self = Thread::Current(); + size_t largest_continuous_free_pages = 0; + WriterMutexLock wmu(self, bulk_free_lock_); + MutexLock mu(self, lock_); + for (FreePageRun* fpr : free_page_runs_) { + largest_continuous_free_pages = std::max(largest_continuous_free_pages, + fpr->ByteSize(this)); + } + if (failed_alloc_bytes > kLargeSizeThreshold) { + // Large allocation. + size_t required_bytes = RoundUp(failed_alloc_bytes, kPageSize); + if (required_bytes > largest_continuous_free_pages) { + os << "; failed due to fragmentation (required continguous free " + << required_bytes << " bytes where largest contiguous free " + << largest_continuous_free_pages << " bytes)"; + } + } else { + // Non-large allocation. + size_t required_bytes = numOfPages[SizeToIndex(failed_alloc_bytes)] * kPageSize; + if (required_bytes > largest_continuous_free_pages) { + os << "; failed due to fragmentation (required continguous free " + << required_bytes << " bytes for a new buffer where largest contiguous free " + << largest_continuous_free_pages << " bytes)"; + } + } +} + } // namespace allocator } // namespace gc } // namespace art diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h index fad0dc888e..85a8225807 100644 --- a/runtime/gc/allocator/rosalloc.h +++ b/runtime/gc/allocator/rosalloc.h @@ -590,6 +590,8 @@ class RosAlloc { // Verify for debugging. void Verify() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_); + + void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes); }; } // namespace allocator diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index e9adca07c6..19715e9331 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -805,37 +805,23 @@ space::ImageSpace* Heap::GetImageSpace() const { return NULL; } -static void MSpaceChunkCallback(void* start, void* end, size_t used_bytes, void* arg) { - size_t chunk_size = reinterpret_cast<uint8_t*>(end) - reinterpret_cast<uint8_t*>(start); - if (used_bytes < chunk_size) { - size_t chunk_free_bytes = chunk_size - used_bytes; - size_t& max_contiguous_allocation = *reinterpret_cast<size_t*>(arg); - max_contiguous_allocation = std::max(max_contiguous_allocation, chunk_free_bytes); - } -} - -void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, bool large_object_allocation) { +void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type) { std::ostringstream oss; size_t total_bytes_free = GetFreeMemory(); oss << "Failed to allocate a " << byte_count << " byte allocation with " << total_bytes_free << " free bytes"; // If the allocation failed due to fragmentation, print out the largest continuous allocation. - if (!large_object_allocation && total_bytes_free >= byte_count) { - size_t max_contiguous_allocation = 0; - for (const auto& space : continuous_spaces_) { - if (space->IsMallocSpace()) { - // To allow the Walk/InspectAll() to exclusively-lock the mutator - // lock, temporarily release the shared access to the mutator - // lock here by transitioning to the suspended state. 
- Locks::mutator_lock_->AssertSharedHeld(self); - self->TransitionFromRunnableToSuspended(kSuspended); - space->AsMallocSpace()->Walk(MSpaceChunkCallback, &max_contiguous_allocation); - self->TransitionFromSuspendedToRunnable(); - Locks::mutator_lock_->AssertSharedHeld(self); - } + if (allocator_type != kAllocatorTypeLOS && total_bytes_free >= byte_count) { + space::MallocSpace* space = nullptr; + if (allocator_type == kAllocatorTypeNonMoving) { + space = non_moving_space_; + } else if (allocator_type == kAllocatorTypeRosAlloc || + allocator_type == kAllocatorTypeDlMalloc) { + space = main_space_; + } + if (space != nullptr) { + space->LogFragmentationAllocFailure(oss, byte_count); } - oss << "; failed due to fragmentation (largest possible contiguous allocation " - << max_contiguous_allocation << " bytes)"; } self->ThrowOutOfMemoryError(oss.str().c_str()); } @@ -1188,7 +1174,7 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocat } ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated, usable_size); if (ptr == nullptr) { - ThrowOutOfMemoryError(self, alloc_size, allocator == kAllocatorTypeLOS); + ThrowOutOfMemoryError(self, alloc_size, allocator); } return ptr; } diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index c9ea03e45c..86dab21008 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -120,7 +120,7 @@ class Heap { static constexpr size_t kDefaultStartingSize = kPageSize; static constexpr size_t kDefaultInitialSize = 2 * MB; - static constexpr size_t kDefaultMaximumSize = 32 * MB; + static constexpr size_t kDefaultMaximumSize = 256 * MB; static constexpr size_t kDefaultMaxFree = 2 * MB; static constexpr size_t kDefaultMinFree = kDefaultMaxFree / 4; static constexpr size_t kDefaultLongPauseLogThreshold = MsToNs(5); @@ -194,7 +194,6 @@ class Heap { void CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void ThrowOutOfMemoryError(size_t byte_count, bool large_object_allocation); void RegisterNativeAllocation(JNIEnv* env, int bytes); void RegisterNativeFree(JNIEnv* env, int bytes); @@ -628,7 +627,7 @@ class Heap { size_t* usable_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void ThrowOutOfMemoryError(Thread* self, size_t byte_count, bool large_object_allocation) + void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); template <bool kGrow> diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc index 5123e4787d..456d1b31e2 100644 --- a/runtime/gc/space/dlmalloc_space.cc +++ b/runtime/gc/space/dlmalloc_space.cc @@ -304,6 +304,30 @@ void DlMallocSpace::CheckMoreCoreForPrecondition() { } #endif +static void MSpaceChunkCallback(void* start, void* end, size_t used_bytes, void* arg) { + size_t chunk_size = reinterpret_cast<uint8_t*>(end) - reinterpret_cast<uint8_t*>(start); + if (used_bytes < chunk_size) { + size_t chunk_free_bytes = chunk_size - used_bytes; + size_t& max_contiguous_allocation = *reinterpret_cast<size_t*>(arg); + max_contiguous_allocation = std::max(max_contiguous_allocation, chunk_free_bytes); + } +} + +void DlMallocSpace::LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) { + Thread* self = Thread::Current(); + size_t max_contiguous_allocation = 0; + // To allow the Walk/InspectAll() to exclusively-lock the mutator + // lock, temporarily release the shared access to the mutator + // lock here by transitioning to 
the suspended state. + Locks::mutator_lock_->AssertSharedHeld(self); + self->TransitionFromRunnableToSuspended(kSuspended); + Walk(MSpaceChunkCallback, &max_contiguous_allocation); + self->TransitionFromSuspendedToRunnable(); + Locks::mutator_lock_->AssertSharedHeld(self); + os << "; failed due to fragmentation (largest possible contiguous allocation " + << max_contiguous_allocation << " bytes)"; +} + } // namespace space } // namespace gc } // namespace art diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h index accd26bd21..7aff14b665 100644 --- a/runtime/gc/space/dlmalloc_space.h +++ b/runtime/gc/space/dlmalloc_space.h @@ -124,6 +124,9 @@ class DlMallocSpace : public MallocSpace { return this; } + void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) OVERRIDE + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + protected: DlMallocSpace(const std::string& name, MemMap* mem_map, void* mspace, byte* begin, byte* end, byte* limit, size_t growth_limit, bool can_move_objects, size_t starting_size, diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h index d24016cb18..6f49fbf203 100644 --- a/runtime/gc/space/malloc_space.h +++ b/runtime/gc/space/malloc_space.h @@ -19,6 +19,7 @@ #include "space.h" +#include <iostream> #include <valgrind.h> #include <memcheck/memcheck.h> @@ -132,6 +133,8 @@ class MallocSpace : public ContinuousMemMapAllocSpace { return can_move_objects_; } + virtual void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) = 0; + protected: MallocSpace(const std::string& name, MemMap* mem_map, byte* begin, byte* end, byte* limit, size_t growth_limit, bool create_bitmaps, bool can_move_objects, diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h index 2934af87c6..f50530576b 100644 --- a/runtime/gc/space/rosalloc_space.h +++ b/runtime/gc/space/rosalloc_space.h @@ -120,6 +120,10 @@ class RosAllocSpace : public MallocSpace { virtual ~RosAllocSpace(); + void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) OVERRIDE { + rosalloc_->LogFragmentationAllocFailure(os, failed_alloc_bytes); + } + protected: RosAllocSpace(const std::string& name, MemMap* mem_map, allocator::RosAlloc* rosalloc, byte* begin, byte* end, byte* limit, size_t growth_limit, bool can_move_objects, diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc index f459b590eb..0e05b62dde 100644 --- a/runtime/instrumentation.cc +++ b/runtime/instrumentation.cc @@ -146,16 +146,13 @@ void Instrumentation::InstallStubsForMethod(mirror::ArtMethod* method) { // class, all its static methods code will be set to the instrumentation entry point. // For more details, see ClassLinker::FixupStaticTrampolines. if (is_class_initialized || !method->IsStatic() || method->IsConstructor()) { - // Do not overwrite interpreter to prevent from posting method entry/exit events twice. - new_portable_code = class_linker->GetPortableOatCodeFor(method, &have_portable_code); - new_quick_code = class_linker->GetQuickOatCodeFor(method); - DCHECK(new_quick_code != GetQuickToInterpreterBridgeTrampoline(class_linker)); - if (entry_exit_stubs_installed_ && new_quick_code != GetQuickToInterpreterBridge()) { - // TODO: portable to quick bridge. Bug: 8196384. We cannot enable the check below as long - // as GetPortableToQuickBridge() == GetPortableToInterpreterBridge(). 
- // DCHECK(new_portable_code != GetPortableToInterpreterBridge()); + if (entry_exit_stubs_installed_) { new_portable_code = GetPortableToInterpreterBridge(); new_quick_code = GetQuickInstrumentationEntryPoint(); + } else { + new_portable_code = class_linker->GetPortableOatCodeFor(method, &have_portable_code); + new_quick_code = class_linker->GetQuickOatCodeFor(method); + DCHECK(new_quick_code != GetQuickToInterpreterBridgeTrampoline(class_linker)); } } else { new_portable_code = GetPortableResolutionTrampoline(class_linker); @@ -175,7 +172,6 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) struct InstallStackVisitor : public StackVisitor { InstallStackVisitor(Thread* thread, Context* context, uintptr_t instrumentation_exit_pc) : StackVisitor(thread, context), instrumentation_stack_(thread->GetInstrumentationStack()), - existing_instrumentation_frames_count_(instrumentation_stack_->size()), instrumentation_exit_pc_(instrumentation_exit_pc), reached_existing_instrumentation_frames_(false), instrumentation_stack_depth_(0), last_return_pc_(0) { @@ -190,18 +186,10 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) last_return_pc_ = 0; return true; // Ignore upcalls. } - if (m->IsRuntimeMethod()) { - if (kVerboseInstrumentation) { - LOG(INFO) << " Skipping runtime method. Frame " << GetFrameId(); - } - last_return_pc_ = GetReturnPc(); - return true; // Ignore unresolved methods since they will be instrumented after resolution. - } - if (kVerboseInstrumentation) { - LOG(INFO) << " Installing exit stub in " << DescribeLocation(); - } if (GetCurrentQuickFrame() == NULL) { - InstrumentationStackFrame instrumentation_frame(GetThisObject(), m, 0, GetFrameId(), false); + bool interpreter_frame = !m->IsPortableCompiled(); + InstrumentationStackFrame instrumentation_frame(GetThisObject(), m, 0, GetFrameId(), + interpreter_frame); if (kVerboseInstrumentation) { LOG(INFO) << "Pushing shadow frame " << instrumentation_frame.Dump(); } @@ -209,6 +197,32 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) return true; // Continue. } uintptr_t return_pc = GetReturnPc(); + if (m->IsRuntimeMethod()) { + if (return_pc == instrumentation_exit_pc_) { + if (kVerboseInstrumentation) { + LOG(INFO) << " Handling quick to interpreter transition. Frame " << GetFrameId(); + } + CHECK_LT(instrumentation_stack_depth_, instrumentation_stack_->size()); + const InstrumentationStackFrame& frame = instrumentation_stack_->at(instrumentation_stack_depth_); + CHECK(frame.interpreter_entry_); + // This is an interpreter frame so method enter event must have been reported. However we + // need to push a DEX pc into the dex_pcs_ list to match size of instrumentation stack. + // Since we won't report method entry here, we can safely push any DEX pc. + dex_pcs_.push_back(0); + last_return_pc_ = frame.return_pc_; + ++instrumentation_stack_depth_; + return true; + } else { + if (kVerboseInstrumentation) { + LOG(INFO) << " Skipping runtime method. Frame " << GetFrameId(); + } + last_return_pc_ = GetReturnPc(); + return true; // Ignore unresolved methods since they will be instrumented after resolution. + } + } + if (kVerboseInstrumentation) { + LOG(INFO) << " Installing exit stub in " << DescribeLocation(); + } if (return_pc == instrumentation_exit_pc_) { // We've reached a frame which has already been installed with instrumentation exit stub. // We should have already installed instrumentation on previous frames. 
@@ -231,8 +245,15 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) LOG(INFO) << "Pushing frame " << instrumentation_frame.Dump(); } - // Insert frame before old ones so we do not corrupt the instrumentation stack. - auto it = instrumentation_stack_->end() - existing_instrumentation_frames_count_; + // Insert frame at the right position so we do not corrupt the instrumentation stack. + // Instrumentation stack frames are in descending frame id order. + auto it = instrumentation_stack_->begin(); + for (auto end = instrumentation_stack_->end(); it != end; ++it) { + const InstrumentationStackFrame& current = *it; + if (instrumentation_frame.frame_id_ >= current.frame_id_) { + break; + } + } instrumentation_stack_->insert(it, instrumentation_frame); SetReturnPc(instrumentation_exit_pc_); } @@ -243,7 +264,6 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) } std::deque<InstrumentationStackFrame>* const instrumentation_stack_; std::vector<InstrumentationStackFrame> shadow_stack_; - const size_t existing_instrumentation_frames_count_; std::vector<uint32_t> dex_pcs_; const uintptr_t instrumentation_exit_pc_; bool reached_existing_instrumentation_frames_; @@ -275,7 +295,9 @@ static void InstrumentationInstallStack(Thread* thread, void* arg) } uint32_t dex_pc = visitor.dex_pcs_.back(); visitor.dex_pcs_.pop_back(); - instrumentation->MethodEnterEvent(thread, (*isi).this_object_, (*isi).method_, dex_pc); + if (!isi->interpreter_entry_) { + instrumentation->MethodEnterEvent(thread, (*isi).this_object_, (*isi).method_, dex_pc); + } } } thread->VerifyStack(); @@ -606,7 +628,7 @@ void Instrumentation::Deoptimize(mirror::ArtMethod* method) { CHECK(!already_deoptimized) << "Method " << PrettyMethod(method) << " is already deoptimized"; if (!interpreter_stubs_installed_) { - UpdateEntrypoints(method, GetQuickToInterpreterBridge(), GetPortableToInterpreterBridge(), + UpdateEntrypoints(method, GetQuickInstrumentationEntryPoint(), GetPortableToInterpreterBridge(), false); // Install instrumentation exit stub and instrumentation frames. We may already have installed @@ -844,7 +866,9 @@ void Instrumentation::PushInstrumentationStackFrame(Thread* self, mirror::Object frame_id, interpreter_entry); stack->push_front(instrumentation_frame); - MethodEnterEvent(self, this_object, method, 0); + if (!interpreter_entry) { + MethodEnterEvent(self, this_object, method, 0); + } } TwoWordReturn Instrumentation::PopInstrumentationStackFrame(Thread* self, uintptr_t* return_pc, @@ -875,7 +899,9 @@ TwoWordReturn Instrumentation::PopInstrumentationStackFrame(Thread* self, uintpt // return_pc. uint32_t dex_pc = DexFile::kDexNoIndex; mirror::Object* this_object = instrumentation_frame.this_object_; - MethodExitEvent(self, this_object, instrumentation_frame.method_, dex_pc, return_value); + if (!instrumentation_frame.interpreter_entry_) { + MethodExitEvent(self, this_object, instrumentation_frame.method_, dex_pc, return_value); + } // Deoptimize if the caller needs to continue execution in the interpreter. Do nothing if we get // back to an upcall. 
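
The instrumentation.cc hunk above stops inserting new frames at a pre-counted offset from the end of the deque and instead walks it, keeping entries in descending frame-id order. A minimal standalone sketch of that ordering policy (an illustrative struct stands in for InstrumentationStackFrame):

#include <cstdint>
#include <deque>

struct FrameSketch {        // Stand-in for InstrumentationStackFrame; only the id matters here.
  uint64_t frame_id;
};

// Frames are kept in descending frame_id order, so a new frame goes in front of
// the first existing entry whose id is <= its own (or at the end of the deque).
void InsertByDescendingFrameId(std::deque<FrameSketch>* stack, const FrameSketch& frame) {
  auto it = stack->begin();
  for (auto end = stack->end(); it != end; ++it) {
    if (frame.frame_id >= it->frame_id) {
      break;
    }
  }
  stack->insert(it, frame);
}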
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc index 9cc144149b..2db62f8ead 100644 --- a/runtime/interpreter/interpreter.cc +++ b/runtime/interpreter/interpreter.cc @@ -95,9 +95,11 @@ static void UnstartedRuntimeJni(Thread* self, ArtMethod* method, jint newValue = args[4]; bool success; if (Runtime::Current()->IsActiveTransaction()) { - success = obj->CasField32<true>(MemberOffset(offset), expectedValue, newValue); + success = obj->CasFieldWeakSequentiallyConsistent32<true>(MemberOffset(offset), + expectedValue, newValue); } else { - success = obj->CasField32<false>(MemberOffset(offset), expectedValue, newValue); + success = obj->CasFieldWeakSequentiallyConsistent32<false>(MemberOffset(offset), + expectedValue, newValue); } result->SetZ(success ? JNI_TRUE : JNI_FALSE); } else if (name == "void sun.misc.Unsafe.putObject(java.lang.Object, long, java.lang.Object)") { diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h index 62c1162b13..089ef57310 100644 --- a/runtime/mirror/object-inl.h +++ b/runtime/mirror/object-inl.h @@ -69,10 +69,10 @@ inline void Object::SetLockWord(LockWord new_val, bool as_volatile) { } } -inline bool Object::CasLockWord(LockWord old_val, LockWord new_val) { +inline bool Object::CasLockWordWeakSequentiallyConsistent(LockWord old_val, LockWord new_val) { // Force use of non-transactional mode and do not check. - return CasField32<false, false>(OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), - new_val.GetValue()); + return CasFieldWeakSequentiallyConsistent32<false, false>( + OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue()); } inline uint32_t Object::GetLockOwnerThreadId() { @@ -131,21 +131,17 @@ inline bool Object::AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* DCHECK(kUseBakerOrBrooksReadBarrier); MemberOffset offset = OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_); byte* raw_addr = reinterpret_cast<byte*>(this) + offset.SizeValue(); - HeapReference<Object>* ref = reinterpret_cast<HeapReference<Object>*>(raw_addr); + Atomic<uint32_t>* atomic_rb_ptr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr); HeapReference<Object> expected_ref(HeapReference<Object>::FromMirrorPtr(expected_rb_ptr)); HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(rb_ptr)); - uint32_t expected_val = expected_ref.reference_; - uint32_t new_val; do { - uint32_t old_val = ref->reference_; - if (old_val != expected_val) { + if (UNLIKELY(atomic_rb_ptr->LoadRelaxed() != expected_ref.reference_)) { // Lost the race. 
return false; } - new_val = new_ref.reference_; - } while (!__sync_bool_compare_and_swap( - reinterpret_cast<uint32_t*>(raw_addr), expected_val, new_val)); - DCHECK_EQ(new_val, ref->reference_); + } while (!atomic_rb_ptr->CompareExchangeWeakSequentiallyConsistent(expected_ref.reference_, + new_ref.reference_)); + DCHECK_EQ(new_ref.reference_, atomic_rb_ptr->LoadRelaxed()); return true; #else LOG(FATAL) << "Unreachable"; @@ -448,7 +444,8 @@ inline void Object::SetField32Volatile(MemberOffset field_offset, int32_t new_va } template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags> -inline bool Object::CasField32(MemberOffset field_offset, int32_t old_value, int32_t new_value) { +inline bool Object::CasFieldWeakSequentiallyConsistent32(MemberOffset field_offset, + int32_t old_value, int32_t new_value) { if (kCheckTransaction) { DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction()); } @@ -459,9 +456,9 @@ inline bool Object::CasField32(MemberOffset field_offset, int32_t old_value, int VerifyObject(this); } byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value(); - volatile int32_t* addr = reinterpret_cast<volatile int32_t*>(raw_addr); + AtomicInteger* atomic_addr = reinterpret_cast<AtomicInteger*>(raw_addr); - return __sync_bool_compare_and_swap(addr, old_value, new_value); + return atomic_addr->CompareExchangeWeakSequentiallyConsistent(old_value, new_value); } template<VerifyObjectFlags kVerifyFlags, bool kIsVolatile> @@ -513,7 +510,8 @@ inline void Object::SetField64Volatile(MemberOffset field_offset, int64_t new_va } template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags> -inline bool Object::CasField64(MemberOffset field_offset, int64_t old_value, int64_t new_value) { +inline bool Object::CasFieldWeakSequentiallyConsistent64(MemberOffset field_offset, + int64_t old_value, int64_t new_value) { if (kCheckTransaction) { DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction()); } @@ -524,8 +522,8 @@ inline bool Object::CasField64(MemberOffset field_offset, int64_t old_value, int VerifyObject(this); } byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value(); - volatile int64_t* addr = reinterpret_cast<volatile int64_t*>(raw_addr); - return QuasiAtomic::Cas64(old_value, new_value, addr); + Atomic<int64_t>* atomic_addr = reinterpret_cast<Atomic<int64_t>*>(raw_addr); + return atomic_addr->CompareExchangeWeakSequentiallyConsistent(old_value, new_value); } template<class T, VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption, @@ -615,8 +613,8 @@ inline HeapReference<Object>* Object::GetFieldObjectReferenceAddr(MemberOffset f } template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags> -inline bool Object::CasFieldObject(MemberOffset field_offset, Object* old_value, - Object* new_value) { +inline bool Object::CasFieldWeakSequentiallyConsistentObject(MemberOffset field_offset, + Object* old_value, Object* new_value) { if (kCheckTransaction) { DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction()); } @@ -632,11 +630,14 @@ inline bool Object::CasFieldObject(MemberOffset field_offset, Object* old_value, if (kTransactionActive) { Runtime::Current()->RecordWriteFieldReference(this, field_offset, old_value, true); } - byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value(); - volatile int32_t* addr = reinterpret_cast<volatile int32_t*>(raw_addr); HeapReference<Object> 
old_ref(HeapReference<Object>::FromMirrorPtr(old_value)); HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(new_value)); - bool success = __sync_bool_compare_and_swap(addr, old_ref.reference_, new_ref.reference_); + byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value(); + Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr); + + bool success = atomic_addr->CompareExchangeWeakSequentiallyConsistent(old_ref.reference_, + new_ref.reference_); + if (success) { Runtime::Current()->GetHeap()->WriteBarrierField(this, field_offset, new_value); } diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc index 422a88b688..e58091fe09 100644 --- a/runtime/mirror/object.cc +++ b/runtime/mirror/object.cc @@ -156,7 +156,7 @@ int32_t Object::IdentityHashCode() const { // loop iteration. LockWord hash_word(LockWord::FromHashCode(GenerateIdentityHashCode())); DCHECK_EQ(hash_word.GetState(), LockWord::kHashCode); - if (const_cast<Object*>(this)->CasLockWord(lw, hash_word)) { + if (const_cast<Object*>(this)->CasLockWordWeakSequentiallyConsistent(lw, hash_word)) { return hash_word.GetHashCode(); } break; diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h index c082443ad9..d29011a4b5 100644 --- a/runtime/mirror/object.h +++ b/runtime/mirror/object.h @@ -110,7 +110,8 @@ class MANAGED LOCKABLE Object { // have C++11 "strong" semantics. // TODO: In most, possibly all, cases, these assumptions are too strong. // Confirm and weaken the implementation. - bool CasLockWord(LockWord old_val, LockWord new_val) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + bool CasLockWordWeakSequentiallyConsistent(LockWord old_val, LockWord new_val) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); uint32_t GetLockOwnerThreadId(); mirror::Object* MonitorEnter(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) @@ -226,7 +227,8 @@ class MANAGED LOCKABLE Object { template<bool kTransactionActive, bool kCheckTransaction = true, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> - bool CasFieldObject(MemberOffset field_offset, Object* old_value, Object* new_value) + bool CasFieldWeakSequentiallyConsistentObject(MemberOffset field_offset, Object* old_value, + Object* new_value) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> @@ -252,7 +254,8 @@ class MANAGED LOCKABLE Object { template<bool kTransactionActive, bool kCheckTransaction = true, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> - bool CasField32(MemberOffset field_offset, int32_t old_value, int32_t new_value) ALWAYS_INLINE + bool CasFieldWeakSequentiallyConsistent32(MemberOffset field_offset, int32_t old_value, + int32_t new_value) ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags, bool kIsVolatile = false> @@ -275,7 +278,8 @@ class MANAGED LOCKABLE Object { template<bool kTransactionActive, bool kCheckTransaction = true, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> - bool CasField64(MemberOffset field_offset, int64_t old_value, int64_t new_value) + bool CasFieldWeakSequentiallyConsistent64(MemberOffset field_offset, int64_t old_value, + int64_t new_value) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); template<bool kTransactionActive, bool kCheckTransaction = true, diff --git a/runtime/monitor.cc b/runtime/monitor.cc index c3ec38d1d3..5633a77b6f 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -163,7 +163,7 @@ bool Monitor::Install(Thread* 
self) { } LockWord fat(this); // Publish the updated lock word, which may race with other threads. - bool success = GetObject()->CasLockWord(lw, fat); + bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat); // Lock profiling. if (success && owner_ != nullptr && lock_profiling_threshold_ != 0) { locking_method_ = owner_->GetCurrentMethod(&locking_dex_pc_); @@ -722,7 +722,7 @@ mirror::Object* Monitor::MonitorEnter(Thread* self, mirror::Object* obj) { switch (lock_word.GetState()) { case LockWord::kUnlocked: { LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0)); - if (h_obj->CasLockWord(lock_word, thin_locked)) { + if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) { // CasLockWord enforces more than the acquire ordering we need here. return h_obj.Get(); // Success! } diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc index d23cfff0ce..7cc4cac83f 100644 --- a/runtime/native/sun_misc_Unsafe.cc +++ b/runtime/native/sun_misc_Unsafe.cc @@ -28,7 +28,8 @@ static jboolean Unsafe_compareAndSwapInt(JNIEnv* env, jobject, jobject javaObj, ScopedFastNativeObjectAccess soa(env); mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj); // JNI must use non transactional mode. - bool success = obj->CasField32<false>(MemberOffset(offset), expectedValue, newValue); + bool success = obj->CasFieldWeakSequentiallyConsistent32<false>(MemberOffset(offset), + expectedValue, newValue); return success ? JNI_TRUE : JNI_FALSE; } @@ -37,7 +38,8 @@ static jboolean Unsafe_compareAndSwapLong(JNIEnv* env, jobject, jobject javaObj, ScopedFastNativeObjectAccess soa(env); mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj); // JNI must use non transactional mode. - bool success = obj->CasField64<false>(MemberOffset(offset), expectedValue, newValue); + bool success = obj->CasFieldWeakSequentiallyConsistent64<false>(MemberOffset(offset), + expectedValue, newValue); return success ? JNI_TRUE : JNI_FALSE; } @@ -48,7 +50,8 @@ static jboolean Unsafe_compareAndSwapObject(JNIEnv* env, jobject, jobject javaOb mirror::Object* expectedValue = soa.Decode<mirror::Object*>(javaExpectedValue); mirror::Object* newValue = soa.Decode<mirror::Object*>(javaNewValue); // JNI must use non transactional mode. - bool success = obj->CasFieldObject<false>(MemberOffset(offset), expectedValue, newValue); + bool success = obj->CasFieldWeakSequentiallyConsistentObject<false>(MemberOffset(offset), + expectedValue, newValue); return success ? JNI_TRUE : JNI_FALSE; } |
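
For the x86-64 callee-save frames earlier in the diff, every literal grows by 4 * 8 = 32 bytes because XMM12-XMM15 are now spilled alongside the existing slots. A rough constexpr cross-check of the REFS_AND_ARGS layout follows; the split of the original 80 bytes into a Method* slot, padding, and the eight FP argument spills is an inference from the assembly above, not a constant taken from the tree.

#include <cstddef>

constexpr size_t kMethodAndPadding = 16;      // ArtMethod* slot + alignment padding (Fpr1Offset stays 16).
constexpr size_t kFpArgSpills      = 8 * 8;   // XMM0-XMM7 argument registers.
constexpr size_t kFpCalleeSaves    = 4 * 8;   // XMM12-XMM15, newly saved by this change.
constexpr size_t kGprSpills        = 11 * 8;  // Callee-save and argument GPRs pushed by the macro.
constexpr size_t kReturnAddress    = 8;       // Pushed implicitly by the caller.

constexpr size_t kGpr1Offset = kMethodAndPadding + kFpArgSpills + kFpCalleeSaves;
constexpr size_t kLrOffset   = kGpr1Offset + kGprSpills;
constexpr size_t kFrameSize  = kLrOffset + kReturnAddress;

static_assert(kGpr1Offset == 80 + 4 * 8, "matches kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset");
static_assert(kLrOffset   == 168 + 4 * 8, "matches kQuickCalleeSaveFrame_RefAndArgs_LrOffset");
static_assert(kFrameSize  == 11 * 8 + 4 * 8 + 80 + 8, "matches FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE");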
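
The allocator hunks above (RosAlloc::LogFragmentationAllocFailure, DlMallocSpace::LogFragmentationAllocFailure, and the reworked Heap::ThrowOutOfMemoryError) all answer the same question: did the request fail because no single free region is large enough? A condensed sketch of that check; the free-run scan, the page rounding, and the per-bracket page count are simplified stand-ins for the real bookkeeping.

#include <algorithm>
#include <cstddef>
#include <ostream>
#include <vector>

// free_runs: byte sizes of the free page runs; bracket_pages: pages a new run
// for this size bracket would need (RosAlloc's numOfPages[SizeToIndex(...)]).
void LogFragmentationSketch(std::ostream& os, size_t failed_alloc_bytes,
                            const std::vector<size_t>& free_runs,
                            size_t page_size, size_t large_threshold,
                            size_t bracket_pages) {
  size_t largest_free = 0;
  for (size_t run : free_runs) {
    largest_free = std::max(largest_free, run);
  }
  size_t required = failed_alloc_bytes > large_threshold
                        ? ((failed_alloc_bytes + page_size - 1) / page_size) * page_size
                        : bracket_pages * page_size;
  if (required > largest_free) {
    os << "; failed due to fragmentation (required contiguous free " << required
       << " bytes where largest contiguous free " << largest_free << " bytes)";
  }
}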
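
The CasField*/CasLockWord renames that close out the diff make the memory-ordering contract explicit: these are weak, sequentially consistent compare-and-swaps. With std::atomic standing in for art::Atomic, the sketch below shows the two calling styles the runtime now relies on: one-shot callers such as the sun.misc.Unsafe intrinsics simply report failure, while AtomicSetReadBarrierPointer-style callers retry until they either succeed or observe a genuinely different value.

#include <atomic>
#include <cstdint>

// One attempt: a spurious weak-CAS failure simply surfaces as false (JNI_FALSE).
bool CasOnce(std::atomic<int32_t>& field, int32_t expected, int32_t desired) {
  return field.compare_exchange_weak(expected, desired, std::memory_order_seq_cst);
}

// Retry spurious failures, but report a real race (another thread changed the field).
bool CasRetryingSpuriousFailures(std::atomic<uint32_t>& field, uint32_t expected,
                                 uint32_t desired) {
  uint32_t observed;
  do {
    if (field.load(std::memory_order_relaxed) != expected) {
      return false;  // Lost the race.
    }
    observed = expected;  // compare_exchange_weak rewrites its first argument on failure.
  } while (!field.compare_exchange_weak(observed, desired, std::memory_order_seq_cst));
  return true;
}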