81 files changed, 1582 insertions, 513 deletions
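The bulk of this diff teaches the ARM32 Quick backend to pass method arguments in VFP registers when kArm32QuickCodeUseSoftFloat is disabled: core arguments go to r1-r3, floats to s0-s15, and doubles to an even/odd s-register pair (a d-register), with anything left over spilled to the stack (see InToRegStorageArmMapper::GetNextReg in target_arm.cc below). The following standalone C++ sketch models that assignment in simplified form; ArmArgMapper and its string-based register names are illustrative only and do not appear in the patch, the real mapper hands out RegStorage values, and it still carries TODOs about returning true 64-bit core pairs and back-filling skipped single registers.

#include <cstdio>

// Simplified model of the argument-to-register mapping added in this patch.
// No back-filling of skipped s-registers is attempted, matching the patch's
// separate-counter approach rather than full AAPCS-VFP allocation.
struct ArmArgMapper {
  int next_core = 1;  // r1..r3 are argument registers; r0 holds the ArtMethod*.
  int next_s = 0;     // s0..s15 are the single-precision argument registers.

  // Writes a register name into out and returns true, or returns false when
  // the argument has to be passed on the stack.
  bool Next(bool is_fp, bool is_wide, char out[8]) {
    if (is_fp) {
      if (is_wide) {                 // Double: needs an even/odd s-pair, i.e. a d-register.
        next_s = (next_s + 1) & ~1;  // Round up to the next even s-register.
        if (next_s + 1 >= 16) return false;
        std::snprintf(out, 8, "d%d", next_s / 2);
        next_s += 2;
        return true;
      }
      if (next_s >= 16) return false;  // Float: next free single s-register.
      std::snprintf(out, 8, "s%d", next_s++);
      return true;
    }
    if (next_core > 3) return false;   // Core word: r1..r3, then stack.
    std::snprintf(out, 8, "r%d", next_core++);
    return true;
  }
};

int main() {
  ArmArgMapper m;
  char reg[8];
  const struct { const char* type; bool fp; bool wide; } args[] = {
      {"int", false, false}, {"float", true, false},
      {"double", true, true}, {"int", false, false}};
  for (const auto& a : args) {
    if (m.Next(a.fp, a.wide, reg)) {
      std::printf("%-6s -> %s\n", a.type, reg);
    } else {
      std::printf("%-6s -> stack\n", a.type);
    }
  }
  return 0;
}

For an argument list (int, float, double, int) this prints r1, s0, d1, r2; a long would simply consume two core slots word by word in this sketch, which mirrors the patch's current per-word handling of wide core arguments rather than a final design.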
diff --git a/build/Android.common_test.mk b/build/Android.common_test.mk index 3e76d91aed..ca718f1ed0 100644 --- a/build/Android.common_test.mk +++ b/build/Android.common_test.mk @@ -78,6 +78,9 @@ ART_TEST_JNI_FORCECOPY ?= $(ART_TEST_FULL) # Do you want run-tests with relocation disabled run? ART_TEST_RUN_TEST_NO_RELOCATE ?= $(ART_TEST_FULL) +# Do you want run-tests with prebuilding? +ART_TEST_RUN_TEST_PREBUILD ?= true + # Do you want run-tests with no prebuilding enabled run? ART_TEST_RUN_TEST_NO_PREBUILD ?= $(ART_TEST_FULL) @@ -96,6 +99,9 @@ ART_TEST_RUN_TEST_DEBUG ?= true # Do you want run-tests with libart.so? ART_TEST_RUN_TEST_NDEBUG ?= $(ART_TEST_FULL) +# Do you want run-tests with the host/target's second arch? +ART_TEST_RUN_TEST_2ND_ARCH ?= true + # Do you want failed tests to have their artifacts cleaned up? ART_TEST_RUN_TEST_ALWAYS_CLEAN ?= true diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc index 359d6af57d..7e19e15961 100644 --- a/compiler/common_compiler_test.cc +++ b/compiler/common_compiler_test.cc @@ -111,7 +111,7 @@ void CommonCompilerTest::MakeExecutable(const void* code_start, size_t code_leng #else // Only warn if not Intel as Intel doesn't have cache flush instructions. #if !defined(__i386__) && !defined(__x86_64__) - LOG(WARNING) << "UNIMPLEMENTED: cache flush"; + UNIMPLEMENTED(WARNING) << "cache flush"; #endif #endif } diff --git a/compiler/compiler.cc b/compiler/compiler.cc index fbfd8e6415..36213ca417 100644 --- a/compiler/compiler.cc +++ b/compiler/compiler.cc @@ -75,8 +75,8 @@ Compiler* Compiler::Create(CompilerDriver* driver, Compiler::Kind kind) { default: LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } - return nullptr; } } // namespace art diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h index 78da420339..beeb3adb72 100644 --- a/compiler/dex/compiler_enums.h +++ b/compiler/dex/compiler_enums.h @@ -60,6 +60,14 @@ enum SpecialTargetRegister { kFArg5, kFArg6, kFArg7, + kFArg8, + kFArg9, + kFArg10, + kFArg11, + kFArg12, + kFArg13, + kFArg14, + kFArg15, kRet0, kRet1, kInvokeTgt, @@ -306,6 +314,7 @@ enum MIROptimizationFlagPositions { kMIRIgnoreRangeCheck, kMIRRangeCheckOnly, kMIRIgnoreClInitCheck, + kMirIgnoreDivZeroCheck, kMIRInlined, // Invoke is inlined (ie dead). kMIRInlinedPred, // Invoke is inlined via prediction. kMIRCallee, // Instruction is inlined from callee. diff --git a/compiler/dex/global_value_numbering.cc b/compiler/dex/global_value_numbering.cc index f0f7a7051b..d311bc76ff 100644 --- a/compiler/dex/global_value_numbering.cc +++ b/compiler/dex/global_value_numbering.cc @@ -70,12 +70,7 @@ LocalValueNumbering* GlobalValueNumbering::PrepareBasicBlock(BasicBlock* bb, DCHECK(work_lvn_.get() == nullptr); work_lvn_.reset(new (allocator) LocalValueNumbering(this, bb->id, allocator)); if (bb->block_type == kEntryBlock) { - if ((cu_->access_flags & kAccStatic) == 0) { - // If non-static method, mark "this" as non-null - int this_reg = cu_->mir_graph->GetFirstInVR(); - uint16_t value_name = work_lvn_->GetSRegValueName(this_reg); - work_lvn_->SetValueNameNullChecked(value_name); - } + work_lvn_->PrepareEntryBlock(); DCHECK(bb->first_mir_insn == nullptr); // modifications_allowed_ is irrelevant. } else { // To avoid repeated allocation on the ArenaStack, reuse a single vector kept as a member. 
@@ -127,12 +122,6 @@ LocalValueNumbering* GlobalValueNumbering::PrepareBasicBlock(BasicBlock* bb, CHECK(!merge_lvns_.empty()); if (merge_lvns_.size() == 1u) { work_lvn_->MergeOne(*merge_lvns_[0], merge_type); - BasicBlock* pred_bb = mir_graph_->GetBasicBlock(merge_lvns_[0]->Id()); - if (HasNullCheckLastInsn(pred_bb, bb->id)) { - int s_reg = pred_bb->last_mir_insn->ssa_rep->uses[0]; - uint16_t value_name = merge_lvns_[0]->GetSRegValueName(s_reg); - work_lvn_->SetValueNameNullChecked(value_name); - } } else { work_lvn_->Merge(merge_type); } diff --git a/compiler/dex/global_value_numbering.h b/compiler/dex/global_value_numbering.h index df554cdade..a4a7602c4b 100644 --- a/compiler/dex/global_value_numbering.h +++ b/compiler/dex/global_value_numbering.h @@ -105,6 +105,19 @@ class GlobalValueNumbering { return res; } + // Look up a value in the global value map, don't add a new entry if there was none before. + uint16_t FindValue(uint16_t op, uint16_t operand1, uint16_t operand2, uint16_t modifier) { + uint16_t res; + uint64_t key = BuildKey(op, operand1, operand2, modifier); + ValueMap::iterator lb = global_value_map_.lower_bound(key); + if (lb != global_value_map_.end() && lb->first == key) { + res = lb->second; + } else { + res = kNoValue; + } + return res; + } + // Check if the exact value is stored in the global value map. bool HasValue(uint16_t op, uint16_t operand1, uint16_t operand2, uint16_t modifier, uint16_t value) const { @@ -253,6 +266,7 @@ class GlobalValueNumbering { ScopedArenaVector<const LocalValueNumbering*> merge_lvns_; // Not owning. friend class LocalValueNumbering; + friend class GlobalValueNumberingTest; DISALLOW_COPY_AND_ASSIGN(GlobalValueNumbering); }; diff --git a/compiler/dex/global_value_numbering_test.cc b/compiler/dex/global_value_numbering_test.cc index 82a11a55b3..d1bca291b1 100644 --- a/compiler/dex/global_value_numbering_test.cc +++ b/compiler/dex/global_value_numbering_test.cc @@ -25,6 +25,8 @@ namespace art { class GlobalValueNumberingTest : public testing::Test { protected: + static constexpr uint16_t kNoValue = GlobalValueNumbering::kNoValue; + struct IFieldDef { uint16_t field_idx; uintptr_t declaring_dex_file; @@ -125,6 +127,8 @@ class GlobalValueNumberingTest : public testing::Test { { bb, opcode, 0u, 0u, 1, { reg }, 0, { } } #define DEF_MOVE(bb, opcode, reg, src) \ { bb, opcode, 0u, 0u, 1, { src }, 1, { reg } } +#define DEF_MOVE_WIDE(bb, opcode, reg, src) \ + { bb, opcode, 0u, 0u, 2, { src, src + 1 }, 2, { reg, reg + 1 } } #define DEF_PHI2(bb, reg, src1, src2) \ { bb, static_cast<Instruction::Code>(kMirOpPhi), 0, 0u, 2u, { src1, src2 }, 1, { reg } } @@ -341,6 +345,8 @@ class GlobalValueNumberingTest : public testing::Test { cu_.mir_graph->ssa_base_vregs_.push_back(i); cu_.mir_graph->ssa_subscripts_.push_back(0); } + // Set shorty for a void-returning method without arguments. 
+ cu_.shorty = "V"; } static constexpr size_t kMaxSsaRegs = 16384u; @@ -356,6 +362,8 @@ class GlobalValueNumberingTest : public testing::Test { ArenaBitVector* live_in_v_; }; +constexpr uint16_t GlobalValueNumberingTest::kNoValue; + class GlobalValueNumberingTestDiamond : public GlobalValueNumberingTest { public: GlobalValueNumberingTestDiamond(); @@ -981,6 +989,92 @@ TEST_F(GlobalValueNumberingTestDiamond, Phi) { EXPECT_EQ(value_names_[18], value_names_[21]); } +TEST_F(GlobalValueNumberingTestDiamond, PhiWide) { + static const MIRDef mirs[] = { + DEF_CONST_WIDE(3, Instruction::CONST_WIDE, 0u, 1000), + DEF_CONST_WIDE(4, Instruction::CONST_WIDE, 2u, 2000), + DEF_CONST_WIDE(5, Instruction::CONST_WIDE, 4u, 3000), + DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 6u, 0u), + DEF_MOVE_WIDE(4, Instruction::MOVE_WIDE, 8u, 2u), + DEF_MOVE_WIDE(5, Instruction::MOVE_WIDE, 10u, 0u), + DEF_MOVE_WIDE(5, Instruction::MOVE_WIDE, 12u, 4u), + DEF_PHI2(6, 14u, 6u, 10u), // Same as CONST_WIDE 0u (1000). + DEF_PHI2(6, 15u, 7u, 11u), // Same as CONST_WIDE 0u (1000), high word. + DEF_PHI2(6, 16u, 6u, 0u), // Same as CONST_WIDE 0u (1000). + DEF_PHI2(6, 17u, 7u, 1u), // Same as CONST_WIDE 0u (1000), high word. + DEF_PHI2(6, 18u, 0u, 10u), // Same as CONST_WIDE 0u (1000). + DEF_PHI2(6, 19u, 1u, 11u), // Same as CONST_WIDE 0u (1000), high word. + DEF_PHI2(6, 20u, 8u, 10u), // Merge 2u (2000) and 0u (1000). + DEF_PHI2(6, 21u, 9u, 11u), // Merge 2u (2000) and 0u (1000), high word. + DEF_PHI2(6, 22u, 2u, 10u), // Merge 2u (2000) and 0u (1000). + DEF_PHI2(6, 23u, 3u, 11u), // Merge 2u (2000) and 0u (1000), high word. + DEF_PHI2(6, 24u, 8u, 0u), // Merge 2u (2000) and 0u (1000). + DEF_PHI2(6, 25u, 9u, 1u), // Merge 2u (2000) and 0u (1000), high word. + DEF_PHI2(6, 26u, 2u, 0u), // Merge 2u (2000) and 0u (1000). + DEF_PHI2(6, 27u, 5u, 1u), // Merge 2u (2000) and 0u (1000), high word. + DEF_PHI2(6, 28u, 6u, 12u), // Merge 0u (1000) and 4u (3000). + DEF_PHI2(6, 29u, 7u, 13u), // Merge 0u (1000) and 4u (3000), high word. + DEF_PHI2(6, 30u, 0u, 12u), // Merge 0u (1000) and 4u (3000). + DEF_PHI2(6, 31u, 1u, 13u), // Merge 0u (1000) and 4u (3000), high word. + DEF_PHI2(6, 32u, 6u, 4u), // Merge 0u (1000) and 4u (3000). + DEF_PHI2(6, 33u, 7u, 5u), // Merge 0u (1000) and 4u (3000), high word. + DEF_PHI2(6, 34u, 0u, 4u), // Merge 0u (1000) and 4u (3000). + DEF_PHI2(6, 35u, 1u, 5u), // Merge 0u (1000) and 4u (3000), high word. + DEF_PHI2(6, 36u, 8u, 12u), // Merge 2u (2000) and 4u (3000). + DEF_PHI2(6, 37u, 9u, 13u), // Merge 2u (2000) and 4u (3000), high word. + DEF_PHI2(6, 38u, 2u, 12u), // Merge 2u (2000) and 4u (3000). + DEF_PHI2(6, 39u, 3u, 13u), // Merge 2u (2000) and 4u (3000), high word. + DEF_PHI2(6, 40u, 8u, 4u), // Merge 2u (2000) and 4u (3000). + DEF_PHI2(6, 41u, 9u, 5u), // Merge 2u (2000) and 4u (3000), high word. + DEF_PHI2(6, 42u, 2u, 4u), // Merge 2u (2000) and 4u (3000). + DEF_PHI2(6, 43u, 3u, 5u), // Merge 2u (2000) and 4u (3000), high word. 
+ }; + + PrepareMIRs(mirs); + PerformGVN(); + ASSERT_EQ(arraysize(mirs), value_names_.size()); + EXPECT_EQ(value_names_[0], value_names_[7]); + EXPECT_EQ(value_names_[0], value_names_[9]); + EXPECT_EQ(value_names_[0], value_names_[11]); + EXPECT_NE(value_names_[13], value_names_[0]); + EXPECT_NE(value_names_[13], value_names_[1]); + EXPECT_NE(value_names_[13], value_names_[2]); + EXPECT_EQ(value_names_[13], value_names_[15]); + EXPECT_EQ(value_names_[13], value_names_[17]); + EXPECT_EQ(value_names_[13], value_names_[19]); + EXPECT_NE(value_names_[21], value_names_[0]); + EXPECT_NE(value_names_[21], value_names_[1]); + EXPECT_NE(value_names_[21], value_names_[2]); + EXPECT_NE(value_names_[21], value_names_[13]); + EXPECT_EQ(value_names_[21], value_names_[23]); + EXPECT_EQ(value_names_[21], value_names_[25]); + EXPECT_EQ(value_names_[21], value_names_[27]); + EXPECT_NE(value_names_[29], value_names_[0]); + EXPECT_NE(value_names_[29], value_names_[1]); + EXPECT_NE(value_names_[29], value_names_[2]); + EXPECT_NE(value_names_[29], value_names_[13]); + EXPECT_NE(value_names_[29], value_names_[21]); + EXPECT_EQ(value_names_[29], value_names_[31]); + EXPECT_EQ(value_names_[29], value_names_[33]); + EXPECT_EQ(value_names_[29], value_names_[35]); + // High words should get kNoValue. + EXPECT_EQ(value_names_[8], kNoValue); + EXPECT_EQ(value_names_[10], kNoValue); + EXPECT_EQ(value_names_[12], kNoValue); + EXPECT_EQ(value_names_[14], kNoValue); + EXPECT_EQ(value_names_[16], kNoValue); + EXPECT_EQ(value_names_[18], kNoValue); + EXPECT_EQ(value_names_[20], kNoValue); + EXPECT_EQ(value_names_[22], kNoValue); + EXPECT_EQ(value_names_[24], kNoValue); + EXPECT_EQ(value_names_[26], kNoValue); + EXPECT_EQ(value_names_[28], kNoValue); + EXPECT_EQ(value_names_[30], kNoValue); + EXPECT_EQ(value_names_[32], kNoValue); + EXPECT_EQ(value_names_[34], kNoValue); + EXPECT_EQ(value_names_[36], kNoValue); +} + TEST_F(GlobalValueNumberingTestLoop, NonAliasingIFields) { static const IFieldDef ifields[] = { { 0u, 1u, 0u, false }, // Int. diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc index 8b7ae20b95..5456f4d0a1 100644 --- a/compiler/dex/local_value_numbering.cc +++ b/compiler/dex/local_value_numbering.cc @@ -374,6 +374,12 @@ void LocalValueNumbering::MergeOne(const LocalValueNumbering& other, MergeType m range_checked_ = other.range_checked_; null_checked_ = other.null_checked_; + const BasicBlock* pred_bb = gvn_->GetBasicBlock(other.Id()); + if (GlobalValueNumbering::HasNullCheckLastInsn(pred_bb, Id())) { + int s_reg = pred_bb->last_mir_insn->ssa_rep->uses[0]; + null_checked_.insert(other.GetOperandValue(s_reg)); + } + if (merge_type == kCatchMerge) { // Memory is clobbered. Use new memory version and don't merge aliasing locations. global_memory_version_ = NewMemoryVersion(&merge_new_memory_version_); @@ -465,10 +471,7 @@ void LocalValueNumbering::PruneNonAliasingRefsForCatch() { DCHECK(mir != nullptr); // Only INVOKEs can leak and clobber non-aliasing references if they throw. 
if ((mir->dalvikInsn.FlagsOf() & Instruction::kInvoke) != 0) { - for (uint16_t i = 0u; i != mir->ssa_rep->num_uses; ++i) { - uint16_t value_name = lvn->GetOperandValue(mir->ssa_rep->uses[i]); - non_aliasing_refs_.erase(value_name); - } + HandleInvokeArgs(mir, lvn); } } } @@ -681,7 +684,7 @@ void LocalValueNumbering::MergeNullChecked() { const BasicBlock* least_entries_bb = gvn_->GetBasicBlock(least_entries_lvn->Id()); if (gvn_->HasNullCheckLastInsn(least_entries_bb, id_)) { int s_reg = least_entries_bb->last_mir_insn->ssa_rep->uses[0]; - uint32_t value_name = least_entries_lvn->GetSRegValueName(s_reg); + uint32_t value_name = least_entries_lvn->GetOperandValue(s_reg); merge_names_.clear(); merge_names_.resize(gvn_->merge_lvns_.size(), value_name); if (gvn_->NullCheckedInAllPredecessors(merge_names_)) { @@ -953,6 +956,26 @@ void LocalValueNumbering::Merge(MergeType merge_type) { AliasingArrayVersions>>(); } +void LocalValueNumbering::PrepareEntryBlock() { + uint32_t vreg = gvn_->GetMirGraph()->GetFirstInVR(); + CompilationUnit* cu = gvn_->GetCompilationUnit(); + const char* shorty = cu->shorty; + ++shorty; // Skip return value. + if ((cu->access_flags & kAccStatic) == 0) { + // If non-static method, mark "this" as non-null + uint16_t value_name = GetOperandValue(vreg); + ++vreg; + null_checked_.insert(value_name); + } + for ( ; *shorty != 0; ++shorty, ++vreg) { + if (*shorty == 'J' || *shorty == 'D') { + uint16_t value_name = GetOperandValueWide(vreg); + SetOperandValueWide(vreg, value_name); + ++vreg; + } + } +} + uint16_t LocalValueNumbering::MarkNonAliasingNonNull(MIR* mir) { uint16_t res = GetOperandValue(mir->ssa_rep->defs[0]); DCHECK(null_checked_.find(res) == null_checked_.end()); @@ -1039,12 +1062,30 @@ void LocalValueNumbering::HandleEscapingRef(uint16_t base) { } } +void LocalValueNumbering::HandleInvokeArgs(const MIR* mir, const LocalValueNumbering* mir_lvn) { + const int32_t* uses = mir->ssa_rep->uses; + const int32_t* uses_end = uses + mir->ssa_rep->num_uses; + while (uses != uses_end) { + uint16_t sreg = *uses; + ++uses; + // Avoid LookupValue() so that we don't store new values in the global value map. + auto local_it = mir_lvn->sreg_value_map_.find(sreg); + if (local_it != mir_lvn->sreg_value_map_.end()) { + non_aliasing_refs_.erase(local_it->second); + } else { + uint16_t value_name = gvn_->FindValue(kNoValue, sreg, kNoValue, kNoValue); + if (value_name != kNoValue) { + non_aliasing_refs_.erase(value_name); + } + } + } +} + uint16_t LocalValueNumbering::HandlePhi(MIR* mir) { if (gvn_->merge_lvns_.empty()) { // Running LVN without a full GVN? return kNoValue; } - int16_t num_uses = mir->ssa_rep->num_uses; int32_t* uses = mir->ssa_rep->uses; // Try to find out if this is merging wide regs. if (mir->ssa_rep->defs[0] != 0 && @@ -1052,18 +1093,20 @@ uint16_t LocalValueNumbering::HandlePhi(MIR* mir) { // This is the high part of a wide reg. Ignore the Phi. return kNoValue; } - bool wide = false; - for (int16_t i = 0; i != num_uses; ++i) { - if (sreg_wide_value_map_.count(uses[i]) != 0u) { - wide = true; - break; - } + BasicBlockId* incoming = mir->meta.phi_incoming; + int16_t pos = 0; + // Check if we're merging a wide value based on the first merged LVN. 
+ const LocalValueNumbering* first_lvn = gvn_->merge_lvns_[0]; + DCHECK_LT(pos, mir->ssa_rep->num_uses); + while (incoming[pos] != first_lvn->Id()) { + ++pos; + DCHECK_LT(pos, mir->ssa_rep->num_uses); } + int first_s_reg = uses[pos]; + bool wide = (first_lvn->sreg_wide_value_map_.count(first_s_reg) != 0u); // Iterate over *merge_lvns_ and skip incoming sregs for BBs without associated LVN. uint16_t value_name = kNoValue; merge_names_.clear(); - BasicBlockId* incoming = mir->meta.phi_incoming; - int16_t pos = 0; bool same_values = true; for (const LocalValueNumbering* lvn : gvn_->merge_lvns_) { DCHECK_LT(pos, mir->ssa_rep->num_uses); @@ -1468,10 +1511,7 @@ uint16_t LocalValueNumbering::GetValueNumber(MIR* mir) { case Instruction::INVOKE_STATIC: case Instruction::INVOKE_STATIC_RANGE: // Make ref args aliasing. - for (size_t i = 0u, count = mir->ssa_rep->num_uses; i != count; ++i) { - uint16_t reg = GetOperandValue(mir->ssa_rep->uses[i]); - non_aliasing_refs_.erase(reg); - } + HandleInvokeArgs(mir, this); HandleInvokeOrClInitOrAcquireOp(mir); break; diff --git a/compiler/dex/local_value_numbering.h b/compiler/dex/local_value_numbering.h index c60da32b95..dd8d2db8f4 100644 --- a/compiler/dex/local_value_numbering.h +++ b/compiler/dex/local_value_numbering.h @@ -44,14 +44,6 @@ class LocalValueNumbering { bool Equals(const LocalValueNumbering& other) const; - uint16_t GetSRegValueName(uint16_t s_reg) const { - return GetOperandValue(s_reg); - } - - void SetValueNameNullChecked(uint16_t value_name) { - null_checked_.insert(value_name); - } - bool IsValueNullChecked(uint16_t value_name) const { return null_checked_.find(value_name) != null_checked_.end(); } @@ -73,6 +65,7 @@ class LocalValueNumbering { void MergeOne(const LocalValueNumbering& other, MergeType merge_type); void Merge(MergeType merge_type); // Merge gvn_->merge_lvns_. 
+ void PrepareEntryBlock(); uint16_t GetValueNumber(MIR* mir); @@ -121,18 +114,22 @@ class LocalValueNumbering { } void SetOperandValue(uint16_t s_reg, uint16_t value) { + DCHECK_EQ(sreg_wide_value_map_.count(s_reg), 0u); SetOperandValueImpl(s_reg, value, &sreg_value_map_); } uint16_t GetOperandValue(int s_reg) const { + DCHECK_EQ(sreg_wide_value_map_.count(s_reg), 0u); return GetOperandValueImpl(s_reg, &sreg_value_map_); } void SetOperandValueWide(uint16_t s_reg, uint16_t value) { + DCHECK_EQ(sreg_value_map_.count(s_reg), 0u); SetOperandValueImpl(s_reg, value, &sreg_wide_value_map_); } uint16_t GetOperandValueWide(int s_reg) const { + DCHECK_EQ(sreg_value_map_.count(s_reg), 0u); return GetOperandValueImpl(s_reg, &sreg_wide_value_map_); } @@ -300,6 +297,7 @@ class LocalValueNumbering { void HandleRangeCheck(MIR* mir, uint16_t array, uint16_t index); void HandlePutObject(MIR* mir); void HandleEscapingRef(uint16_t base); + void HandleInvokeArgs(const MIR* mir, const LocalValueNumbering* mir_lvn); uint16_t HandlePhi(MIR* mir); uint16_t HandleAGet(MIR* mir, uint16_t opcode); void HandleAPut(MIR* mir, uint16_t opcode); diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h index a405af16a3..5c74e9e6bf 100644 --- a/compiler/dex/mir_graph.h +++ b/compiler/dex/mir_graph.h @@ -147,6 +147,7 @@ enum OatMethodAttributes { #define MIR_IGNORE_RANGE_CHECK (1 << kMIRIgnoreRangeCheck) #define MIR_RANGE_CHECK_ONLY (1 << kMIRRangeCheckOnly) #define MIR_IGNORE_CLINIT_CHECK (1 << kMIRIgnoreClInitCheck) +#define MIR_IGNORE_DIV_ZERO_CHECK (1 << kMirIgnoreDivZeroCheck) #define MIR_INLINED (1 << kMIRInlined) #define MIR_INLINED_PRED (1 << kMIRInlinedPred) #define MIR_CALLEE (1 << kMIRCallee) diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h index d935bc30c4..36cb7a4efc 100644 --- a/compiler/dex/quick/arm/arm_lir.h +++ b/compiler/dex/quick/arm/arm_lir.h @@ -297,19 +297,20 @@ constexpr RegStorage rs_dr30(RegStorage::kValid | dr30); constexpr RegStorage rs_dr31(RegStorage::kValid | dr31); #endif -// RegisterLocation templates return values (r0, or r0/r1). -const RegLocation arm_loc_c_return - {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, - RegStorage(RegStorage::k32BitSolo, r0), INVALID_SREG, INVALID_SREG}; -const RegLocation arm_loc_c_return_wide +// RegisterLocation templates return values (r0, r0/r1, s0, or d0). +// Note: The return locations are shared between quick code and quick helper. This follows quick +// ABI. Quick helper assembly routine needs to handle the ABI differences. +const RegLocation arm_loc_c_return = + {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, rs_r0, INVALID_SREG, INVALID_SREG}; +const RegLocation arm_loc_c_return_wide = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, - RegStorage(RegStorage::k64BitPair, r0, r1), INVALID_SREG, INVALID_SREG}; -const RegLocation arm_loc_c_return_float - {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, - RegStorage(RegStorage::k32BitSolo, r0), INVALID_SREG, INVALID_SREG}; -const RegLocation arm_loc_c_return_double - {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, - RegStorage(RegStorage::k64BitPair, r0, r1), INVALID_SREG, INVALID_SREG}; + RegStorage::MakeRegPair(rs_r0, rs_r1), INVALID_SREG, INVALID_SREG}; +const RegLocation arm_loc_c_return_float = kArm32QuickCodeUseSoftFloat + ? arm_loc_c_return + : RegLocation({kLocPhysReg, 0, 0, 0, 1, 0, 0, 0, 1, rs_fr0, INVALID_SREG, INVALID_SREG}); +const RegLocation arm_loc_c_return_double = kArm32QuickCodeUseSoftFloat + ? 
arm_loc_c_return_wide + : RegLocation({kLocPhysReg, 1, 0, 0, 1, 0, 0, 0, 1, rs_dr0, INVALID_SREG, INVALID_SREG}); enum ArmShiftEncodings { kArmLsl = 0x0, diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h index 1c87a0386c..442c4fcec6 100644 --- a/compiler/dex/quick/arm/codegen_arm.h +++ b/compiler/dex/quick/arm/codegen_arm.h @@ -25,6 +25,64 @@ namespace art { class ArmMir2Lir FINAL : public Mir2Lir { + protected: + // TODO: Consolidate hard float target support. + // InToRegStorageMapper and InToRegStorageMapping can be shared with all backends. + // Base class used to get RegStorage for next argument. + class InToRegStorageMapper { + public: + virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide) = 0; + virtual ~InToRegStorageMapper() { + } + }; + + // Inherited class for ARM backend. + class InToRegStorageArmMapper FINAL : public InToRegStorageMapper { + public: + InToRegStorageArmMapper() + : cur_core_reg_(0), cur_fp_reg_(0), cur_fp_double_reg_(0) { + } + + virtual ~InToRegStorageArmMapper() { + } + + RegStorage GetNextReg(bool is_double_or_float, bool is_wide) OVERRIDE; + + private: + uint32_t cur_core_reg_; + uint32_t cur_fp_reg_; + uint32_t cur_fp_double_reg_; + }; + + // Class to map argument to RegStorage. The mapping object is initialized by a mapper. + class InToRegStorageMapping FINAL { + public: + InToRegStorageMapping() + : max_mapped_in_(0), is_there_stack_mapped_(false), initialized_(false) { + } + + int GetMaxMappedIn() const { + return max_mapped_in_; + } + + bool IsThereStackMapped() const { + return is_there_stack_mapped_; + } + + bool IsInitialized() const { + return initialized_; + } + + void Initialize(RegLocation* arg_locs, int count, InToRegStorageMapper* mapper); + RegStorage Get(int in_position) const; + + private: + std::map<int, RegStorage> mapping_; + int max_mapped_in_; + bool is_there_stack_mapped_; + bool initialized_; + }; + public: ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena); @@ -47,15 +105,30 @@ class ArmMir2Lir FINAL : public Mir2Lir { void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg); // Required for target - register utilities. - RegStorage TargetReg(SpecialTargetRegister reg); - RegStorage GetArgMappingToPhysicalReg(int arg_num); - RegLocation GetReturnAlt(); - RegLocation GetReturnWideAlt(); - RegLocation LocCReturn(); - RegLocation LocCReturnRef(); - RegLocation LocCReturnDouble(); - RegLocation LocCReturnFloat(); - RegLocation LocCReturnWide(); + RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE; + RegStorage TargetReg(SpecialTargetRegister reg, WideKind wide_kind) OVERRIDE { + if (wide_kind == kWide) { + DCHECK((kArg0 <= reg && reg < kArg3) || (kFArg0 <= reg && reg < kFArg15) || (kRet0 == reg)); + RegStorage ret_reg = RegStorage::MakeRegPair(TargetReg(reg), + TargetReg(static_cast<SpecialTargetRegister>(reg + 1))); + if (ret_reg.IsFloat()) { + // Regard double as double, be consistent with register allocation. 
+ ret_reg = As64BitFloatReg(ret_reg); + } + return ret_reg; + } else { + return TargetReg(reg); + } + } + + RegStorage GetArgMappingToPhysicalReg(int arg_num) OVERRIDE; + RegLocation GetReturnAlt() OVERRIDE; + RegLocation GetReturnWideAlt() OVERRIDE; + RegLocation LocCReturn() OVERRIDE; + RegLocation LocCReturnRef() OVERRIDE; + RegLocation LocCReturnDouble() OVERRIDE; + RegLocation LocCReturnFloat() OVERRIDE; + RegLocation LocCReturnWide() OVERRIDE; ResourceMask GetRegMaskCommon(const RegStorage& reg) const OVERRIDE; void AdjustSpillMask(); void ClobberCallerSave(); @@ -87,15 +160,15 @@ class ArmMir2Lir FINAL : public Mir2Lir { // Required for target - Dalvik-level generators. void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2); + RegLocation rl_src1, RegLocation rl_src2, int flags); void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_dest, int scale); void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_src, int scale, bool card_mark); void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_shift); + RegLocation rl_src1, RegLocation rl_shift, int flags); void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2); void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, @@ -210,6 +283,19 @@ class ArmMir2Lir FINAL : public Mir2Lir { LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE; size_t GetInstructionOffset(LIR* lir); + int GenDalvikArgsNoRange(CallInfo* info, int call_state, LIR** pcrLabel, + NextCallInsn next_call_insn, + const MethodReference& target_method, + uint32_t vtable_idx, + uintptr_t direct_code, uintptr_t direct_method, InvokeType type, + bool skip_this) OVERRIDE; + int GenDalvikArgsRange(CallInfo* info, int call_state, LIR** pcrLabel, + NextCallInsn next_call_insn, + const MethodReference& target_method, + uint32_t vtable_idx, + uintptr_t direct_code, uintptr_t direct_method, InvokeType type, + bool skip_this) OVERRIDE; + private: void GenNegLong(RegLocation rl_dest, RegLocation rl_src); void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, @@ -224,12 +310,12 @@ class ArmMir2Lir FINAL : public Mir2Lir { void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir); void AssignDataOffsets(); RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, - bool is_div, bool check_zero); - RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div); - typedef struct { + bool is_div, int flags) OVERRIDE; + RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div) OVERRIDE; + struct EasyMultiplyOp { OpKind op; uint32_t shift; - } EasyMultiplyOp; + }; bool GetEasyMultiplyOp(int lit, EasyMultiplyOp* op); bool GetEasyMultiplyTwoOps(int lit, EasyMultiplyOp* ops); void GenEasyMultiplyTwoOps(RegStorage r_dest, RegStorage r_src, EasyMultiplyOp* ops); @@ -239,6 +325,36 @@ class ArmMir2Lir FINAL : public Mir2Lir { static constexpr ResourceMask EncodeArmRegFpcsList(int reg_list); ArenaVector<LIR*> call_method_insns_; + + /** + * @brief Given float register pair, returns 
Solo64 float register. + * @param reg #RegStorage containing a float register pair (e.g. @c s2 and @c s3). + * @return A Solo64 float mapping to the register pair (e.g. @c d1). + */ + static RegStorage As64BitFloatReg(RegStorage reg) { + DCHECK(reg.IsFloat()); + + RegStorage low = reg.GetLow(); + RegStorage high = reg.GetHigh(); + DCHECK((low.GetRegNum() % 2 == 0) && (low.GetRegNum() + 1 == high.GetRegNum())); + + return RegStorage::FloatSolo64(low.GetRegNum() / 2); + } + + /** + * @brief Given Solo64 float register, returns float register pair. + * @param reg #RegStorage containing a Solo64 float register (e.g. @c d1). + * @return A float register pair mapping to the Solo64 float pair (e.g. @c s2 and s3). + */ + static RegStorage As64BitFloatRegPair(RegStorage reg) { + DCHECK(reg.IsDouble() && reg.Is64BitSolo()); + + int reg_num = reg.GetRegNum(); + return RegStorage::MakeRegPair(RegStorage::FloatSolo32(reg_num * 2), + RegStorage::FloatSolo32(reg_num * 2 + 1)); + } + + InToRegStorageMapping in_to_reg_storage_mapping_; }; } // namespace art diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc index 018dc1c0c6..8e08f5fb9d 100644 --- a/compiler/dex/quick/arm/int_arm.cc +++ b/compiler/dex/quick/arm/int_arm.cc @@ -442,6 +442,15 @@ void ArmMir2Lir::OpRegCopyWide(RegStorage r_dest, RegStorage r_src) { bool src_fp = r_src.IsFloat(); DCHECK(r_dest.Is64Bit()); DCHECK(r_src.Is64Bit()); + // Note: If the register is get by register allocator, it should never be a pair. + // But some functions in mir_2_lir assume 64-bit registers are 32-bit register pairs. + // TODO: Rework Mir2Lir::LoadArg() and Mir2Lir::LoadArgDirect(). + if (dest_fp && r_dest.IsPair()) { + r_dest = As64BitFloatReg(r_dest); + } + if (src_fp && r_src.IsPair()) { + r_src = As64BitFloatReg(r_src); + } if (dest_fp) { if (src_fp) { OpRegCopy(r_dest, r_src); @@ -678,7 +687,7 @@ bool ArmMir2Lir::EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) } RegLocation ArmMir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero) { + RegLocation rl_src2, bool is_div, int flags) { LOG(FATAL) << "Unexpected use of GenDivRem for Arm"; return rl_dest; } @@ -1264,7 +1273,7 @@ void ArmMir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest, } void ArmMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) { + RegLocation rl_src2, int flags) { switch (opcode) { case Instruction::MUL_LONG: case Instruction::MUL_LONG_2ADDR: @@ -1279,7 +1288,7 @@ void ArmMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, R } // Fallback for all other ops. - Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); } /* @@ -1464,7 +1473,8 @@ void ArmMir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, void ArmMir2Lir::GenShiftImmOpLong(Instruction::Code opcode, - RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift) { + RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift, + int flags) { rl_src = LoadValueWide(rl_src, kCoreReg); // Per spec, we only care about low 6 bits of shift amount. 
int shift_amount = mir_graph_->ConstantValue(rl_shift) & 0x3f; @@ -1537,11 +1547,12 @@ void ArmMir2Lir::GenShiftImmOpLong(Instruction::Code opcode, } void ArmMir2Lir::GenArithImmOpLong(Instruction::Code opcode, - RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, + int flags) { if ((opcode == Instruction::SUB_LONG_2ADDR) || (opcode == Instruction::SUB_LONG)) { if (!rl_src2.is_const) { // Don't bother with special handling for subtract from immediate. - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); return; } } else { @@ -1552,7 +1563,7 @@ void ArmMir2Lir::GenArithImmOpLong(Instruction::Code opcode, } } if (PartiallyIntersects(rl_src1, rl_dest)) { - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); return; } DCHECK(rl_src2.is_const); @@ -1569,7 +1580,7 @@ void ArmMir2Lir::GenArithImmOpLong(Instruction::Code opcode, case Instruction::SUB_LONG: case Instruction::SUB_LONG_2ADDR: if ((mod_imm_lo < 0) || (mod_imm_hi < 0)) { - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); return; } break; diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc index dd8f7fe3d8..7100a285a6 100644 --- a/compiler/dex/quick/arm/target_arm.cc +++ b/compiler/dex/quick/arm/target_arm.cc @@ -89,7 +89,7 @@ RegLocation ArmMir2Lir::LocCReturnDouble() { // Return a target-dependent special register. RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) { - RegStorage res_reg = RegStorage::InvalidReg(); + RegStorage res_reg; switch (reg) { case kSelf: res_reg = rs_rARM_SELF; break; #ifdef ARM_R4_SUSPEND_FLAG @@ -104,10 +104,22 @@ RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) { case kArg1: res_reg = rs_r1; break; case kArg2: res_reg = rs_r2; break; case kArg3: res_reg = rs_r3; break; - case kFArg0: res_reg = rs_r0; break; - case kFArg1: res_reg = rs_r1; break; - case kFArg2: res_reg = rs_r2; break; - case kFArg3: res_reg = rs_r3; break; + case kFArg0: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r0 : rs_fr0; break; + case kFArg1: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r1 : rs_fr1; break; + case kFArg2: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r2 : rs_fr2; break; + case kFArg3: res_reg = kArm32QuickCodeUseSoftFloat ? rs_r3 : rs_fr3; break; + case kFArg4: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr4; break; + case kFArg5: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr5; break; + case kFArg6: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr6; break; + case kFArg7: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr7; break; + case kFArg8: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr8; break; + case kFArg9: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr9; break; + case kFArg10: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr10; break; + case kFArg11: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr11; break; + case kFArg12: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr12; break; + case kFArg13: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr13; break; + case kFArg14: res_reg = kArm32QuickCodeUseSoftFloat ? 
RegStorage::InvalidReg() : rs_fr14; break; + case kFArg15: res_reg = kArm32QuickCodeUseSoftFloat ? RegStorage::InvalidReg() : rs_fr15; break; case kRet0: res_reg = rs_r0; break; case kRet1: res_reg = rs_r1; break; case kInvokeTgt: res_reg = rs_rARM_LR; break; @@ -119,20 +131,6 @@ RegStorage ArmMir2Lir::TargetReg(SpecialTargetRegister reg) { return res_reg; } -RegStorage ArmMir2Lir::GetArgMappingToPhysicalReg(int arg_num) { - // For the 32-bit internal ABI, the first 3 arguments are passed in registers. - switch (arg_num) { - case 0: - return rs_r1; - case 1: - return rs_r2; - case 2: - return rs_r3; - default: - return RegStorage::InvalidReg(); - } -} - /* * Decode the register id. */ @@ -718,6 +716,32 @@ void ArmMir2Lir::LockCallTemps() { LockTemp(rs_r1); LockTemp(rs_r2); LockTemp(rs_r3); + if (!kArm32QuickCodeUseSoftFloat) { + LockTemp(rs_fr0); + LockTemp(rs_fr1); + LockTemp(rs_fr2); + LockTemp(rs_fr3); + LockTemp(rs_fr4); + LockTemp(rs_fr5); + LockTemp(rs_fr6); + LockTemp(rs_fr7); + LockTemp(rs_fr8); + LockTemp(rs_fr9); + LockTemp(rs_fr10); + LockTemp(rs_fr11); + LockTemp(rs_fr12); + LockTemp(rs_fr13); + LockTemp(rs_fr14); + LockTemp(rs_fr15); + LockTemp(rs_dr0); + LockTemp(rs_dr1); + LockTemp(rs_dr2); + LockTemp(rs_dr3); + LockTemp(rs_dr4); + LockTemp(rs_dr5); + LockTemp(rs_dr6); + LockTemp(rs_dr7); + } } /* To be used when explicitly managing register use */ @@ -726,6 +750,32 @@ void ArmMir2Lir::FreeCallTemps() { FreeTemp(rs_r1); FreeTemp(rs_r2); FreeTemp(rs_r3); + if (!kArm32QuickCodeUseSoftFloat) { + FreeTemp(rs_fr0); + FreeTemp(rs_fr1); + FreeTemp(rs_fr2); + FreeTemp(rs_fr3); + FreeTemp(rs_fr4); + FreeTemp(rs_fr5); + FreeTemp(rs_fr6); + FreeTemp(rs_fr7); + FreeTemp(rs_fr8); + FreeTemp(rs_fr9); + FreeTemp(rs_fr10); + FreeTemp(rs_fr11); + FreeTemp(rs_fr12); + FreeTemp(rs_fr13); + FreeTemp(rs_fr14); + FreeTemp(rs_fr15); + FreeTemp(rs_dr0); + FreeTemp(rs_dr1); + FreeTemp(rs_dr2); + FreeTemp(rs_dr3); + FreeTemp(rs_dr4); + FreeTemp(rs_dr5); + FreeTemp(rs_dr6); + FreeTemp(rs_dr7); + } } RegStorage ArmMir2Lir::LoadHelper(QuickEntrypointEnum trampoline) { @@ -847,4 +897,313 @@ void ArmMir2Lir::InstallLiteralPools() { Mir2Lir::InstallLiteralPools(); } +RegStorage ArmMir2Lir::InToRegStorageArmMapper::GetNextReg(bool is_double_or_float, bool is_wide) { + const RegStorage coreArgMappingToPhysicalReg[] = + {rs_r1, rs_r2, rs_r3}; + const int coreArgMappingToPhysicalRegSize = arraysize(coreArgMappingToPhysicalReg); + const RegStorage fpArgMappingToPhysicalReg[] = + {rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7, + rs_fr8, rs_fr9, rs_fr10, rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15}; + const uint32_t fpArgMappingToPhysicalRegSize = arraysize(fpArgMappingToPhysicalReg); + COMPILE_ASSERT(fpArgMappingToPhysicalRegSize % 2 == 0, knum_of_fp_arg_regs_not_even); + + if (kArm32QuickCodeUseSoftFloat) { + is_double_or_float = false; // Regard double as long, float as int. + is_wide = false; // Map long separately. + } + + RegStorage result = RegStorage::InvalidReg(); + if (is_double_or_float) { + // TODO: Remove "cur_fp_double_reg_ % 2 != 0" when we return double as double. + if (is_wide || cur_fp_double_reg_ % 2 != 0) { + cur_fp_double_reg_ = std::max(cur_fp_double_reg_, RoundUp(cur_fp_reg_, 2)); + if (cur_fp_double_reg_ < fpArgMappingToPhysicalRegSize) { + // TODO: Replace by following code in the branch when FlushIns() support 64-bit registers. 
+ // result = RegStorage::MakeRegPair(fpArgMappingToPhysicalReg[cur_fp_double_reg_], + // fpArgMappingToPhysicalReg[cur_fp_double_reg_ + 1]); + // result = As64BitFloatReg(result); + // cur_fp_double_reg_ += 2; + result = fpArgMappingToPhysicalReg[cur_fp_double_reg_]; + cur_fp_double_reg_++; + } + } else { + // TODO: Remove the check when we return double as double. + DCHECK_EQ(cur_fp_double_reg_ % 2, 0U); + if (cur_fp_reg_ % 2 == 0) { + cur_fp_reg_ = std::max(cur_fp_double_reg_, cur_fp_reg_); + } + if (cur_fp_reg_ < fpArgMappingToPhysicalRegSize) { + result = fpArgMappingToPhysicalReg[cur_fp_reg_]; + cur_fp_reg_++; + } + } + } else { + if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) { + result = coreArgMappingToPhysicalReg[cur_core_reg_++]; + // TODO: Enable following code when FlushIns() support 64-bit registers. + // if (is_wide && cur_core_reg_ < coreArgMappingToPhysicalRegSize) { + // result = RegStorage::MakeRegPair(result, coreArgMappingToPhysicalReg[cur_core_reg_++]); + // } + } + } + return result; +} + +RegStorage ArmMir2Lir::InToRegStorageMapping::Get(int in_position) const { + DCHECK(IsInitialized()); + auto res = mapping_.find(in_position); + return res != mapping_.end() ? res->second : RegStorage::InvalidReg(); +} + +void ArmMir2Lir::InToRegStorageMapping::Initialize(RegLocation* arg_locs, int count, + InToRegStorageMapper* mapper) { + DCHECK(mapper != nullptr); + max_mapped_in_ = -1; + is_there_stack_mapped_ = false; + for (int in_position = 0; in_position < count; in_position++) { + RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp, + arg_locs[in_position].wide); + if (reg.Valid()) { + mapping_[in_position] = reg; + // TODO: Enable the following code when FlushIns() support 64-bit argument registers. + // if (arg_locs[in_position].wide) { + // if (reg.Is32Bit()) { + // // As it is a split long, the hi-part is on stack. + // is_there_stack_mapped_ = true; + // } + // // We covered 2 v-registers, so skip the next one + // in_position++; + // } + max_mapped_in_ = std::max(max_mapped_in_, in_position); + } else { + is_there_stack_mapped_ = true; + } + } + initialized_ = true; +} + +// TODO: Should be able to return long, double registers. +// Need check some common code as it will break some assumption. 
+RegStorage ArmMir2Lir::GetArgMappingToPhysicalReg(int arg_num) { + if (!in_to_reg_storage_mapping_.IsInitialized()) { + int start_vreg = mir_graph_->GetFirstInVR(); + RegLocation* arg_locs = &mir_graph_->reg_location_[start_vreg]; + + InToRegStorageArmMapper mapper; + in_to_reg_storage_mapping_.Initialize(arg_locs, mir_graph_->GetNumOfInVRs(), &mapper); + } + return in_to_reg_storage_mapping_.Get(arg_num); +} + +int ArmMir2Lir::GenDalvikArgsNoRange(CallInfo* info, + int call_state, LIR** pcrLabel, NextCallInsn next_call_insn, + const MethodReference& target_method, + uint32_t vtable_idx, uintptr_t direct_code, + uintptr_t direct_method, InvokeType type, bool skip_this) { + if (kArm32QuickCodeUseSoftFloat) { + return Mir2Lir::GenDalvikArgsNoRange(info, call_state, pcrLabel, next_call_insn, target_method, + vtable_idx, direct_code, direct_method, type, skip_this); + } else { + return GenDalvikArgsRange(info, call_state, pcrLabel, next_call_insn, target_method, vtable_idx, + direct_code, direct_method, type, skip_this); + } +} + +int ArmMir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, + LIR** pcrLabel, NextCallInsn next_call_insn, + const MethodReference& target_method, + uint32_t vtable_idx, uintptr_t direct_code, + uintptr_t direct_method, InvokeType type, bool skip_this) { + if (kArm32QuickCodeUseSoftFloat) { + return Mir2Lir::GenDalvikArgsRange(info, call_state, pcrLabel, next_call_insn, target_method, + vtable_idx, direct_code, direct_method, type, skip_this); + } + + // TODO: Rework the implementation when argument register can be long or double. + + /* If no arguments, just return */ + if (info->num_arg_words == 0) { + return call_state; + } + + const int start_index = skip_this ? 1 : 0; + + InToRegStorageArmMapper mapper; + InToRegStorageMapping in_to_reg_storage_mapping; + in_to_reg_storage_mapping.Initialize(info->args, info->num_arg_words, &mapper); + const int last_mapped_in = in_to_reg_storage_mapping.GetMaxMappedIn(); + int regs_left_to_pass_via_stack = info->num_arg_words - (last_mapped_in + 1); + + // First of all, check whether it makes sense to use bulk copying. + // Bulk copying is done only for the range case. + // TODO: make a constant instead of 2 + if (info->is_range && regs_left_to_pass_via_stack >= 2) { + // Scan the rest of the args - if in phys_reg flush to memory + for (int next_arg = last_mapped_in + 1; next_arg < info->num_arg_words;) { + RegLocation loc = info->args[next_arg]; + if (loc.wide) { + // TODO: Only flush hi-part. + if (loc.high_word) { + loc = info->args[--next_arg]; + } + loc = UpdateLocWide(loc); + if (loc.location == kLocPhysReg) { + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); + StoreBaseDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64, kNotVolatile); + } + next_arg += 2; + } else { + loc = UpdateLoc(loc); + if (loc.location == kLocPhysReg) { + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); + if (loc.ref) { + StoreRefDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, kNotVolatile); + } else { + StoreBaseDisp(TargetPtrReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32, + kNotVolatile); + } + } + next_arg++; + } + } + + // The rest can be copied together + int start_offset = SRegOffset(info->args[last_mapped_in + 1].s_reg_low); + int outs_offset = StackVisitor::GetOutVROffset(last_mapped_in + 1, + cu_->instruction_set); + + int current_src_offset = start_offset; + int current_dest_offset = outs_offset; + + // Only davik regs are accessed in this loop; no next_call_insn() calls. 
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); + while (regs_left_to_pass_via_stack > 0) { + /* + * TODO: Improve by adding block copy for large number of arguments. This + * should be done, if possible, as a target-depending helper. For now, just + * copy a Dalvik vreg at a time. + */ + // Moving 32-bits via general purpose register. + size_t bytes_to_move = sizeof(uint32_t); + + // Instead of allocating a new temp, simply reuse one of the registers being used + // for argument passing. + RegStorage temp = TargetReg(kArg3, kNotWide); + + // Now load the argument VR and store to the outs. + Load32Disp(TargetPtrReg(kSp), current_src_offset, temp); + Store32Disp(TargetPtrReg(kSp), current_dest_offset, temp); + + current_src_offset += bytes_to_move; + current_dest_offset += bytes_to_move; + regs_left_to_pass_via_stack -= (bytes_to_move >> 2); + } + DCHECK_EQ(regs_left_to_pass_via_stack, 0); + } + + // Now handle rest not registers if they are + if (in_to_reg_storage_mapping.IsThereStackMapped()) { + RegStorage regWide = TargetReg(kArg2, kWide); + for (int i = start_index; i <= last_mapped_in + regs_left_to_pass_via_stack; i++) { + RegLocation rl_arg = info->args[i]; + rl_arg = UpdateRawLoc(rl_arg); + RegStorage reg = in_to_reg_storage_mapping.Get(i); + // TODO: Only pass split wide hi-part via stack. + if (!reg.Valid() || rl_arg.wide) { + int out_offset = StackVisitor::GetOutVROffset(i, cu_->instruction_set); + + { + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); + if (rl_arg.wide) { + if (rl_arg.location == kLocPhysReg) { + StoreBaseDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, k64, kNotVolatile); + } else { + LoadValueDirectWideFixed(rl_arg, regWide); + StoreBaseDisp(TargetPtrReg(kSp), out_offset, regWide, k64, kNotVolatile); + } + } else { + if (rl_arg.location == kLocPhysReg) { + if (rl_arg.ref) { + StoreRefDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, kNotVolatile); + } else { + StoreBaseDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, k32, kNotVolatile); + } + } else { + if (rl_arg.ref) { + RegStorage regSingle = TargetReg(kArg2, kRef); + LoadValueDirectFixed(rl_arg, regSingle); + StoreRefDisp(TargetPtrReg(kSp), out_offset, regSingle, kNotVolatile); + } else { + RegStorage regSingle = TargetReg(kArg2, kNotWide); + LoadValueDirectFixed(rl_arg, regSingle); + StoreBaseDisp(TargetPtrReg(kSp), out_offset, regSingle, k32, kNotVolatile); + } + } + } + } + + call_state = next_call_insn(cu_, info, call_state, target_method, + vtable_idx, direct_code, direct_method, type); + } + if (rl_arg.wide) { + i++; + } + } + } + + // Finish with mapped registers + for (int i = start_index; i <= last_mapped_in; i++) { + RegLocation rl_arg = info->args[i]; + rl_arg = UpdateRawLoc(rl_arg); + RegStorage reg = in_to_reg_storage_mapping.Get(i); + if (reg.Valid()) { + if (reg.Is64Bit()) { + LoadValueDirectWideFixed(rl_arg, reg); + } else { + // TODO: Only split long should be the case we need to care about. + if (rl_arg.wide) { + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); + int high_word = rl_arg.high_word ? 1 : 0; + rl_arg = high_word ? info->args[i - 1] : rl_arg; + if (rl_arg.location == kLocPhysReg) { + RegStorage rs_arg = rl_arg.reg; + if (rs_arg.IsDouble() && rs_arg.Is64BitSolo()) { + rs_arg = As64BitFloatRegPair(rs_arg); + } + RegStorage rs_arg_low = rs_arg.GetLow(); + RegStorage rs_arg_high = rs_arg.GetHigh(); + OpRegCopy(reg, high_word ? 
rs_arg_high : rs_arg_low); + } else { + Load32Disp(TargetPtrReg(kSp), SRegOffset(rl_arg.s_reg_low + high_word), reg); + } + } else { + LoadValueDirectFixed(rl_arg, reg); + } + } + call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx, + direct_code, direct_method, type); + } + if (reg.Is64Bit()) { + i++; + } + } + + call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx, + direct_code, direct_method, type); + if (pcrLabel) { + if (!cu_->compiler_driver->GetCompilerOptions().GetImplicitNullChecks()) { + *pcrLabel = GenExplicitNullCheck(TargetReg(kArg1, kRef), info->opt_flags); + } else { + *pcrLabel = nullptr; + // In lieu of generating a check for kArg1 being null, we need to + // perform a load when doing implicit checks. + RegStorage tmp = AllocTemp(); + Load32Disp(TargetReg(kArg1, kRef), 0, tmp); + MarkPossibleNullPointerException(info->opt_flags); + FreeTemp(tmp); + } + } + return call_state; +} + } // namespace art diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc index 09acf4cf17..ce2de65abf 100644 --- a/compiler/dex/quick/arm/utility_arm.cc +++ b/compiler/dex/quick/arm/utility_arm.cc @@ -1007,6 +1007,12 @@ LIR* ArmMir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegStora // Intentional fall-though. case k64: if (r_src.IsFloat()) { + // Note: If the register is retrieved by register allocator, it should never be a pair. + // But some functions in mir2lir assume 64-bit registers are 32-bit register pairs. + // TODO: Rework Mir2Lir::LoadArg() and Mir2Lir::LoadArgDirect(). + if (r_src.IsPair()) { + r_src = As64BitFloatReg(r_src); + } DCHECK(!r_src.IsPair()); store = LoadStoreUsingInsnWithOffsetImm8Shl2(kThumb2Vstrd, r_base, displacement, r_src); } else { diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h index 510bd4c5de..9f0260635d 100644 --- a/compiler/dex/quick/arm64/codegen_arm64.h +++ b/compiler/dex/quick/arm64/codegen_arm64.h @@ -141,13 +141,13 @@ class Arm64Mir2Lir FINAL : public Mir2Lir { void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, RegLocation lr_shift) OVERRIDE; void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_dest, int scale) OVERRIDE; void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_src, int scale, bool card_mark) OVERRIDE; void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_shift) OVERRIDE; + RegLocation rl_shift, int flags) OVERRIDE; void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) OVERRIDE; void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, @@ -173,7 +173,7 @@ class Arm64Mir2Lir FINAL : public Mir2Lir { bool GenInlinedArrayCopyCharArray(CallInfo* info) OVERRIDE; void GenIntToLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div) OVERRIDE; RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, 
int lit, bool is_div) @@ -363,8 +363,8 @@ class Arm64Mir2Lir FINAL : public Mir2Lir { void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir); void AssignDataOffsets(); RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, - bool is_div, bool check_zero); - RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div); + bool is_div, int flags) OVERRIDE; + RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div) OVERRIDE; size_t GetLoadStoreSize(LIR* lir); bool SmallLiteralDivRem64(Instruction::Code dalvik_opcode, bool is_div, RegLocation rl_src, @@ -413,7 +413,7 @@ class Arm64Mir2Lir FINAL : public Mir2Lir { void GenNotLong(RegLocation rl_dest, RegLocation rl_src); void GenNegLong(RegLocation rl_dest, RegLocation rl_src); void GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div); + RegLocation rl_src2, bool is_div, int flags); InToRegStorageMapping in_to_reg_storage_mapping_; static const A64EncodingMap EncodingMap[kA64Last]; diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc index abcb30f49a..6e7241dcd6 100644 --- a/compiler/dex/quick/arm64/int_arm64.cc +++ b/compiler/dex/quick/arm64/int_arm64.cc @@ -614,7 +614,7 @@ RegLocation Arm64Mir2Lir::GenDivRemLit(RegLocation rl_dest, RegStorage reg1, int } RegLocation Arm64Mir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero) { + RegLocation rl_src2, bool is_div, int flags) { LOG(FATAL) << "Unexpected use of GenDivRem for Arm64"; return rl_dest; } @@ -1020,7 +1020,7 @@ void Arm64Mir2Lir::GenIntToLong(RegLocation rl_dest, RegLocation rl_src) { } void Arm64Mir2Lir::GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2, bool is_div) { + RegLocation rl_src1, RegLocation rl_src2, bool is_div, int flags) { if (rl_src2.is_const) { DCHECK(rl_src2.wide); int64_t lit = mir_graph_->ConstantValueWide(rl_src2); @@ -1032,7 +1032,9 @@ void Arm64Mir2Lir::GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_result; rl_src1 = LoadValueWide(rl_src1, kCoreReg); rl_src2 = LoadValueWide(rl_src2, kCoreReg); - GenDivZeroCheck(rl_src2.reg); + if ((flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { + GenDivZeroCheck(rl_src2.reg); + } rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, is_div); StoreValueWide(rl_dest, rl_result); } @@ -1067,7 +1069,7 @@ void Arm64Mir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) { } void Arm64Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_src1, RegLocation rl_src2, int flags) { switch (opcode) { case Instruction::NOT_LONG: GenNotLong(rl_dest, rl_src2); @@ -1086,11 +1088,11 @@ void Arm64Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, return; case Instruction::DIV_LONG: case Instruction::DIV_LONG_2ADDR: - GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true); + GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true, flags); return; case Instruction::REM_LONG: case Instruction::REM_LONG_2ADDR: - GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false); + GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false, flags); return; case Instruction::AND_LONG_2ADDR: case Instruction::AND_LONG: @@ -1312,7 +1314,8 @@ void Arm64Mir2Lir::GenArrayPut(int opt_flags, 
OpSize size, RegLocation rl_array, } void Arm64Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, - RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift) { + RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift, + int flags) { OpKind op = kOpBkpt; // Per spec, we only care about low 6 bits of shift amount. int shift_amount = mir_graph_->ConstantValue(rl_shift) & 0x3f; @@ -1344,7 +1347,7 @@ void Arm64Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, } void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_src1, RegLocation rl_src2, int flags) { OpKind op = kOpBkpt; switch (opcode) { case Instruction::ADD_LONG: @@ -1373,7 +1376,7 @@ void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_de if (op == kOpSub) { if (!rl_src2.is_const) { - return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); } } else { // Associativity. diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc index 0883694033..6985b73845 100644 --- a/compiler/dex/quick/arm64/utility_arm64.cc +++ b/compiler/dex/quick/arm64/utility_arm64.cc @@ -768,8 +768,8 @@ LIR* Arm64Mir2Lir::OpRegRegRegExtend(OpKind op, RegStorage r_dest, RegStorage r_ opcode = kA64Sub4RRre; break; default: - LOG(FATAL) << "Unimplemented opcode: " << op; - break; + UNIMPLEMENTED(FATAL) << "Unimplemented opcode: " << op; + UNREACHABLE(); } A64Opcode widened_opcode = r_dest.Is64Bit() ? WIDE(opcode) : opcode; diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc index a33d15fb32..2abfcc3639 100644 --- a/compiler/dex/quick/gen_common.cc +++ b/compiler/dex/quick/gen_common.cc @@ -1501,7 +1501,7 @@ void Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, void Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_src1, RegLocation rl_src2, int flags) { DCHECK(cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64); OpKind op = kOpBkpt; bool is_div_rem = false; @@ -1600,7 +1600,7 @@ void Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, if (cu_->instruction_set == kMips || cu_->instruction_set == kArm64) { rl_src1 = LoadValue(rl_src1, kCoreReg); rl_src2 = LoadValue(rl_src2, kCoreReg); - if (check_zero) { + if (check_zero && (flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { GenDivZeroCheck(rl_src2.reg); } rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, op == kOpDiv); @@ -1612,7 +1612,7 @@ void Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, // calculate using a MUL and subtract. rl_src1 = LoadValue(rl_src1, kCoreReg); rl_src2 = LoadValue(rl_src2, kCoreReg); - if (check_zero) { + if (check_zero && (flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { GenDivZeroCheck(rl_src2.reg); } rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, op == kOpDiv); @@ -1626,7 +1626,7 @@ void Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, LoadValueDirectFixed(rl_src2, TargetReg(kArg1, kNotWide)); RegStorage r_tgt = CallHelperSetup(kQuickIdivmod); LoadValueDirectFixed(rl_src1, TargetReg(kArg0, kNotWide)); - if (check_zero) { + if (check_zero && (flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { GenDivZeroCheck(TargetReg(kArg1, kNotWide)); } // NOTE: callout here is not a safepoint. 
@@ -1914,7 +1914,7 @@ void Mir2Lir::GenArithOpIntLit(Instruction::Code opcode, RegLocation rl_dest, Re } void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_src1, RegLocation rl_src2, int flags) { RegLocation rl_result; OpKind first_op = kOpBkpt; OpKind second_op = kOpBkpt; @@ -1999,7 +1999,9 @@ void Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegStorage r_tmp2 = TargetReg(kArg2, kWide); LoadValueDirectWideFixed(rl_src2, r_tmp2); RegStorage r_tgt = CallHelperSetup(target); - GenDivZeroCheckWide(r_tmp2); + if ((flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { + GenDivZeroCheckWide(r_tmp2); + } LoadValueDirectWideFixed(rl_src1, r_tmp1); // NOTE: callout here is not a safepoint CallHelper(r_tgt, target, false /* not safepoint */); diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc index 2bef7c53c5..bc4d00b6cd 100755 --- a/compiler/dex/quick/gen_invoke.cc +++ b/compiler/dex/quick/gen_invoke.cc @@ -248,13 +248,13 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(QuickEntrypointEnum trampo if (cu_->instruction_set == kMips) { LoadValueDirectFixed(arg1, TargetReg(arg1.fp ? kFArg2 : kArg1, kNotWide)); } else { - LoadValueDirectFixed(arg1, TargetReg(kArg1, kNotWide)); + LoadValueDirectFixed(arg1, TargetReg(arg1.fp ? kFArg1 : kArg1, kNotWide)); } } else { if (cu_->instruction_set == kMips) { LoadValueDirectWideFixed(arg1, TargetReg(arg1.fp ? kFArg2 : kArg2, kWide)); } else { - LoadValueDirectWideFixed(arg1, TargetReg(kArg1, kWide)); + LoadValueDirectWideFixed(arg1, TargetReg(arg1.fp ? kFArg1 : kArg1, kWide)); } } } else { @@ -365,6 +365,7 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation( * ArgLocs is an array of location records describing the incoming arguments * with one location record per word of argument. */ +// TODO: Support 64-bit argument registers. void Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { /* * Dummy up a RegLocation for the incoming StackReference<mirror::ArtMethod> diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h index bd709f38ae..508d474404 100644 --- a/compiler/dex/quick/mips/codegen_mips.h +++ b/compiler/dex/quick/mips/codegen_mips.h @@ -86,13 +86,13 @@ class MipsMir2Lir FINAL : public Mir2Lir { // Required for target - Dalvik-level generators. 
void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2); + RegLocation rl_src1, RegLocation rl_src2, int flags); void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_dest, int scale); void GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, RegLocation rl_index, RegLocation rl_src, int scale, bool card_mark); void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_shift); + RegLocation rl_shift, int flags); void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2); void GenArithOpFloat(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, @@ -108,7 +108,7 @@ class MipsMir2Lir FINAL : public Mir2Lir { bool GenInlinedPeek(CallInfo* info, OpSize size); bool GenInlinedPoke(CallInfo* info, OpSize size); void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div); RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div); void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2); @@ -190,8 +190,8 @@ class MipsMir2Lir FINAL : public Mir2Lir { void ConvertShortToLongBranch(LIR* lir); RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero); - RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div); + RegLocation rl_src2, bool is_div, int flags) OVERRIDE; + RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src1, int lit, bool is_div) OVERRIDE; }; } // namespace art diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc index 30aa611f9f..baf7311398 100644 --- a/compiler/dex/quick/mips/int_mips.cc +++ b/compiler/dex/quick/mips/int_mips.cc @@ -263,7 +263,7 @@ RegLocation MipsMir2Lir::GenDivRemLit(RegLocation rl_dest, RegStorage reg1, int } RegLocation MipsMir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero) { + RegLocation rl_src2, bool is_div, int flags) { LOG(FATAL) << "Unexpected use of GenDivRem for Mips"; return rl_dest; } @@ -437,7 +437,7 @@ void MipsMir2Lir::GenSubLong(Instruction::Code opcode, RegLocation rl_dest, } void MipsMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) { + RegLocation rl_src2, int flags) { switch (opcode) { case Instruction::ADD_LONG: case Instruction::ADD_LONG_2ADDR: @@ -456,7 +456,7 @@ void MipsMir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, } // Fallback for all other ops. - Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); } void MipsMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) { @@ -628,15 +628,16 @@ void MipsMir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, } void MipsMir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_shift) { + RegLocation rl_src1, RegLocation rl_shift, int flags) { // Default implementation is just to ignore the constant case. 
GenShiftOpLong(opcode, rl_dest, rl_src1, rl_shift); } void MipsMir2Lir::GenArithImmOpLong(Instruction::Code opcode, - RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, + int flags) { // Default - bail to non-const handler. - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); } } // namespace art diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc index 4399981272..408606d366 100644 --- a/compiler/dex/quick/mir_to_lir.cc +++ b/compiler/dex/quick/mir_to_lir.cc @@ -928,12 +928,12 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list case Instruction::NEG_INT: case Instruction::NOT_INT: - GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[0]); + GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[0], opt_flags); break; case Instruction::NEG_LONG: case Instruction::NOT_LONG: - GenArithOpLong(opcode, rl_dest, rl_src[0], rl_src[0]); + GenArithOpLong(opcode, rl_dest, rl_src[0], rl_src[0], opt_flags); break; case Instruction::NEG_FLOAT: @@ -993,7 +993,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list GenArithOpIntLit(opcode, rl_dest, rl_src[0], mir_graph_->ConstantValue(rl_src[1].orig_sreg)); } else { - GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1]); + GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1], opt_flags); } break; @@ -1013,7 +1013,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list InexpensiveConstantInt(mir_graph_->ConstantValue(rl_src[1]), opcode)) { GenArithOpIntLit(opcode, rl_dest, rl_src[0], mir_graph_->ConstantValue(rl_src[1])); } else { - GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1]); + GenArithOpInt(opcode, rl_dest, rl_src[0], rl_src[1], opt_flags); } break; @@ -1028,7 +1028,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list case Instruction::OR_LONG_2ADDR: case Instruction::XOR_LONG_2ADDR: if (rl_src[0].is_const || rl_src[1].is_const) { - GenArithImmOpLong(opcode, rl_dest, rl_src[0], rl_src[1]); + GenArithImmOpLong(opcode, rl_dest, rl_src[0], rl_src[1], opt_flags); break; } FALLTHROUGH_INTENDED; @@ -1038,7 +1038,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list case Instruction::MUL_LONG_2ADDR: case Instruction::DIV_LONG_2ADDR: case Instruction::REM_LONG_2ADDR: - GenArithOpLong(opcode, rl_dest, rl_src[0], rl_src[1]); + GenArithOpLong(opcode, rl_dest, rl_src[0], rl_src[1], opt_flags); break; case Instruction::SHL_LONG: @@ -1048,7 +1048,7 @@ void Mir2Lir::CompileDalvikInstruction(MIR* mir, BasicBlock* bb, LIR* label_list case Instruction::SHR_LONG_2ADDR: case Instruction::USHR_LONG_2ADDR: if (rl_src[1].is_const) { - GenShiftImmOpLong(opcode, rl_dest, rl_src[0], rl_src[1]); + GenShiftImmOpLong(opcode, rl_dest, rl_src[0], rl_src[1], opt_flags); } else { GenShiftOpLong(opcode, rl_dest, rl_src[0], rl_src[1]); } diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index ea93bbe7f2..f4e6dfead2 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -857,7 +857,7 @@ class Mir2Lir : public Backend { void GenArithOpIntLit(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src, int lit); virtual void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2); + RegLocation rl_src1, RegLocation rl_src2, int flags); void 
GenConversionCall(QuickEntrypointEnum trampoline, RegLocation rl_dest, RegLocation rl_src); virtual void GenSuspendTest(int opt_flags); virtual void GenSuspendTestAndBranch(int opt_flags, LIR* target); @@ -865,7 +865,7 @@ class Mir2Lir : public Backend { // This will be overridden by x86 implementation. virtual void GenConstWide(RegLocation rl_dest, int64_t value); virtual void GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2); + RegLocation rl_src1, RegLocation rl_src2, int flags); // Shared by all targets - implemented in gen_invoke.cc. LIR* CallHelper(RegStorage r_tgt, QuickEntrypointEnum trampoline, bool safepoint_pc, @@ -1191,13 +1191,17 @@ class Mir2Lir : public Backend { */ virtual RegStorage TargetReg(SpecialTargetRegister reg, WideKind wide_kind) { if (wide_kind == kWide) { - DCHECK((kArg0 <= reg && reg < kArg7) || (kFArg0 <= reg && reg < kFArg7) || (kRet0 == reg)); + DCHECK((kArg0 <= reg && reg < kArg7) || (kFArg0 <= reg && reg < kFArg15) || (kRet0 == reg)); COMPILE_ASSERT((kArg1 == kArg0 + 1) && (kArg2 == kArg1 + 1) && (kArg3 == kArg2 + 1) && (kArg4 == kArg3 + 1) && (kArg5 == kArg4 + 1) && (kArg6 == kArg5 + 1) && (kArg7 == kArg6 + 1), kargs_range_unexpected); COMPILE_ASSERT((kFArg1 == kFArg0 + 1) && (kFArg2 == kFArg1 + 1) && (kFArg3 == kFArg2 + 1) && (kFArg4 == kFArg3 + 1) && (kFArg5 == kFArg4 + 1) && (kFArg6 == kFArg5 + 1) && - (kFArg7 == kFArg6 + 1), kfargs_range_unexpected); + (kFArg7 == kFArg6 + 1) && (kFArg8 == kFArg7 + 1) && (kFArg9 == kFArg8 + 1) && + (kFArg10 == kFArg9 + 1) && (kFArg11 == kFArg10 + 1) && + (kFArg12 == kFArg11 + 1) && (kFArg13 == kFArg12 + 1) && + (kFArg14 == kFArg13 + 1) && (kFArg15 == kFArg14 + 1), + kfargs_range_unexpected); COMPILE_ASSERT(kRet1 == kRet0 + 1, kret_range_unexpected); return RegStorage::MakeRegPair(TargetReg(reg), TargetReg(static_cast<SpecialTargetRegister>(reg + 1))); @@ -1259,7 +1263,7 @@ class Mir2Lir : public Backend { // Required for target - Dalvik-level generators. virtual void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_src2) = 0; + RegLocation rl_src1, RegLocation rl_src2, int flags) = 0; virtual void GenArithOpDouble(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) = 0; @@ -1297,10 +1301,11 @@ class Mir2Lir : public Backend { * @param rl_src1 Numerator Location. * @param rl_src2 Divisor Location. * @param is_div 'true' if this is a division, 'false' for a remainder. - * @param check_zero 'true' if an exception should be generated if the divisor is 0. + * @param flags The instruction optimization flags. It can include information + * if exception check can be elided. */ virtual RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero) = 0; + RegLocation rl_src2, bool is_div, int flags) = 0; /* * @brief Generate an integer div or rem operation by a literal. * @param rl_dest Destination Location. @@ -1382,7 +1387,7 @@ class Mir2Lir : public Backend { RegLocation rl_index, RegLocation rl_src, int scale, bool card_mark) = 0; virtual void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_shift) = 0; + RegLocation rl_src1, RegLocation rl_shift, int flags) = 0; // Required for target - single operation generators. 
virtual LIR* OpUnconditionalBranch(LIR* target) = 0; diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc index 6f2a647313..8f7bd3033a 100644 --- a/compiler/dex/quick/quick_compiler.cc +++ b/compiler/dex/quick/quick_compiler.cc @@ -425,6 +425,21 @@ static int kAllOpcodes[] = { kMirOpSelect, }; +static int kInvokeOpcodes[] = { + Instruction::INVOKE_VIRTUAL, + Instruction::INVOKE_SUPER, + Instruction::INVOKE_DIRECT, + Instruction::INVOKE_STATIC, + Instruction::INVOKE_INTERFACE, + Instruction::INVOKE_VIRTUAL_RANGE, + Instruction::INVOKE_SUPER_RANGE, + Instruction::INVOKE_DIRECT_RANGE, + Instruction::INVOKE_STATIC_RANGE, + Instruction::INVOKE_INTERFACE_RANGE, + Instruction::INVOKE_VIRTUAL_QUICK, + Instruction::INVOKE_VIRTUAL_RANGE_QUICK, +}; + // Unsupported opcodes. nullptr can be used when everything is supported. Size of the lists is // recorded below. static const int* kUnsupportedOpcodes[] = { @@ -523,8 +538,8 @@ bool QuickCompiler::CanCompileMethod(uint32_t method_idx, const DexFile& dex_fil for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) { int opcode = mir->dalvikInsn.opcode; // Check if we support the byte code. - if (std::find(unsupport_list, unsupport_list + unsupport_list_size, - opcode) != unsupport_list + unsupport_list_size) { + if (std::find(unsupport_list, unsupport_list + unsupport_list_size, opcode) + != unsupport_list + unsupport_list_size) { if (!MIR::DecodedInstruction::IsPseudoMirOp(opcode)) { VLOG(compiler) << "Unsupported dalvik byte code : " << mir->dalvikInsn.opcode; @@ -535,11 +550,8 @@ bool QuickCompiler::CanCompileMethod(uint32_t method_idx, const DexFile& dex_fil return false; } // Check if it invokes a prototype that we cannot support. - if (Instruction::INVOKE_VIRTUAL == opcode || - Instruction::INVOKE_SUPER == opcode || - Instruction::INVOKE_DIRECT == opcode || - Instruction::INVOKE_STATIC == opcode || - Instruction::INVOKE_INTERFACE == opcode) { + if (std::find(kInvokeOpcodes, kInvokeOpcodes + arraysize(kInvokeOpcodes), opcode) + != kInvokeOpcodes + arraysize(kInvokeOpcodes)) { uint32_t invoke_method_idx = mir->dalvikInsn.vB; const char* invoke_method_shorty = dex_file.GetMethodShorty( dex_file.GetMethodId(invoke_method_idx)); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index b3544dafba..7b5b831e38 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -180,11 +180,11 @@ class X86Mir2Lir : public Mir2Lir { // Long instructions. void GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) OVERRIDE; + RegLocation rl_src2, int flags) OVERRIDE; void GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src1, RegLocation rl_shift) OVERRIDE; + RegLocation rl_src1, RegLocation rl_shift, int flags) OVERRIDE; void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) OVERRIDE; void GenIntToLong(RegLocation rl_dest, RegLocation rl_src) OVERRIDE; void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, @@ -314,9 +314,10 @@ class X86Mir2Lir : public Mir2Lir { * @param rl_dest Destination for the result. * @param rl_lhs Left hand operand. * @param rl_rhs Right hand operand. + * @param flags The instruction optimization flags. 
*/ void GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_lhs, - RegLocation rl_rhs) OVERRIDE; + RegLocation rl_rhs, int flags) OVERRIDE; /* * @brief Load the Method* of a dex method into the register. @@ -768,10 +769,11 @@ class X86Mir2Lir : public Mir2Lir { * @param rl_src1 Numerator Location. * @param rl_src2 Divisor Location. * @param is_div 'true' if this is a division, 'false' for a remainder. - * @param check_zero 'true' if an exception should be generated if the divisor is 0. + * @param flags The instruction optimization flags. It can include information + * if exception check can be elided. */ RegLocation GenDivRem(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, - bool is_div, bool check_zero); + bool is_div, int flags); /* * @brief Generate an integer div or rem operation by a literal. @@ -788,10 +790,11 @@ class X86Mir2Lir : public Mir2Lir { * @param rl_dest The destination. * @param rl_src The value to be shifted. * @param shift_amount How much to shift. + * @param flags The instruction optimization flags. * @returns the RegLocation of the result. */ RegLocation GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src, int shift_amount); + RegLocation rl_src, int shift_amount, int flags); /* * Generate an imul of a register by a constant or a better sequence. * @param dest Destination Register. @@ -858,13 +861,13 @@ class X86Mir2Lir : public Mir2Lir { // Try to do a long multiplication where rl_src2 is a constant. This simplified setup might fail, // in which case false will be returned. - bool GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val); + bool GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val, int flags); void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2); + RegLocation rl_src2, int flags); void GenNotLong(RegLocation rl_dest, RegLocation rl_src); void GenNegLong(RegLocation rl_dest, RegLocation rl_src); void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div); + RegLocation rl_src2, bool is_div, int flags); void SpillCoreRegs(); void UnSpillCoreRegs(); diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc index acf5599d5f..aa1bf7fe6d 100755 --- a/compiler/dex/quick/x86/int_x86.cc +++ b/compiler/dex/quick/x86/int_x86.cc @@ -768,7 +768,7 @@ RegLocation X86Mir2Lir::GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegSto } RegLocation X86Mir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div, bool check_zero) { + RegLocation rl_src2, bool is_div, int flags) { // We have to use fixed registers, so flush all the temps. // Prepare for explicit register usage. @@ -783,7 +783,7 @@ RegLocation X86Mir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1, // Copy LHS sign bit into EDX. NewLIR0(kx86Cdq32Da); - if (check_zero) { + if ((flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { // Handle division by zero case. GenDivZeroCheck(rs_r1); } @@ -1506,7 +1506,7 @@ void X86Mir2Lir::GenImulMemImm(RegStorage dest, int sreg, int displacement, int } void X86Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) { + RegLocation rl_src2, int flags) { if (!cu_->target64) { // Some x86 32b ops are fallback. 
switch (opcode) { @@ -1515,7 +1515,7 @@ void X86Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, R case Instruction::DIV_LONG_2ADDR: case Instruction::REM_LONG: case Instruction::REM_LONG_2ADDR: - Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + Mir2Lir::GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); return; default: @@ -1541,17 +1541,17 @@ void X86Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, R case Instruction::MUL_LONG: case Instruction::MUL_LONG_2ADDR: - GenMulLong(opcode, rl_dest, rl_src1, rl_src2); + GenMulLong(opcode, rl_dest, rl_src1, rl_src2, flags); return; case Instruction::DIV_LONG: case Instruction::DIV_LONG_2ADDR: - GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true); + GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true, flags); return; case Instruction::REM_LONG: case Instruction::REM_LONG_2ADDR: - GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false); + GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false, flags); return; case Instruction::AND_LONG_2ADDR: @@ -1579,7 +1579,7 @@ void X86Mir2Lir::GenArithOpLong(Instruction::Code opcode, RegLocation rl_dest, R } } -bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val) { +bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64_t val, int flags) { // All memory accesses below reference dalvik regs. ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); @@ -1597,14 +1597,14 @@ bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64 StoreValueWide(rl_dest, rl_src1); return true; } else if (val == 2) { - GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1); + GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1, flags); return true; } else if (IsPowerOfTwo(val)) { int shift_amount = LowestSetBit(val); if (!PartiallyIntersects(rl_src1, rl_dest)) { rl_src1 = LoadValueWide(rl_src1, kCoreReg); RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest, rl_src1, - shift_amount); + shift_amount, flags); StoreValueWide(rl_dest, rl_result); return true; } @@ -1658,13 +1658,13 @@ bool X86Mir2Lir::GenMulLongConst(RegLocation rl_dest, RegLocation rl_src1, int64 } void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2) { + RegLocation rl_src2, int flags) { if (rl_src1.is_const) { std::swap(rl_src1, rl_src2); } if (rl_src2.is_const) { - if (GenMulLongConst(rl_dest, rl_src1, mir_graph_->ConstantValueWide(rl_src2))) { + if (GenMulLongConst(rl_dest, rl_src1, mir_graph_->ConstantValueWide(rl_src2), flags)) { return; } } @@ -2164,7 +2164,7 @@ void X86Mir2Lir::GenDivRemLongLit(RegLocation rl_dest, RegLocation rl_src, } void X86Mir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1, - RegLocation rl_src2, bool is_div) { + RegLocation rl_src2, bool is_div, int flags) { if (!cu_->target64) { LOG(FATAL) << "Unexpected use GenDivRemLong()"; return; @@ -2191,7 +2191,9 @@ void X86Mir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocati NewLIR0(kx86Cqo64Da); // Handle division by zero case. - GenDivZeroCheckWide(rs_r1q); + if ((flags & MIR_IGNORE_DIV_ZERO_CHECK) == 0) { + GenDivZeroCheckWide(rs_r1q); + } // Have to catch 0x8000000000000000/-1 case, or we will get an exception! 
NewLIR2(kX86Cmp64RI8, rs_r1q.GetReg(), -1); @@ -2392,7 +2394,7 @@ void X86Mir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, } RegLocation X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src, int shift_amount) { + RegLocation rl_src, int shift_amount, int flags) { RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true); if (cu_->target64) { OpKind op = static_cast<OpKind>(0); /* Make gcc happy */ @@ -2477,7 +2479,7 @@ RegLocation X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation } void X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_src, RegLocation rl_shift) { + RegLocation rl_src, RegLocation rl_shift, int flags) { // Per spec, we only care about low 6 bits of shift amount. int shift_amount = mir_graph_->ConstantValue(rl_shift) & 0x3f; if (shift_amount == 0) { @@ -2487,7 +2489,7 @@ void X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest } else if (shift_amount == 1 && (opcode == Instruction::SHL_LONG || opcode == Instruction::SHL_LONG_2ADDR)) { // Need to handle this here to avoid calling StoreValueWide twice. - GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src, rl_src); + GenArithOpLong(Instruction::ADD_LONG, rl_dest, rl_src, rl_src, flags); return; } if (PartiallyIntersects(rl_src, rl_dest)) { @@ -2495,12 +2497,13 @@ void X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest return; } rl_src = LoadValueWide(rl_src, kCoreReg); - RegLocation rl_result = GenShiftImmOpLong(opcode, rl_dest, rl_src, shift_amount); + RegLocation rl_result = GenShiftImmOpLong(opcode, rl_dest, rl_src, shift_amount, flags); StoreValueWide(rl_dest, rl_result); } void X86Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, - RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2) { + RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2, + int flags) { bool isConstSuccess = false; switch (opcode) { case Instruction::ADD_LONG: @@ -2519,7 +2522,7 @@ void X86Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, if (rl_src2.is_const) { isConstSuccess = GenLongLongImm(rl_dest, rl_src1, rl_src2, opcode); } else { - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); isConstSuccess = true; } break; @@ -2545,7 +2548,7 @@ void X86Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, if (!isConstSuccess) { // Default - bail to non-const handler. - GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2); + GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2, flags); } } @@ -2917,7 +2920,7 @@ void X86Mir2Lir::GenInstanceofFinal(bool use_declaring_class, uint32_t type_idx, } void X86Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, - RegLocation rl_lhs, RegLocation rl_rhs) { + RegLocation rl_lhs, RegLocation rl_rhs, int flags) { OpKind op = kOpBkpt; bool is_div_rem = false; bool unary = false; @@ -3022,7 +3025,7 @@ void X86Mir2Lir::GenArithOpInt(Instruction::Code opcode, RegLocation rl_dest, // Get the div/rem stuff out of the way. 
if (is_div_rem) { - rl_result = GenDivRem(rl_dest, rl_lhs, rl_rhs, op == kOpDiv, true); + rl_result = GenDivRem(rl_dest, rl_lhs, rl_rhs, op == kOpDiv, flags); StoreValue(rl_dest, rl_result); return; } diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 2ef4c218c3..79d5eebe17 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -2166,7 +2166,7 @@ void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) { NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg()); } - GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc); + GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc, mir->optimization_flags); } else if (opsize == kSignedByte || opsize == kUnsignedByte) { RegStorage rs_tmp = Get128BitRegister(AllocTempDouble()); NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg()); diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc index bdfab130c0..f6c7d52eb6 100644 --- a/compiler/dex/vreg_analysis.cc +++ b/compiler/dex/vreg_analysis.cc @@ -378,7 +378,20 @@ bool MIRGraph::InferTypeAndSize(BasicBlock* bb, MIR* mir, bool changed) { changed |= SetWide(defs[1]); changed |= SetHigh(defs[1]); } + + bool has_ins = (GetNumOfInVRs() > 0); + for (int i = 0; i < ssa_rep->num_uses; i++) { + if (has_ins && IsInVReg(uses[i])) { + // NB: The SSA name for the first def of an in-reg will be the same as + // the reg's actual name. + if (!reg_location_[uses[i]].fp && defined_fp) { + // If we were about to infer that this first def of an in-reg is a float + // when it wasn't previously (because float/int is set during SSA initialization), + // do not allow this to happen. + continue; + } + } changed |= SetFp(uses[i], defined_fp); changed |= SetCore(uses[i], defined_core); changed |= SetRef(uses[i], defined_ref); diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc index 35a3d4b3b2..235aba823c 100644 --- a/compiler/image_writer.cc +++ b/compiler/image_writer.cc @@ -206,7 +206,7 @@ void ImageWriter::SetImageOffset(mirror::Object* object, size_t offset) { break; default: LOG(FATAL) << "Unreachable."; - break; + UNREACHABLE(); } object->SetLockWord(LockWord::FromForwardingAddress(offset), false); DCHECK(IsImageOffsetAssigned(object)); diff --git a/compiler/jni/portable/jni_compiler.cc b/compiler/jni/portable/jni_compiler.cc index d2f54f8090..ff37d858ad 100644 --- a/compiler/jni/portable/jni_compiler.cc +++ b/compiler/jni/portable/jni_compiler.cc @@ -298,6 +298,7 @@ void JniCompiler::CreateFunction(const std::string& func_name) { case 'D': ret_type = irb_.getJDoubleTy(); break; case 'L': ret_type = irb_.getJObjectTy(); break; default: LOG(FATAL) << "Unreachable: unexpected return type in shorty " << shorty; + UNREACHABLE(); } // Get argument type std::vector< ::llvm::Type*> args_type; diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc index f0c0ed72bf..9545896f64 100644 --- a/compiler/jni/quick/arm/calling_convention_arm.cc +++ b/compiler/jni/quick/arm/calling_convention_arm.cc @@ -21,6 +21,22 @@ namespace art { namespace arm { +// Used by hard float. 
+static const Register kHFCoreArgumentRegisters[] = { + R0, R1, R2, R3 +}; + +static const SRegister kHFSArgumentRegisters[] = { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15 +}; + +static const DRegister kHFDArgumentRegisters[] = { + D0, D1, D2, D3, D4, D5, D6, D7 +}; + +COMPILE_ASSERT(arraysize(kHFDArgumentRegisters) * 2 == arraysize(kHFSArgumentRegisters), + ks_d_argument_registers_mismatch); + // Calling convention ManagedRegister ArmManagedRuntimeCallingConvention::InterproceduralScratchRegister() { @@ -31,26 +47,43 @@ ManagedRegister ArmJniCallingConvention::InterproceduralScratchRegister() { return ArmManagedRegister::FromCoreRegister(IP); // R12 } -static ManagedRegister ReturnRegisterForShorty(const char* shorty) { - if (shorty[0] == 'F') { - return ArmManagedRegister::FromCoreRegister(R0); - } else if (shorty[0] == 'D') { - return ArmManagedRegister::FromRegisterPair(R0_R1); - } else if (shorty[0] == 'J') { - return ArmManagedRegister::FromRegisterPair(R0_R1); - } else if (shorty[0] == 'V') { - return ArmManagedRegister::NoRegister(); +ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() { + if (kArm32QuickCodeUseSoftFloat) { + switch (GetShorty()[0]) { + case 'V': + return ArmManagedRegister::NoRegister(); + case 'D': + case 'J': + return ArmManagedRegister::FromRegisterPair(R0_R1); + default: + return ArmManagedRegister::FromCoreRegister(R0); + } } else { - return ArmManagedRegister::FromCoreRegister(R0); + switch (GetShorty()[0]) { + case 'V': + return ArmManagedRegister::NoRegister(); + case 'D': + return ArmManagedRegister::FromDRegister(D0); + case 'F': + return ArmManagedRegister::FromSRegister(S0); + case 'J': + return ArmManagedRegister::FromRegisterPair(R0_R1); + default: + return ArmManagedRegister::FromCoreRegister(R0); + } } } -ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() { - return ReturnRegisterForShorty(GetShorty()); -} - ManagedRegister ArmJniCallingConvention::ReturnRegister() { - return ReturnRegisterForShorty(GetShorty()); + switch (GetShorty()[0]) { + case 'V': + return ArmManagedRegister::NoRegister(); + case 'D': + case 'J': + return ArmManagedRegister::FromRegisterPair(R0_R1); + default: + return ArmManagedRegister::FromCoreRegister(R0); + } } ManagedRegister ArmJniCallingConvention::IntReturnRegister() { @@ -88,15 +121,68 @@ FrameOffset ArmManagedRuntimeCallingConvention::CurrentParamStackOffset() { const ManagedRegisterEntrySpills& ArmManagedRuntimeCallingConvention::EntrySpills() { // We spill the argument registers on ARM to free them up for scratch use, we then assume // all arguments are on the stack. - if (entry_spills_.size() == 0) { - size_t num_spills = NumArgs() + NumLongOrDoubleArgs(); - if (num_spills > 0) { - entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R1)); - if (num_spills > 1) { - entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R2)); - if (num_spills > 2) { - entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R3)); + if (kArm32QuickCodeUseSoftFloat) { + if (entry_spills_.size() == 0) { + size_t num_spills = NumArgs() + NumLongOrDoubleArgs(); + if (num_spills > 0) { + entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R1)); + if (num_spills > 1) { + entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R2)); + if (num_spills > 2) { + entry_spills_.push_back(ArmManagedRegister::FromCoreRegister(R3)); + } + } + } + } + } else { + if ((entry_spills_.size() == 0) && (NumArgs() > 0)) { + uint32_t gpr_index = 1; // R0 ~ R3. 
Reserve r0 for ArtMethod*. + uint32_t fpr_index = 0; // S0 ~ S15. + uint32_t fpr_double_index = 0; // D0 ~ D7. + + ResetIterator(FrameOffset(0)); + while (HasNext()) { + if (IsCurrentParamAFloatOrDouble()) { + if (IsCurrentParamADouble()) { // Double. + // Double should not overlap with float. + fpr_double_index = (std::max(fpr_double_index * 2, RoundUp(fpr_index, 2))) / 2; + if (fpr_double_index < arraysize(kHFDArgumentRegisters)) { + entry_spills_.push_back( + ArmManagedRegister::FromDRegister(kHFDArgumentRegisters[fpr_double_index++])); + } else { + entry_spills_.push_back(ManagedRegister::NoRegister(), 8); + } + } else { // Float. + // Float should not overlap with double. + if (fpr_index % 2 == 0) { + fpr_index = std::max(fpr_double_index * 2, fpr_index); + } + if (fpr_index < arraysize(kHFSArgumentRegisters)) { + entry_spills_.push_back( + ArmManagedRegister::FromSRegister(kHFSArgumentRegisters[fpr_index++])); + } else { + entry_spills_.push_back(ManagedRegister::NoRegister(), 4); + } + } + } else { + // FIXME: Pointer this returns as both reference and long. + if (IsCurrentParamALong() && !IsCurrentParamAReference()) { // Long. + if (gpr_index < arraysize(kHFCoreArgumentRegisters)) { + entry_spills_.push_back( + ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index++])); + } else { + entry_spills_.push_back(ManagedRegister::NoRegister(), 4); + } + } + // High part of long or 32-bit argument. + if (gpr_index < arraysize(kHFCoreArgumentRegisters)) { + entry_spills_.push_back( + ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index++])); + } else { + entry_spills_.push_back(ManagedRegister::NoRegister(), 4); + } } + Next(); } } } diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 80e9cdb16f..0555c00e33 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -233,23 +233,30 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite bool shouldOptimize = dex_compilation_unit.GetSymbol().find("00024reg_00024") != std::string::npos; + if (instruction_set == kThumb2 && !kArm32QuickCodeUseSoftFloat) { + uint32_t shorty_len; + const char* shorty = dex_compilation_unit.GetShorty(&shorty_len); + for (uint32_t i = 0; i < shorty_len; ++i) { + if (shorty[i] == 'D' || shorty[i] == 'F') { + CHECK(!shouldCompile) << "Hard float ARM32 parameters are not yet supported"; + return nullptr; + } + } + } + ArenaPool pool; ArenaAllocator arena(&pool); HGraphBuilder builder(&arena, &dex_compilation_unit, &dex_file, GetCompilerDriver()); HGraph* graph = builder.BuildGraph(*code_item); if (graph == nullptr) { - if (shouldCompile) { - LOG(FATAL) << "Could not build graph in optimizing compiler"; - } + CHECK(!shouldCompile) << "Could not build graph in optimizing compiler"; return nullptr; } CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, instruction_set); if (codegen == nullptr) { - if (shouldCompile) { - LOG(FATAL) << "Could not find code generator for optimizing compiler"; - } + CHECK(!shouldCompile) << "Could not find code generator for optimizing compiler"; return nullptr; } @@ -305,7 +312,7 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite stack_map); } else if (shouldOptimize && RegisterAllocator::Supports(instruction_set)) { LOG(FATAL) << "Could not allocate registers in optimizing compiler"; - return nullptr; + UNREACHABLE(); } else { unoptimized_compiled_methods_++; 
codegen->CompileBaseline(&allocator); diff --git a/compiler/utils/arena_object.h b/compiler/utils/arena_object.h index 50909f7532..8f6965edc5 100644 --- a/compiler/utils/arena_object.h +++ b/compiler/utils/arena_object.h @@ -31,6 +31,7 @@ class ArenaObject { void operator delete(void*, size_t) { LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } }; diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc index b430c7ee97..e9788f91ba 100644 --- a/compiler/utils/arm/assembler_arm.cc +++ b/compiler/utils/arm/assembler_arm.cc @@ -324,7 +324,7 @@ bool Address::CanHoldLoadOffsetArm(LoadOperandType type, int offset) { return IsAbsoluteUint(10, offset); // VFP addressing mode. default: LOG(FATAL) << "UNREACHABLE"; - return false; + UNREACHABLE(); } } @@ -342,7 +342,7 @@ bool Address::CanHoldStoreOffsetArm(StoreOperandType type, int offset) { return IsAbsoluteUint(10, offset); // VFP addressing mode. default: LOG(FATAL) << "UNREACHABLE"; - return false; + UNREACHABLE(); } } @@ -359,9 +359,9 @@ bool Address::CanHoldLoadOffsetThumb(LoadOperandType type, int offset) { return IsAbsoluteUint(10, offset); // VFP addressing mode. case kLoadWordPair: return IsAbsoluteUint(10, offset); - default: + default: LOG(FATAL) << "UNREACHABLE"; - return false; + UNREACHABLE(); } } @@ -377,9 +377,9 @@ bool Address::CanHoldStoreOffsetThumb(StoreOperandType type, int offset) { return IsAbsoluteUint(10, offset); // VFP addressing mode. case kStoreWordPair: return IsAbsoluteUint(10, offset); - default: + default: LOG(FATAL) << "UNREACHABLE"; - return false; + UNREACHABLE(); } } @@ -417,9 +417,23 @@ void ArmAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg, StoreToOffset(kStoreWord, R0, SP, 0); // Write out entry spills. + int32_t offset = frame_size + sizeof(StackReference<mirror::ArtMethod>); for (size_t i = 0; i < entry_spills.size(); ++i) { - Register reg = entry_spills.at(i).AsArm().AsCoreRegister(); - StoreToOffset(kStoreWord, reg, SP, frame_size + kFramePointerSize + (i * kFramePointerSize)); + ArmManagedRegister reg = entry_spills.at(i).AsArm(); + if (reg.IsNoRegister()) { + // only increment stack offset. 
+ ManagedRegisterSpill spill = entry_spills.at(i); + offset += spill.getSize(); + } else if (reg.IsCoreRegister()) { + StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset); + offset += 4; + } else if (reg.IsSRegister()) { + StoreSToOffset(reg.AsSRegister(), SP, offset); + offset += 4; + } else if (reg.IsDRegister()) { + StoreDToOffset(reg.AsDRegister(), SP, offset); + offset += 8; + } } } diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc index 6af69c86ce..3d46617079 100644 --- a/compiler/utils/arm/assembler_arm32.cc +++ b/compiler/utils/arm/assembler_arm32.cc @@ -1356,6 +1356,7 @@ void Arm32Assembler::LoadFromOffset(LoadOperandType type, break; default: LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } } @@ -1427,6 +1428,7 @@ void Arm32Assembler::StoreToOffset(StoreOperandType type, break; default: LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } } diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index 7968a7774e..37478c4ac1 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -2406,6 +2406,7 @@ void Thumb2Assembler::LoadFromOffset(LoadOperandType type, break; default: LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } } @@ -2477,6 +2478,7 @@ void Thumb2Assembler::StoreToOffset(StoreOperandType type, break; default: LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } } diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index f164138168..bf58b16ce7 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -1222,7 +1222,7 @@ void X86_64Assembler::imull(CpuRegister dst, CpuRegister src) { void X86_64Assembler::imull(CpuRegister reg, const Immediate& imm) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); - EmitOptionalRex32(reg); + EmitOptionalRex32(reg, reg); EmitUint8(0x69); EmitOperand(reg.LowBits(), Operand(reg)); EmitImmediate(imm); diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index 18c5cbcbc6..0e8ea5b3ee 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -186,6 +186,15 @@ TEST_F(AssemblerX86_64Test, Movw) { DriverStr(expected, "movw"); } +TEST_F(AssemblerX86_64Test, IMulImmediate) { + GetAssembler()->imull(x86_64::CpuRegister(x86_64::RAX), x86_64::Immediate(0x40000)); + GetAssembler()->imull(x86_64::CpuRegister(x86_64::R8), x86_64::Immediate(0x40000)); + const char* expected = + "imull $0x40000,%eax,%eax\n" + "imull $0x40000,%r8d,%r8d\n"; + DriverStr(expected, "imul"); +} + std::string setcc_test_fn(x86_64::X86_64Assembler* assembler) { // From Condition diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index e12559f010..b9e98f6cb0 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -508,7 +508,7 @@ DISASSEMBLER_ENTRY(cmp, case 0x5D: opcode << "min"; break; case 0x5E: opcode << "div"; break; case 0x5F: opcode << "max"; break; - default: LOG(FATAL) << "Unreachable"; + default: LOG(FATAL) << "Unreachable"; UNREACHABLE(); } if (prefix[2] == 0x66) { opcode << "pd"; @@ -1239,7 +1239,7 @@ DISASSEMBLER_ENTRY(cmp, case 0xF2: prefixed_opcode << "repne "; break; case 0xF3: prefixed_opcode << "repe "; break; case 0: break; - default: LOG(FATAL) << "Unreachable"; + default: LOG(FATAL) << "Unreachable"; UNREACHABLE(); } prefixed_opcode << opcode.str(); os << 
FormatInstructionPointer(begin_instr) diff --git a/runtime/Android.mk b/runtime/Android.mk index 0ef0fefd61..6f6dcbcb6c 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -222,6 +222,7 @@ LIBART_TARGET_SRC_FILES_arm := \ arch/arm/memcmp16_arm.S \ arch/arm/portable_entrypoints_arm.S \ arch/arm/quick_entrypoints_arm.S \ + arch/arm/quick_entrypoints_cc_arm.cc \ arch/arm/thread_arm.cc \ arch/arm/fault_handler_arm.cc diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h index 5388cc0daf..8cd2a27b9b 100644 --- a/runtime/arch/arm/asm_support_arm.h +++ b/runtime/arch/arm/asm_support_arm.h @@ -19,9 +19,9 @@ #include "asm_support.h" -#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 176 +#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 112 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32 -#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 48 +#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 112 // Flag for enabling R4 optimization in arm runtime #define ARM_R4_SUSPEND_FLAG diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc index 96ffc9310f..fd9c626228 100644 --- a/runtime/arch/arm/context_arm.cc +++ b/runtime/arch/arm/context_arm.cc @@ -97,6 +97,23 @@ void ArmContext::SmashCallerSaves() { gprs_[R1] = const_cast<uint32_t*>(&gZero); gprs_[R2] = nullptr; gprs_[R3] = nullptr; + + fprs_[S0] = nullptr; + fprs_[S1] = nullptr; + fprs_[S2] = nullptr; + fprs_[S3] = nullptr; + fprs_[S4] = nullptr; + fprs_[S5] = nullptr; + fprs_[S6] = nullptr; + fprs_[S7] = nullptr; + fprs_[S8] = nullptr; + fprs_[S9] = nullptr; + fprs_[S10] = nullptr; + fprs_[S11] = nullptr; + fprs_[S12] = nullptr; + fprs_[S13] = nullptr; + fprs_[S14] = nullptr; + fprs_[S15] = nullptr; } extern "C" void art_quick_do_long_jump(uint32_t*, uint32_t*); diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc index ff0eb4ae45..24e9b1d3e4 100644 --- a/runtime/arch/arm/entrypoints_init_arm.cc +++ b/runtime/arch/arm/entrypoints_init_arm.cc @@ -77,23 +77,17 @@ extern "C" void art_quick_handle_fill_data(void*, void*); extern "C" void art_quick_lock_object(void*); extern "C" void art_quick_unlock_object(void*); -// Math entrypoints. -extern int32_t CmpgDouble(double a, double b); -extern int32_t CmplDouble(double a, double b); -extern int32_t CmpgFloat(float a, float b); -extern int32_t CmplFloat(float a, float b); - -// Math conversions. -extern "C" int32_t __aeabi_f2iz(float op1); // FLOAT_TO_INT -extern "C" int32_t __aeabi_d2iz(double op1); // DOUBLE_TO_INT -extern "C" float __aeabi_l2f(int64_t op1); // LONG_TO_FLOAT -extern "C" double __aeabi_l2d(int64_t op1); // LONG_TO_DOUBLE - +// Used by soft float. // Single-precision FP arithmetics. -extern "C" float fmodf(float a, float b); // REM_FLOAT[_2ADDR] - +extern "C" float fmodf(float a, float b); // REM_FLOAT[_2ADDR] // Double-precision FP arithmetics. -extern "C" double fmod(double a, double b); // REM_DOUBLE[_2ADDR] +extern "C" double fmod(double a, double b); // REM_DOUBLE[_2ADDR] + +// Used by hard float. +extern "C" int64_t art_quick_f2l(float f); // FLOAT_TO_LONG +extern "C" int64_t art_quick_d2l(double d); // DOUBLE_TO_LONG +extern "C" float art_quick_fmodf(float a, float b); // REM_FLOAT[_2ADDR] +extern "C" double art_quick_fmod(double a, double b); // REM_DOUBLE[_2ADDR] // Integer arithmetics. 
extern "C" int __aeabi_idivmod(int32_t, int32_t); // [DIV|REM]_INT[_2ADDR|_LIT8|_LIT16] @@ -205,25 +199,24 @@ void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints, qpoints->pUnlockObject = art_quick_unlock_object; // Math - qpoints->pCmpgDouble = CmpgDouble; - qpoints->pCmpgFloat = CmpgFloat; - qpoints->pCmplDouble = CmplDouble; - qpoints->pCmplFloat = CmplFloat; - qpoints->pFmod = fmod; - qpoints->pL2d = __aeabi_l2d; - qpoints->pFmodf = fmodf; - qpoints->pL2f = __aeabi_l2f; - qpoints->pD2iz = __aeabi_d2iz; - qpoints->pF2iz = __aeabi_f2iz; qpoints->pIdivmod = __aeabi_idivmod; - qpoints->pD2l = art_d2l; - qpoints->pF2l = art_f2l; qpoints->pLdiv = __aeabi_ldivmod; qpoints->pLmod = __aeabi_ldivmod; // result returned in r2:r3 qpoints->pLmul = art_quick_mul_long; qpoints->pShlLong = art_quick_shl_long; qpoints->pShrLong = art_quick_shr_long; qpoints->pUshrLong = art_quick_ushr_long; + if (kArm32QuickCodeUseSoftFloat) { + qpoints->pFmod = fmod; + qpoints->pFmodf = fmodf; + qpoints->pD2l = art_d2l; + qpoints->pF2l = art_f2l; + } else { + qpoints->pFmod = art_quick_fmod; + qpoints->pFmodf = art_quick_fmodf; + qpoints->pD2l = art_quick_d2l; + qpoints->pF2l = art_quick_f2l; + } // Intrinsics qpoints->pIndexOf = art_quick_indexof; diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index aae0c94994..632b41435f 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -40,10 +40,10 @@ .cfi_rel_offset r10, 24 .cfi_rel_offset r11, 28 .cfi_rel_offset lr, 32 - vpush {s0-s31} @ 32 words (128 bytes) of floats. - .pad #128 - .cfi_adjust_cfa_offset 128 - sub sp, #12 @ 3 words of space, bottom word will hold Method*. + vpush {s16-s31} @ 16 words (64 bytes) of floats. + .pad #64 + .cfi_adjust_cfa_offset 64 + sub sp, #12 @ 3 words of space, bottom word will hold Method* .pad #12 .cfi_adjust_cfa_offset 12 RUNTIME_CURRENT1 \rTemp1, \rTemp2 @ Load Runtime::Current into rTemp1. @@ -53,7 +53,7 @@ str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame. // Ugly compile-time check, but we only have the preprocessor. -#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 128 + 12) +#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 64 + 12) #error "SAVE_ALL_CALLEE_SAVE_FRAME(ARM) size not as expected." #endif .endm @@ -101,15 +101,7 @@ .endm .macro RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN - add sp, #4 @ bottom word holds Method* - pop {r5-r8, r10-r11, lr} @ 7 words of callee saves - .cfi_restore r5 - .cfi_restore r6 - .cfi_restore r7 - .cfi_restore r8 - .cfi_restore r10 - .cfi_restore r11 - .cfi_adjust_cfa_offset -FRAME_SIZE_REFS_ONLY_CALLEE_SAVE + RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME bx lr @ return .endm @@ -117,9 +109,10 @@ * Macro that sets up the callee save frame to conform with * Runtime::CreateCalleeSaveMethod(kRefsAndArgs). */ -.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2 - push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves +.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY + push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves and args. .save {r1-r3, r5-r8, r10-r11, lr} + .cfi_adjust_cfa_offset 40 .cfi_rel_offset r1, 0 .cfi_rel_offset r2, 4 .cfi_rel_offset r3, 8 @@ -130,47 +123,39 @@ .cfi_rel_offset r10, 28 .cfi_rel_offset r11, 32 .cfi_rel_offset lr, 36 - .cfi_adjust_cfa_offset 40 + vpush {s0-s15} @ 16 words of float args. 
+ .pad #64 + .cfi_adjust_cfa_offset 64 sub sp, #8 @ 2 words of space, bottom word will hold Method* .pad #8 .cfi_adjust_cfa_offset 8 + // Ugly compile-time check, but we only have the preprocessor. +#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 64 + 8) +#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected." +#endif +.endm + +.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2 + SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY RUNTIME_CURRENT3 \rTemp1, \rTemp2 @ Load Runtime::Current into rTemp1. THIS_LOAD_REQUIRES_READ_BARRIER @ rTemp1 is kRefsAndArgs Method*. ldr \rTemp1, [\rTemp1, #RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET] str \rTemp1, [sp, #0] @ Place Method* at bottom of stack. str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame. - - // Ugly compile-time check, but we only have the preprocessor. -#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 8) -#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected." -#endif .endm .macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_R0 - push {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves - .save {r1-r3, r5-r8, r10-r11, lr} - .cfi_rel_offset r1, 0 - .cfi_rel_offset r2, 4 - .cfi_rel_offset r3, 8 - .cfi_rel_offset r5, 12 - .cfi_rel_offset r6, 16 - .cfi_rel_offset r7, 20 - .cfi_rel_offset r8, 24 - .cfi_rel_offset r10, 28 - .cfi_rel_offset r11, 32 - .cfi_rel_offset lr, 36 - .cfi_adjust_cfa_offset 40 - sub sp, #8 @ 2 words of space, bottom word will hold Method* - .pad #8 - .cfi_adjust_cfa_offset 8 - + SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY str r0, [sp, #0] @ Store ArtMethod* to bottom of stack. str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame. .endm .macro RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME add sp, #8 @ rewind sp + .cfi_adjust_cfa_offset -8 + vpop {s0-s15} + .cfi_adjust_cfa_offset -64 pop {r1-r3, r5-r8, r10-r11, lr} @ 10 words of callee saves .cfi_restore r1 .cfi_restore r2 @@ -181,7 +166,7 @@ .cfi_restore r8 .cfi_restore r10 .cfi_restore r11 - .cfi_adjust_cfa_offset -48 + .cfi_adjust_cfa_offset -40 .endm @@ -373,60 +358,91 @@ INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvoke INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck /* - * Quick invocation stub. + * Quick invocation stub internal. 
* On entry: * r0 = method pointer * r1 = argument array or NULL for no argument methods * r2 = size of argument array in bytes * r3 = (managed) thread pointer * [sp] = JValue* result - * [sp + 4] = shorty + * [sp + 4] = result_in_float + * [sp + 8] = core register argument array + * [sp + 12] = fp register argument array + * +-------------------------+ + * | uint32_t* fp_reg_args | + * | uint32_t* core_reg_args | + * | result_in_float | <- Caller frame + * | Jvalue* result | + * +-------------------------+ + * | lr | + * | r11 | + * | r9 | + * | r4 | <- r11 + * +-------------------------+ + * | uint32_t out[n-1] | + * | : : | Outs + * | uint32_t out[0] | + * | StackRef<ArtMethod> | <- SP value=null + * +-------------------------+ */ -ENTRY art_quick_invoke_stub - push {r0, r4, r5, r9, r11, lr} @ spill regs - .save {r0, r4, r5, r9, r11, lr} - .pad #24 - .cfi_adjust_cfa_offset 24 - .cfi_rel_offset r0, 0 - .cfi_rel_offset r4, 4 - .cfi_rel_offset r5, 8 - .cfi_rel_offset r9, 12 - .cfi_rel_offset r11, 16 - .cfi_rel_offset lr, 20 +ENTRY art_quick_invoke_stub_internal + push {r4, r9, r11, lr} @ spill regs + .save {r4, r9, r11, lr} + .pad #16 + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r9, 4 + .cfi_rel_offset r11, 8 + .cfi_rel_offset lr, 12 mov r11, sp @ save the stack pointer .cfi_def_cfa_register r11 + mov r9, r3 @ move managed thread pointer into r9 -#ifdef ARM_R4_SUSPEND_FLAG - mov r4, #SUSPEND_CHECK_INTERVAL @ reset r4 to suspend check interval -#endif - add r5, r2, #4 @ create space for method pointer in frame - sub r5, sp, r5 @ reserve & align *stack* to 16 bytes: native calling - and r5, #0xFFFFFFF0 @ convention only aligns to 8B, so we have to ensure ART - mov sp, r5 @ 16B alignment ourselves. + add r4, r2, #4 @ create space for method pointer in frame + sub r4, sp, r4 @ reserve & align *stack* to 16 bytes: native calling + and r4, #0xFFFFFFF0 @ convention only aligns to 8B, so we have to ensure ART + mov sp, r4 @ 16B alignment ourselves. 
+ mov r4, r0 @ save method* add r0, sp, #4 @ pass stack pointer + method ptr as dest for memcpy bl memcpy @ memcpy (dest, src, bytes) - ldr r0, [r11] @ restore method* - ldr r1, [sp, #4] @ copy arg value for r1 - ldr r2, [sp, #8] @ copy arg value for r2 - ldr r3, [sp, #12] @ copy arg value for r3 mov ip, #0 @ set ip to 0 str ip, [sp] @ store NULL for method* at bottom of frame + + ldr ip, [r11, #28] @ load fp register argument array pointer + vldm ip, {s0-s15} @ copy s0 - s15 + + ldr ip, [r11, #24] @ load core register argument array pointer + mov r0, r4 @ restore method* + add ip, ip, #4 @ skip r0 + ldm ip, {r1-r3} @ copy r1 - r3 + +#ifdef ARM_R4_SUSPEND_FLAG + mov r4, #SUSPEND_CHECK_INTERVAL @ reset r4 to suspend check interval +#endif + ldr ip, [r0, #MIRROR_ART_METHOD_QUICK_CODE_OFFSET] @ get pointer to the code blx ip @ call the method + mov sp, r11 @ restore the stack pointer - ldr ip, [sp, #24] @ load the result pointer - strd r0, [ip] @ store r0/r1 into result pointer - pop {r0, r4, r5, r9, r11, lr} @ restore spill regs - .cfi_restore r0 + .cfi_def_cfa_register sp + + ldr r4, [sp, #20] @ load result_is_float + ldr r9, [sp, #16] @ load the result pointer + cmp r4, #0 + ite eq + strdeq r0, [r9] @ store r0/r1 into result pointer + vstrne d0, [r9] @ store s0-s1/d0 into result pointer + + pop {r4, r9, r11, lr} @ restore spill regs .cfi_restore r4 - .cfi_restore r5 .cfi_restore r9 + .cfi_restore r11 .cfi_restore lr - .cfi_adjust_cfa_offset -24 + .cfi_adjust_cfa_offset -16 bx lr -END art_quick_invoke_stub +END art_quick_invoke_stub_internal /* * On entry r0 is uint32_t* gprs_ and r1 is uint32_t* fprs_ @@ -869,13 +885,14 @@ ENTRY art_quick_proxy_invoke_handler mov r3, sp @ pass SP blx artQuickProxyInvokeHandler @ (Method* proxy method, receiver, Thread*, SP) ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_ - add sp, #16 @ skip r1-r3, 4 bytes padding. - .cfi_adjust_cfa_offset -16 - cbnz r2, 1f @ success if no exception is pending + // Tear down the callee-save frame. Skip arg registers. + add sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE) + .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE) RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME + cbnz r2, 1f @ success if no exception is pending + vmov d0, r0, r1 @ store into fpr, for when it's a fpr return... bx lr @ return on success 1: - RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME DELIVER_PENDING_EXCEPTION END art_quick_proxy_invoke_handler @@ -977,20 +994,13 @@ ENTRY art_quick_generic_jni_trampoline ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_ cbnz r2, .Lexception_in_native - // Tear down the callee-save frame. - add sp, #12 @ rewind sp - // Do not pop r0 and r1, they contain the return value. - pop {r2-r3, r5-r8, r10-r11, lr} @ 9 words of callee saves - .cfi_restore r2 - .cfi_restore r3 - .cfi_restore r5 - .cfi_restore r6 - .cfi_restore r7 - .cfi_restore r8 - .cfi_restore r10 - .cfi_restore r11 - .cfi_adjust_cfa_offset -48 + // Tear down the callee-save frame. Skip arg registers. + add sp, #FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE + .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE) + RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME + // store into fpr, for when it's a fpr return... 
+ vmov d0, r0, r1 bx lr // ret .Lentry_error: @@ -1010,11 +1020,13 @@ ENTRY art_quick_to_interpreter_bridge mov r2, sp @ pass SP blx artQuickToInterpreterBridge @ (Method* method, Thread*, SP) ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_ - add sp, #16 @ skip r1-r3, 4 bytes padding. - .cfi_adjust_cfa_offset -16 + // Tear down the callee-save frame. Skip arg registers. + add sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE) + .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE) RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME cbnz r2, 1f @ success if no exception is pending - bx lr @ return on success + vmov d0, r0, r1 @ store into fpr, for when it's a fpr return... + bx lr @ return on success 1: DELIVER_PENDING_EXCEPTION END art_quick_to_interpreter_bridge @@ -1435,3 +1447,54 @@ ENTRY art_quick_string_compareto .Ldone: pop {r4, r7-r12, pc} END art_quick_string_compareto + + /* Assembly routines used to handle ABI differences. */ + + /* double fmod(double a, double b) */ + .extern fmod +ENTRY art_quick_fmod + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset lr, 0 + sub sp, #4 + .cfi_adjust_cfa_offset 4 + vmov r0, r1, d0 + vmov r2, r3, d1 + bl fmod + vmov d0, r0, r1 + add sp, #4 + .cfi_adjust_cfa_offset -4 + pop {pc} + .cfi_adjust_cfa_offset -4 +END art_quick_fmod + + /* float fmodf(float a, float b) */ + .extern fmodf +ENTRY art_quick_fmodf + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset lr, 0 + sub sp, #4 + .cfi_adjust_cfa_offset 4 + vmov r0, r1, d0 + bl fmodf + vmov s0, r0 + add sp, #4 + .cfi_adjust_cfa_offset -4 + pop {pc} + .cfi_adjust_cfa_offset -4 +END art_quick_fmod + + /* int64_t art_d2l(double d) */ + .extern art_d2l +ENTRY art_quick_d2l + vmov r0, r1, d0 + b art_d2l +END art_quick_d2l + + /* int64_t art_f2l(float f) */ + .extern art_f2l +ENTRY art_quick_f2l + vmov r0, s0 + b art_f2l +END art_quick_f2l diff --git a/runtime/arch/arm/quick_entrypoints_cc_arm.cc b/runtime/arch/arm/quick_entrypoints_cc_arm.cc new file mode 100644 index 0000000000..e21e6c1a2e --- /dev/null +++ b/runtime/arch/arm/quick_entrypoints_cc_arm.cc @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mirror/art_method.h" +#include "utils.h" // For RoundUp(). + +namespace art { + +// Assembly stub that does the final part of the up-call into Java. +extern "C" void art_quick_invoke_stub_internal(mirror::ArtMethod*, uint32_t*, uint32_t, + Thread* self, JValue* result, uint32_t, uint32_t*, + uint32_t*); + +template <bool kIsStatic> +static void quick_invoke_reg_setup(mirror::ArtMethod* method, uint32_t* args, uint32_t args_size, + Thread* self, JValue* result, const char* shorty) { + // Note: We do not follow aapcs ABI in quick code for both softfp and hardfp. + uint32_t core_reg_args[4]; // r0 ~ r3 + uint32_t fp_reg_args[16]; // s0 ~ s15 (d0 ~ d7) + uint32_t gpr_index = 1; // Index into core registers. 
Reserve r0 for mirror::ArtMethod*. + uint32_t fpr_index = 0; // Index into float registers. + uint32_t fpr_double_index = 0; // Index into float registers for doubles. + uint32_t arg_index = 0; // Index into argument array. + const uint32_t result_in_float = kArm32QuickCodeUseSoftFloat ? 0 : + (shorty[0] == 'F' || shorty[0] == 'D') ? 1 : 0; + + if (!kIsStatic) { + // Copy receiver for non-static methods. + core_reg_args[gpr_index++] = args[arg_index++]; + } + + for (uint32_t shorty_index = 1; shorty[shorty_index] != '\0'; ++shorty_index, ++arg_index) { + char arg_type = shorty[shorty_index]; + if (kArm32QuickCodeUseSoftFloat) { + arg_type = (arg_type == 'D') ? 'J' : arg_type; // Regard double as long. + arg_type = (arg_type == 'F') ? 'I' : arg_type; // Regard float as int. + } + switch (arg_type) { + case 'D': { + // Copy double argument into fp_reg_args if there are still floating point reg arguments. + // Double should not overlap with float. + fpr_double_index = std::max(fpr_double_index, RoundUp(fpr_index, 2)); + if (fpr_double_index < arraysize(fp_reg_args)) { + fp_reg_args[fpr_double_index++] = args[arg_index]; + fp_reg_args[fpr_double_index++] = args[arg_index + 1]; + } + ++arg_index; + break; + } + case 'F': + // Copy float argument into fp_reg_args if there are still floating point reg arguments. + // If fpr_index is odd then it's pointing at a hole next to an existing float argument. If we + // encounter a float argument then pick it up from that hole. In the case fpr_index is even, + // ensure that we don't pick up an argument that overlaps with a double from + // fpr_double_index. In either case, take care not to go beyond the maximum number of + // floating point arguments. + if (fpr_index % 2 == 0) { + fpr_index = std::max(fpr_double_index, fpr_index); + } + if (fpr_index < arraysize(fp_reg_args)) { + fp_reg_args[fpr_index++] = args[arg_index]; + } + break; + case 'J': + if (gpr_index < arraysize(core_reg_args)) { + core_reg_args[gpr_index++] = args[arg_index]; + } + ++arg_index; + FALLTHROUGH_INTENDED; // Fall-through to take care of the high part. + default: + if (gpr_index < arraysize(core_reg_args)) { + core_reg_args[gpr_index++] = args[arg_index]; + } + break; + } + } + + art_quick_invoke_stub_internal(method, args, args_size, self, result, result_in_float, + core_reg_args, fp_reg_args); +} + +// Called by art::mirror::ArtMethod::Invoke to do entry into a non-static method. +// TODO: migrate into an assembly implementation as with ARM64. +extern "C" void art_quick_invoke_stub(mirror::ArtMethod* method, uint32_t* args, uint32_t args_size, + Thread* self, JValue* result, const char* shorty) { + quick_invoke_reg_setup<false>(method, args, args_size, self, result, shorty); +} + +// Called by art::mirror::ArtMethod::Invoke to do entry into a static method. +// TODO: migrate into an assembly implementation as with ARM64.
+extern "C" void art_quick_invoke_static_stub(mirror::ArtMethod* method, uint32_t* args, + uint32_t args_size, Thread* self, JValue* result, + const char* shorty) { + quick_invoke_reg_setup<true>(method, args, args_size, self, result, shorty); +} + +} // namespace art diff --git a/runtime/arch/arm/quick_method_frame_info_arm.h b/runtime/arch/arm/quick_method_frame_info_arm.h index 7595e94e26..c1f3fc256d 100644 --- a/runtime/arch/arm/quick_method_frame_info_arm.h +++ b/runtime/arch/arm/quick_method_frame_info_arm.h @@ -25,6 +25,8 @@ namespace art { namespace arm { +static constexpr uint32_t kArmCalleeSaveAlwaysSpills = + (1 << art::arm::LR); static constexpr uint32_t kArmCalleeSaveRefSpills = (1 << art::arm::R5) | (1 << art::arm::R6) | (1 << art::arm::R7) | (1 << art::arm::R8) | (1 << art::arm::R10) | (1 << art::arm::R11); @@ -32,23 +34,30 @@ static constexpr uint32_t kArmCalleeSaveArgSpills = (1 << art::arm::R1) | (1 << art::arm::R2) | (1 << art::arm::R3); static constexpr uint32_t kArmCalleeSaveAllSpills = (1 << art::arm::R4) | (1 << art::arm::R9); -static constexpr uint32_t kArmCalleeSaveFpAllSpills = + +static constexpr uint32_t kArmCalleeSaveFpAlwaysSpills = 0; +static constexpr uint32_t kArmCalleeSaveFpRefSpills = 0; +static constexpr uint32_t kArmCalleeSaveFpArgSpills = (1 << art::arm::S0) | (1 << art::arm::S1) | (1 << art::arm::S2) | (1 << art::arm::S3) | (1 << art::arm::S4) | (1 << art::arm::S5) | (1 << art::arm::S6) | (1 << art::arm::S7) | (1 << art::arm::S8) | (1 << art::arm::S9) | (1 << art::arm::S10) | (1 << art::arm::S11) | - (1 << art::arm::S12) | (1 << art::arm::S13) | (1 << art::arm::S14) | (1 << art::arm::S15) | + (1 << art::arm::S12) | (1 << art::arm::S13) | (1 << art::arm::S14) | (1 << art::arm::S15); +static constexpr uint32_t kArmCalleeSaveFpAllSpills = (1 << art::arm::S16) | (1 << art::arm::S17) | (1 << art::arm::S18) | (1 << art::arm::S19) | (1 << art::arm::S20) | (1 << art::arm::S21) | (1 << art::arm::S22) | (1 << art::arm::S23) | (1 << art::arm::S24) | (1 << art::arm::S25) | (1 << art::arm::S26) | (1 << art::arm::S27) | (1 << art::arm::S28) | (1 << art::arm::S29) | (1 << art::arm::S30) | (1 << art::arm::S31); constexpr uint32_t ArmCalleeSaveCoreSpills(Runtime::CalleeSaveType type) { - return kArmCalleeSaveRefSpills | (type == Runtime::kRefsAndArgs ? kArmCalleeSaveArgSpills : 0) | - (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0) | (1 << art::arm::LR); + return kArmCalleeSaveAlwaysSpills | kArmCalleeSaveRefSpills | + (type == Runtime::kRefsAndArgs ? kArmCalleeSaveArgSpills : 0) | + (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0); } constexpr uint32_t ArmCalleeSaveFpSpills(Runtime::CalleeSaveType type) { - return type == Runtime::kSaveAll ? kArmCalleeSaveFpAllSpills : 0; + return kArmCalleeSaveFpAlwaysSpills | kArmCalleeSaveFpRefSpills | + (type == Runtime::kRefsAndArgs ? kArmCalleeSaveFpArgSpills: 0) | + (type == Runtime::kSaveAll ? 
kArmCalleeSaveFpAllSpills : 0); } constexpr uint32_t ArmCalleeSaveFrameSize(Runtime::CalleeSaveType type) { diff --git a/runtime/arch/arm64/quick_method_frame_info_arm64.h b/runtime/arch/arm64/quick_method_frame_info_arm64.h index 15c6c07592..0e1e32b955 100644 --- a/runtime/arch/arm64/quick_method_frame_info_arm64.h +++ b/runtime/arch/arm64/quick_method_frame_info_arm64.h @@ -54,7 +54,7 @@ static constexpr uint32_t kArm64CalleeSaveFpArgSpills = (1 << art::arm64::D0) | (1 << art::arm64::D1) | (1 << art::arm64::D2) | (1 << art::arm64::D3) | (1 << art::arm64::D4) | (1 << art::arm64::D5) | (1 << art::arm64::D6) | (1 << art::arm64::D7); -static constexpr uint32_t kArm64FpAllSpills = +static constexpr uint32_t kArm64CalleeSaveFpAllSpills = (1 << art::arm64::D8) | (1 << art::arm64::D9) | (1 << art::arm64::D10) | (1 << art::arm64::D11) | (1 << art::arm64::D12) | (1 << art::arm64::D13) | (1 << art::arm64::D14) | (1 << art::arm64::D15); @@ -68,7 +68,7 @@ constexpr uint32_t Arm64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) { constexpr uint32_t Arm64CalleeSaveFpSpills(Runtime::CalleeSaveType type) { return kArm64CalleeSaveFpAlwaysSpills | kArm64CalleeSaveFpRefSpills | (type == Runtime::kRefsAndArgs ? kArm64CalleeSaveFpArgSpills: 0) | - (type == Runtime::kSaveAll ? kArm64FpAllSpills : 0); + (type == Runtime::kSaveAll ? kArm64CalleeSaveFpAllSpills : 0); } constexpr uint32_t Arm64CalleeSaveFrameSize(Runtime::CalleeSaveType type) { diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S index 905b8676be..aff2da755c 100644 --- a/runtime/arch/mips/quick_entrypoints_mips.S +++ b/runtime/arch/mips/quick_entrypoints_mips.S @@ -65,9 +65,10 @@ .cfi_rel_offset 16, 20 # 1 word for alignment, 4 open words for args $a0-$a3, bottom will hold Method* - ld $t0, _ZN3art7Runtime9instance_E + lw $t0, %got(_ZN3art7Runtime9instance_E)($gp) + lw $t0, 0($t0) THIS_LOAD_REQUIRES_READ_BARRIER - ld $t0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t0) + lw $t0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t0) sw $t0, 0($sp) # Place Method* at bottom of stack. sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF) # Place sp in Thread::Current()->top_quick_frame. .endm @@ -107,9 +108,10 @@ .cfi_rel_offset 18, 28 # 3 words for alignment and extra args, 4 open words for args $a0-$a3, bottom will hold Method* - ld $t0, _ZN3art7Runtime9instance_E + lw $t0, %got(_ZN3art7Runtime9instance_E)($gp) + lw $t0, 0($t0) THIS_LOAD_REQUIRES_READ_BARRIER - ld $t0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t0) + lw $t0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t0) sw $t0, 0($sp) # Place Method* at bottom of stack. sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF) # Place sp in Thread::Current()->top_quick_frame. .endm @@ -201,9 +203,10 @@ .cfi_rel_offset 5, 4 # bottom will hold Method* - ld $t0, _ZN3art7Runtime9instance_E + lw $t0, %got(_ZN3art7Runtime9instance_E)($gp) + lw $t0, 0($t0) THIS_LOAD_REQUIRES_READ_BARRIER - ld $t0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t0) + lw $t0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t0) sw $t0, 0($sp) # Place Method* at bottom of stack. sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF) # Place sp in Thread::Current()->top_quick_frame. 
.endm diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index f6717fb006..854effd772 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -5284,6 +5284,7 @@ mirror::ArtMethod* ClassLinker::ResolveMethod(const DexFile& dex_file, uint32_t break; default: LOG(FATAL) << "Unreachable - invocation type: " << type; + UNREACHABLE(); } if (resolved == nullptr) { // Search by name, which works across dex files. diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc index 78cbe58416..761441eae6 100644 --- a/runtime/dex_file.cc +++ b/runtime/dex_file.cc @@ -1182,13 +1182,14 @@ void EncodedStaticFieldValueIterator::Next() { case kArray: case kAnnotation: UNIMPLEMENTED(FATAL) << ": type " << type_; - break; + UNREACHABLE(); case kNull: jval_.l = NULL; width = 0; break; default: LOG(FATAL) << "Unreached"; + UNREACHABLE(); } ptr_ += width; } diff --git a/runtime/dex_instruction.cc b/runtime/dex_instruction.cc index af38433abc..a802759474 100644 --- a/runtime/dex_instruction.cc +++ b/runtime/dex_instruction.cc @@ -113,7 +113,7 @@ size_t Instruction::SizeInCodeUnitsComplexOpcode() const { return 1; // NOP. } else { LOG(FATAL) << "Unreachable: " << DumpString(nullptr); - return 0; + UNREACHABLE(); } } } diff --git a/runtime/dwarf.h b/runtime/dwarf.h index 370ad95732..7daa5f1485 100644 --- a/runtime/dwarf.h +++ b/runtime/dwarf.h @@ -364,38 +364,38 @@ enum Operation : uint16_t { DW_OP_reg29 = 0x6d, DW_OP_reg30 = 0x6e, DW_OP_reg31 = 0x6f, - DW_OP_breg0 = 0x50, - DW_OP_breg1 = 0x51, - DW_OP_breg2 = 0x52, - DW_OP_breg3 = 0x53, - DW_OP_breg4 = 0x54, - DW_OP_breg5 = 0x55, - DW_OP_breg6 = 0x56, - DW_OP_breg7 = 0x57, - DW_OP_breg8 = 0x58, - DW_OP_breg9 = 0x59, - DW_OP_breg10 = 0x5a, - DW_OP_breg11 = 0x5b, - DW_OP_breg12 = 0x5c, - DW_OP_breg13 = 0x5d, - DW_OP_breg14 = 0x5e, - DW_OP_breg15 = 0x5f, - DW_OP_breg16 = 0x60, - DW_OP_breg17 = 0x61, - DW_OP_breg18 = 0x62, - DW_OP_breg19 = 0x63, - DW_OP_breg20 = 0x64, - DW_OP_breg21 = 0x65, - DW_OP_breg22 = 0x66, - DW_OP_breg23 = 0x67, - DW_OP_breg24 = 0x68, - DW_OP_breg25 = 0x69, - DW_OP_breg26 = 0x6a, - DW_OP_breg27 = 0x6b, - DW_OP_breg28 = 0x6c, - DW_OP_breg29 = 0x6d, - DW_OP_breg30 = 0x6e, - DW_OP_breg31 = 0x6f, + DW_OP_breg0 = 0x70, + DW_OP_breg1 = 0x71, + DW_OP_breg2 = 0x72, + DW_OP_breg3 = 0x73, + DW_OP_breg4 = 0x74, + DW_OP_breg5 = 0x75, + DW_OP_breg6 = 0x76, + DW_OP_breg7 = 0x77, + DW_OP_breg8 = 0x78, + DW_OP_breg9 = 0x79, + DW_OP_breg10 = 0x7a, + DW_OP_breg11 = 0x7b, + DW_OP_breg12 = 0x7c, + DW_OP_breg13 = 0x7d, + DW_OP_breg14 = 0x7e, + DW_OP_breg15 = 0x7f, + DW_OP_breg16 = 0x80, + DW_OP_breg17 = 0x81, + DW_OP_breg18 = 0x82, + DW_OP_breg19 = 0x83, + DW_OP_breg20 = 0x84, + DW_OP_breg21 = 0x85, + DW_OP_breg22 = 0x86, + DW_OP_breg23 = 0x87, + DW_OP_breg24 = 0x88, + DW_OP_breg25 = 0x89, + DW_OP_breg26 = 0x8a, + DW_OP_breg27 = 0x8b, + DW_OP_breg28 = 0x8c, + DW_OP_breg29 = 0x8d, + DW_OP_breg30 = 0x8e, + DW_OP_breg31 = 0x8f, DW_OP_regx = 0x90, DW_OP_fbreg = 0x91, DW_OP_bregx = 0x92, diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h index 9fb9a3ba56..ccc5d835c9 100644 --- a/runtime/entrypoints/entrypoint_utils-inl.h +++ b/runtime/entrypoints/entrypoint_utils-inl.h @@ -499,11 +499,8 @@ static inline mirror::ArtField* FindFieldFast(uint32_t field_idx, case StaticPrimitiveRead: is_primitive = true; is_set = false; is_static = true; break; case StaticPrimitiveWrite: is_primitive = true; is_set = true; is_static = true; break; default: - LOG(FATAL) << "UNREACHABLE"; // Assignment below to avoid GCC 
warnings. - is_primitive = true; - is_set = true; - is_static = true; - break; + LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } if (UNLIKELY(resolved_field->IsStatic() != is_static)) { // Incompatible class change. diff --git a/runtime/entrypoints/portable/portable_trampoline_entrypoints.cc b/runtime/entrypoints/portable/portable_trampoline_entrypoints.cc index c3664bfac6..61d66ba461 100644 --- a/runtime/entrypoints/portable/portable_trampoline_entrypoints.cc +++ b/runtime/entrypoints/portable/portable_trampoline_entrypoints.cc @@ -172,7 +172,7 @@ class BuildPortableShadowFrameVisitor : public PortableArgumentVisitor { break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; - break; + UNREACHABLE(); } ++cur_reg_; } @@ -261,8 +261,7 @@ class BuildPortableArgumentVisitor : public PortableArgumentVisitor { break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; - val.j = 0; - break; + UNREACHABLE(); } args_.push_back(val); } diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc index a2869ecc45..7dbfdd5679 100644 --- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc @@ -248,7 +248,8 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { } #endif default: { - LOG(FATAL) << "Unimplemented"; + UNIMPLEMENTED(FATAL); + UNREACHABLE(); } } } diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index af341bb1fa..e0aab75ddf 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -50,15 +50,19 @@ class QuickArgumentVisitor { // | arg1 spill | | // | Method* | --- // | LR | - // | ... | callee saves - // | R3 | arg3 - // | R2 | arg2 - // | R1 | arg1 - // | R0 | padding + // | ... | 4x6 bytes callee saves + // | R3 | + // | R2 | + // | R1 | + // | S15 | + // | : | + // | S0 | + // | | 4x2 bytes padding // | Method* | <- sp - static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI. - static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs. - static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs. + static constexpr bool kQuickSoftFloatAbi = kArm32QuickCodeUseSoftFloat; + static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = !kArm32QuickCodeUseSoftFloat; + static constexpr size_t kNumQuickGprArgs = 3; + static constexpr size_t kNumQuickFprArgs = kArm32QuickCodeUseSoftFloat ? 0 : 16; static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = arm::ArmCalleeSaveFpr1Offset(Runtime::kRefsAndArgs); // Offset of first FPR arg. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = @@ -90,6 +94,7 @@ class QuickArgumentVisitor { // | | padding // | Method* | <- sp static constexpr bool kQuickSoftFloatAbi = false; // This is a hard float ABI. + static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false; static constexpr size_t kNumQuickGprArgs = 7; // 7 arguments passed in GPRs. static constexpr size_t kNumQuickFprArgs = 8; // 8 arguments passed in FPRs. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = @@ -117,6 +122,7 @@ class QuickArgumentVisitor { // | A1 | arg1 // | A0/Method* | <- sp static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI. 
+ static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false; static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs. static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0; // Offset of first FPR arg. @@ -141,6 +147,7 @@ class QuickArgumentVisitor { // | ECX | arg1 // | EAX/Method* | <- sp static constexpr bool kQuickSoftFloatAbi = true; // This is a soft float ABI. + static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false; static constexpr size_t kNumQuickGprArgs = 3; // 3 arguments passed in GPRs. static constexpr size_t kNumQuickFprArgs = 0; // 0 arguments passed in FPRs. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0; // Offset of first FPR arg. @@ -178,6 +185,7 @@ class QuickArgumentVisitor { // | Padding | // | RDI/Method* | <- sp static constexpr bool kQuickSoftFloatAbi = false; // This is a hard float ABI. + static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false; static constexpr size_t kNumQuickGprArgs = 5; // 5 arguments passed in GPRs. static constexpr size_t kNumQuickFprArgs = 8; // 8 arguments passed in FPRs. static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16; // Offset of first FPR arg. @@ -222,8 +230,16 @@ class QuickArgumentVisitor { fpr_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset), stack_args_(reinterpret_cast<uint8_t*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize + StackArgumentStartFromShorty(is_static, shorty, shorty_len)), - gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid), - is_split_long_or_double_(false) {} + gpr_index_(0), fpr_index_(0), fpr_double_index_(0), stack_index_(0), + cur_type_(Primitive::kPrimVoid), is_split_long_or_double_(false) { + COMPILE_ASSERT(kQuickSoftFloatAbi == (kNumQuickFprArgs == 0), knum_of_quick_fpr_arg_unexpected); + COMPILE_ASSERT(!(kQuickSoftFloatAbi && kQuickDoubleRegAlignedFloatBackFilled), + kdouble_align_unexpected); + // For register alignment, we want to assume that counters(fpr_double_index_) are even if the + // next register is even. + COMPILE_ASSERT(!kQuickDoubleRegAlignedFloatBackFilled || kNumQuickFprArgs % 2 == 0, + knum_quick_fpr_args_not_even); + } virtual ~QuickArgumentVisitor() {} @@ -237,7 +253,11 @@ class QuickArgumentVisitor { if (!kQuickSoftFloatAbi) { Primitive::Type type = GetParamPrimitiveType(); if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) { - if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) { + if (type == Primitive::kPrimDouble && kQuickDoubleRegAlignedFloatBackFilled) { + if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) { + return fpr_args_ + (fpr_double_index_ * GetBytesPerFprSpillLocation(kRuntimeISA)); + } + } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) { return fpr_args_ + (fpr_index_ * GetBytesPerFprSpillLocation(kRuntimeISA)); } return stack_args_ + (stack_index_ * kBytesStackArgLocation); @@ -268,28 +288,30 @@ class QuickArgumentVisitor { uint64_t ReadSplitLongParam() const { DCHECK(IsSplitLongOrDouble()); + // Read low half from register. uint64_t low_half = *reinterpret_cast<uint32_t*>(GetParamAddress()); - uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_); + // Read high half from the stack. As current stack_index_ indexes the argument, the high part + // index should be (stack_index_ + 1). 
+ uint64_t high_half = *reinterpret_cast<uint32_t*>(stack_args_ + + (stack_index_ + 1) * kBytesStackArgLocation); return (low_half & 0xffffffffULL) | (high_half << 32); } void VisitArguments() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - // This implementation doesn't support reg-spill area for hard float - // ABI targets such as x86_64 and aarch64. So, for those targets whose - // 'kQuickSoftFloatAbi' is 'false': - // (a) 'stack_args_' should point to the first method's argument - // (b) whatever the argument type it is, the 'stack_index_' should - // be moved forward along with every visiting. + // (a) 'stack_args_' should point to the first method's argument + // (b) whatever the argument type it is, the 'stack_index_' should + // be moved forward along with every visiting. gpr_index_ = 0; fpr_index_ = 0; + if (kQuickDoubleRegAlignedFloatBackFilled) { + fpr_double_index_ = 0; + } stack_index_ = 0; if (!is_static_) { // Handle this. cur_type_ = Primitive::kPrimNot; is_split_long_or_double_ = false; Visit(); - if (!kQuickSoftFloatAbi || kNumQuickGprArgs == 0) { - stack_index_++; - } + stack_index_++; if (kNumQuickGprArgs > 0) { gpr_index_++; } @@ -305,9 +327,7 @@ class QuickArgumentVisitor { case Primitive::kPrimInt: is_split_long_or_double_ = false; Visit(); - if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) { - stack_index_++; - } + stack_index_++; if (gpr_index_ < kNumQuickGprArgs) { gpr_index_++; } @@ -315,17 +335,24 @@ class QuickArgumentVisitor { case Primitive::kPrimFloat: is_split_long_or_double_ = false; Visit(); + stack_index_++; if (kQuickSoftFloatAbi) { if (gpr_index_ < kNumQuickGprArgs) { gpr_index_++; - } else { - stack_index_++; } } else { - if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) { + if (fpr_index_ + 1 < kNumQuickFprArgs + 1) { fpr_index_++; + if (kQuickDoubleRegAlignedFloatBackFilled) { + // Double should not overlap with float. + // For example, if fpr_index_ = 3, fpr_double_index_ should be at least 4. + fpr_double_index_ = std::max(fpr_double_index_, RoundUp(fpr_index_, 2)); + // Float should not overlap with double. 
+ if (fpr_index_ % 2 == 0) { + fpr_index_ = std::max(fpr_double_index_, fpr_index_); + } + } } - stack_index_++; } break; case Primitive::kPrimDouble: @@ -334,42 +361,46 @@ class QuickArgumentVisitor { is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) && ((gpr_index_ + 1) == kNumQuickGprArgs); Visit(); - if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) { - if (kBytesStackArgLocation == 4) { - stack_index_+= 2; - } else { - CHECK_EQ(kBytesStackArgLocation, 8U); - stack_index_++; - } + if (kBytesStackArgLocation == 4) { + stack_index_+= 2; + } else { + CHECK_EQ(kBytesStackArgLocation, 8U); + stack_index_++; } if (gpr_index_ < kNumQuickGprArgs) { gpr_index_++; if (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) { if (gpr_index_ < kNumQuickGprArgs) { gpr_index_++; - } else if (kQuickSoftFloatAbi) { - stack_index_++; } } } } else { is_split_long_or_double_ = (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) && - ((fpr_index_ + 1) == kNumQuickFprArgs); + ((fpr_index_ + 1) == kNumQuickFprArgs) && !kQuickDoubleRegAlignedFloatBackFilled; Visit(); - if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) { - fpr_index_++; - if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) { - if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) { - fpr_index_++; - } - } - } if (kBytesStackArgLocation == 4) { stack_index_+= 2; } else { CHECK_EQ(kBytesStackArgLocation, 8U); stack_index_++; } + if (kQuickDoubleRegAlignedFloatBackFilled) { + if (fpr_double_index_ + 2 < kNumQuickFprArgs + 1) { + fpr_double_index_ += 2; + // Float should not overlap with double. + if (fpr_index_ % 2 == 0) { + fpr_index_ = std::max(fpr_double_index_, fpr_index_); + } + } + } else if (fpr_index_ + 1 < kNumQuickFprArgs + 1) { + fpr_index_++; + if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) { + if (fpr_index_ + 1 < kNumQuickFprArgs + 1) { + fpr_index_++; + } + } + } } break; default: @@ -381,16 +412,8 @@ class QuickArgumentVisitor { private: static size_t StackArgumentStartFromShorty(bool is_static, const char* shorty, uint32_t shorty_len) { - if (kQuickSoftFloatAbi) { - CHECK_EQ(kNumQuickFprArgs, 0U); - return (kNumQuickGprArgs * GetBytesPerGprSpillLocation(kRuntimeISA)) - + sizeof(StackReference<mirror::ArtMethod>) /* StackReference<ArtMethod> */; - } else { - // For now, there is no reg-spill area for the targets with - // hard float ABI. So, the offset pointing to the first method's - // parameter ('this' for non-static methods) should be returned. - return sizeof(StackReference<mirror::ArtMethod>); // Skip StackReference<ArtMethod>. - } + // 'stack_args_' points to the first method's argument + return sizeof(StackReference<mirror::ArtMethod>); // Skip StackReference<ArtMethod>. } protected: @@ -403,7 +426,14 @@ class QuickArgumentVisitor { uint8_t* const fpr_args_; // Address of FPR arguments in callee save frame. uint8_t* const stack_args_; // Address of stack arguments in caller's frame. uint32_t gpr_index_; // Index into spilled GPRs. - uint32_t fpr_index_; // Index into spilled FPRs. + // Index into spilled FPRs. + // In case kQuickDoubleRegAlignedFloatBackFilled, it may index a hole while fpr_double_index_ + // holds a higher register number. + uint32_t fpr_index_; + // Index into spilled FPRs for aligned double. + // Only used when kQuickDoubleRegAlignedFloatBackFilled. Next available double register indexed in + // terms of singles, may be behind fpr_index. + uint32_t fpr_double_index_; uint32_t stack_index_; // Index into arguments on the stack. 
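The float/double bookkeeping above follows the AAPCS VFP idea: doubles occupy even-aligned register pairs, and later floats back-fill the single-precision holes that alignment leaves behind. A minimal standalone sketch of that slot assignment, with hypothetical helper names (AssignVfpSlot, RoundUpTo) that are illustrative only and not part of ART:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for ART's RoundUp(): round x up to a multiple of n (n a power of two).
    static uint32_t RoundUpTo(uint32_t x, uint32_t n) { return (x + n - 1) & ~(n - 1); }

    // Assign a VFP argument slot (s0-s15) the way the patch does: doubles take the next
    // even-aligned pair, floats take the next single slot, back-filling holes left by alignment.
    // Returns the single-precision slot index (first of the pair for a double), or -1 if the
    // argument would spill to the stack.
    static int AssignVfpSlot(char type, uint32_t* fpr_index, uint32_t* fpr_double_index) {
      constexpr uint32_t kNumSingles = 16;  // s0-s15, as in fp_reg_args[16].
      if (type == 'D') {
        // Doubles must not overlap floats already placed: start at an even slot >= fpr_index.
        *fpr_double_index = std::max(*fpr_double_index, RoundUpTo(*fpr_index, 2));
        if (*fpr_double_index + 1 < kNumSingles) {
          int slot = static_cast<int>(*fpr_double_index);
          *fpr_double_index += 2;
          return slot;
        }
        return -1;
      }
      // Float: an odd fpr_index points at a back-fill hole; an even one must not run into doubles.
      if (*fpr_index % 2 == 0) {
        *fpr_index = std::max(*fpr_double_index, *fpr_index);
      }
      if (*fpr_index < kNumSingles) {
        return static_cast<int>((*fpr_index)++);
      }
      return -1;
    }

    int main() {
      // Worked example for the argument type sequence F D F F:
      //   F -> s0, D -> d1 (s2/s3), F -> s1 (back-filled hole), F -> s4.
      uint32_t fpr_index = 0, fpr_double_index = 0;
      const char args[] = {'F', 'D', 'F', 'F'};
      for (char c : args) {
        printf("%c -> s%d\n", c, AssignVfpSlot(c, &fpr_index, &fpr_double_index));
      }
      return 0;
    }

Running the sketch prints s0, s2, s1, s4 for the sequence F D F F, i.e. the third argument lands in the hole at s1 behind the double in d1, which is the behavior the fpr_index_/fpr_double_index_ pair is tracking.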
// The current type of argument during VisitArguments. Primitive::Type cur_type_; @@ -456,7 +486,7 @@ void BuildQuickShadowFrameVisitor::Visit() { break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; - break; + UNREACHABLE(); } ++cur_reg_; } @@ -564,8 +594,7 @@ void BuildQuickArgumentVisitor::Visit() { break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; - val.j = 0; - break; + UNREACHABLE(); } args_->push_back(val); } @@ -943,8 +972,8 @@ template<class T> class BuildNativeCallFrameStateMachine { delegate_(delegate) { // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff // the next register is even; counting down is just to make the compiler happy... - CHECK_EQ(kNumNativeGprArgs % 2, 0U); - CHECK_EQ(kNumNativeFprArgs % 2, 0U); + COMPILE_ASSERT(kNumNativeGprArgs % 2 == 0U, knum_native_gpr_args_not_even); + COMPILE_ASSERT(kNumNativeFprArgs % 2 == 0U, knum_native_fpr_args_not_even); } virtual ~BuildNativeCallFrameStateMachine() {} @@ -1557,7 +1586,7 @@ void BuildGenericJniFrameVisitor::Visit() { break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; - break; + UNREACHABLE(); } } diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index 83da0639ea..ad3bb11d80 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -813,6 +813,7 @@ void MarkSweep::ScanGrayObjects(bool paused, uint8_t minimum_age) { break; default: LOG(FATAL) << "Unreachable"; + UNREACHABLE(); } TimingLogger::ScopedTiming t(name, GetTimings()); ScanObjectVisitor visitor(this); diff --git a/runtime/gc/gc_cause.cc b/runtime/gc/gc_cause.cc index f0e1512adf..6be683df48 100644 --- a/runtime/gc/gc_cause.cc +++ b/runtime/gc/gc_cause.cc @@ -35,8 +35,8 @@ const char* PrettyCause(GcCause cause) { case kGcCauseTrim: return "HeapTrim"; default: LOG(FATAL) << "Unreachable"; + UNREACHABLE(); } - return ""; } std::ostream& operator<<(std::ostream& os, const GcCause& gc_cause) { diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index bceac4403d..c0008aac84 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -1325,8 +1325,9 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocat // Throw OOM by default. break; default: { - LOG(FATAL) << "Unimplemented homogeneous space compaction result " - << static_cast<size_t>(result); + UNIMPLEMENTED(FATAL) << "homogeneous space compaction result: " + << static_cast<size_t>(result); + UNREACHABLE(); } } // Always print that we ran homogeneous space compation since this can cause jank. 
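A recurring change throughout these hunks is replacing a dead return value or dummy assignment after LOG(FATAL)/UNIMPLEMENTED(FATAL) with UNREACHABLE(). A minimal before/after sketch of the pattern, loosely mirroring the gc_cause.cc hunk above; the SKETCH_* macros are stand-ins assumed to behave like ART's logging and UNREACHABLE() macros (roughly an abort plus a no-return compiler hint such as __builtin_unreachable()), not the real definitions:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical stand-ins, for illustration only.
    #define SKETCH_LOG_FATAL(msg) do { fprintf(stderr, "FATAL: %s\n", msg); abort(); } while (0)
    #define SKETCH_UNREACHABLE() __builtin_unreachable()

    enum class GcCause { kForAlloc, kBackground, kExplicit };

    const char* PrettyCause(GcCause cause) {
      switch (cause) {
        case GcCause::kForAlloc:   return "Alloc";
        case GcCause::kBackground: return "Background";
        case GcCause::kExplicit:   return "Explicit";
        default:
          SKETCH_LOG_FATAL("unexpected GcCause");
          // Before: a dummy `return "";` followed, only to keep -Wreturn-type quiet even though
          // the fatal log never returns. After: the unreachable hint documents the intent and
          // lets the dead return be deleted.
          SKETCH_UNREACHABLE();
      }
    }

    int main() {
      printf("%s\n", PrettyCause(GcCause::kExplicit));
      return 0;
    }

With every path either returning or marked unreachable, the compiler no longer needs the placeholder return, and the intent of the fatal branch is explicit at the call site.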
@@ -1761,7 +1762,8 @@ void Heap::ChangeCollector(CollectorType collector_type) { break; } default: { - LOG(FATAL) << "Unimplemented"; + UNIMPLEMENTED(FATAL); + UNREACHABLE(); } } if (IsGcConcurrent()) { diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc index 8f42642b17..0a55b52c08 100644 --- a/runtime/gc/space/bump_pointer_space.cc +++ b/runtime/gc/space/bump_pointer_space.cc @@ -201,8 +201,8 @@ void BumpPointerSpace::Walk(ObjectCallback* callback, void* arg) { } accounting::ContinuousSpaceBitmap::SweepCallback* BumpPointerSpace::GetSweepCallback() { - LOG(FATAL) << "Unimplemented"; - return nullptr; + UNIMPLEMENTED(FATAL); + UNREACHABLE(); } uint64_t BumpPointerSpace::GetBytesAllocated() { diff --git a/runtime/gc/space/space.cc b/runtime/gc/space/space.cc index bff28f6d19..b233805e4c 100644 --- a/runtime/gc/space/space.cc +++ b/runtime/gc/space/space.cc @@ -39,33 +39,33 @@ std::ostream& operator<<(std::ostream& os, const Space& space) { } DlMallocSpace* Space::AsDlMallocSpace() { - LOG(FATAL) << "Unreachable"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } RosAllocSpace* Space::AsRosAllocSpace() { - LOG(FATAL) << "Unreachable"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } ZygoteSpace* Space::AsZygoteSpace() { - LOG(FATAL) << "Unreachable"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } BumpPointerSpace* Space::AsBumpPointerSpace() { - LOG(FATAL) << "Unreachable"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } AllocSpace* Space::AsAllocSpace() { - LOG(FATAL) << "Unimplemented"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } ContinuousMemMapAllocSpace* Space::AsContinuousMemMapAllocSpace() { - LOG(FATAL) << "Unimplemented"; - return nullptr; + UNIMPLEMENTED(FATAL) << "Unreachable"; + UNREACHABLE(); } DiscontinuousSpace::DiscontinuousSpace(const std::string& name, diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc index 51d84f5acb..9de0548561 100644 --- a/runtime/gc/space/zygote_space.cc +++ b/runtime/gc/space/zygote_space.cc @@ -58,7 +58,8 @@ ZygoteSpace* ZygoteSpace::Create(const std::string& name, MemMap* mem_map, } void ZygoteSpace::Clear() { - LOG(FATAL) << "Unimplemented"; + UNIMPLEMENTED(FATAL); + UNREACHABLE(); } ZygoteSpace::ZygoteSpace(const std::string& name, MemMap* mem_map, size_t objects_allocated) diff --git a/runtime/globals.h b/runtime/globals.h index b7bd44d7c9..4d33196c98 100644 --- a/runtime/globals.h +++ b/runtime/globals.h @@ -112,6 +112,8 @@ static constexpr TraceClockSource kDefaultTraceClockSource = kTraceClockSourceWa static constexpr bool kDefaultMustRelocate = true; +static constexpr bool kArm32QuickCodeUseSoftFloat = false; + } // namespace art #endif // ART_RUNTIME_GLOBALS_H_ diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc index dfb03cdeb8..9de12f2401 100644 --- a/runtime/interpreter/interpreter.cc +++ b/runtime/interpreter/interpreter.cc @@ -325,7 +325,7 @@ template<bool do_access_check, bool transaction_active> JValue ExecuteGotoImpl(Thread* self, MethodHelper& mh, const DexFile::CodeItem* code_item, ShadowFrame& shadow_frame, JValue result_register) { LOG(FATAL) << "UNREACHABLE"; - exit(0); + UNREACHABLE(); } // Explicit definitions of ExecuteGotoImpl. 
template<> SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) diff --git a/runtime/interpreter/interpreter_common.cc b/runtime/interpreter/interpreter_common.cc index 3ccdd03136..c887a8877f 100644 --- a/runtime/interpreter/interpreter_common.cc +++ b/runtime/interpreter/interpreter_common.cc @@ -80,6 +80,7 @@ bool DoFieldGet(Thread* self, ShadowFrame& shadow_frame, const Instruction* inst break; default: LOG(FATAL) << "Unreachable: " << field_type; + UNREACHABLE(); } return true; } @@ -153,6 +154,7 @@ bool DoIGetQuick(ShadowFrame& shadow_frame, const Instruction* inst, uint16_t in break; default: LOG(FATAL) << "Unreachable: " << field_type; + UNREACHABLE(); } return true; } @@ -195,7 +197,7 @@ static JValue GetFieldValue(const ShadowFrame& shadow_frame, uint32_t vreg) break; default: LOG(FATAL) << "Unreachable: " << field_type; - break; + UNREACHABLE(); } return field_value; } @@ -285,6 +287,7 @@ bool DoFieldPut(Thread* self, const ShadowFrame& shadow_frame, const Instruction } default: LOG(FATAL) << "Unreachable: " << field_type; + UNREACHABLE(); } return true; } @@ -369,6 +372,7 @@ bool DoIPutQuick(const ShadowFrame& shadow_frame, const Instruction* inst, uint1 break; default: LOG(FATAL) << "Unreachable: " << field_type; + UNREACHABLE(); } return true; } diff --git a/runtime/mirror/art_method-inl.h b/runtime/mirror/art_method-inl.h index 664a412292..d262fd5211 100644 --- a/runtime/mirror/art_method-inl.h +++ b/runtime/mirror/art_method-inl.h @@ -171,7 +171,7 @@ inline bool ArtMethod::CheckIncompatibleClassChange(InvokeType type) { } default: LOG(FATAL) << "Unreachable - invocation type: " << type; - return true; + UNREACHABLE(); } } @@ -223,9 +223,7 @@ inline const uint8_t* ArtMethod::GetVmapTable() { } inline const uint8_t* ArtMethod::GetVmapTable(const void* code_pointer) { - if (IsOptimized()) { - LOG(FATAL) << "Unimplemented vmap table for optimized compiler"; - } + CHECK(!IsOptimized()) << "Unimplemented vmap table for optimized compiler"; DCHECK(code_pointer != nullptr); DCHECK(code_pointer == GetQuickOatCodePointer()); uint32_t offset = diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc index 9584d155ce..b2190048f9 100644 --- a/runtime/mirror/art_method.cc +++ b/runtime/mirror/art_method.cc @@ -43,7 +43,7 @@ namespace mirror { extern "C" void art_portable_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*, char); extern "C" void art_quick_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*, const char*); -#ifdef __LP64__ +#if defined(__LP64__) || defined(__arm__) extern "C" void art_quick_invoke_static_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*, const char*); #endif @@ -396,7 +396,7 @@ void ArtMethod::Invoke(Thread* self, uint32_t* args, uint32_t args_size, JValue* } if (!IsPortableCompiled()) { -#ifdef __LP64__ +#if defined(__LP64__) || defined(__arm__) if (!IsStatic()) { (*art_quick_invoke_stub)(this, args, args_size, self, result, shorty); } else { diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h index b89da9d4fe..c9e60bc0b2 100644 --- a/runtime/mirror/object-inl.h +++ b/runtime/mirror/object-inl.h @@ -121,7 +121,7 @@ inline Object* Object::GetReadBarrierPointer() { OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_)); #else LOG(FATAL) << "Unreachable"; - return nullptr; + UNREACHABLE(); #endif } @@ -134,6 +134,7 @@ inline void Object::SetReadBarrierPointer(Object* rb_ptr) { OFFSET_OF_OBJECT_MEMBER(Object, x_rb_ptr_), rb_ptr); #else LOG(FATAL) << "Unreachable"; + UNREACHABLE(); #endif } @@ -156,7 +157,7 @@ inline bool 
Object::AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* return true; #else LOG(FATAL) << "Unreachable"; - return false; + UNREACHABLE(); #endif } @@ -166,13 +167,12 @@ inline void Object::AssertReadBarrierPointer() const { DCHECK(obj->GetReadBarrierPointer() == nullptr) << "Bad Baker pointer: obj=" << reinterpret_cast<void*>(obj) << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer()); - } else if (kUseBrooksReadBarrier) { + } else { + CHECK(kUseBrooksReadBarrier); Object* obj = const_cast<Object*>(this); DCHECK_EQ(obj, obj->GetReadBarrierPointer()) << "Bad Brooks pointer: obj=" << reinterpret_cast<void*>(obj) << " ptr=" << reinterpret_cast<void*>(obj->GetReadBarrierPointer()); - } else { - LOG(FATAL) << "Unreachable"; } } diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 6123934d3c..5020ced396 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -877,7 +877,7 @@ uint32_t Monitor::GetLockOwnerThreadId(mirror::Object* obj) { } default: { LOG(FATAL) << "Unreachable"; - return ThreadList::kInvalidThreadId; + UNREACHABLE(); } } } @@ -1032,7 +1032,7 @@ bool Monitor::IsValidLockWord(LockWord lock_word) { return true; default: LOG(FATAL) << "Unreachable"; - return false; + UNREACHABLE(); } } diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc index ec7d82db8e..c35bb30b48 100644 --- a/runtime/native/dalvik_system_VMRuntime.cc +++ b/runtime/native/dalvik_system_VMRuntime.cc @@ -326,6 +326,7 @@ static void PreloadDexCachesResolveMethod(Handle<mirror::DexCache> dex_cache, ui break; default: LOG(FATAL) << "Unreachable - invocation type: " << invoke_type; + UNREACHABLE(); } if (method == NULL) { return; diff --git a/runtime/native/java_lang_System.cc b/runtime/native/java_lang_System.cc index ee99e78067..43681a70fd 100644 --- a/runtime/native/java_lang_System.cc +++ b/runtime/native/java_lang_System.cc @@ -93,7 +93,7 @@ static void System_arraycopy(JNIEnv* env, jclass, jobject javaSrc, jint srcPos, switch (dstComponentPrimitiveType) { case Primitive::kPrimVoid: LOG(FATAL) << "Unreachable, cannot have arrays of type void"; - return; + UNREACHABLE(); case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(Primitive::ComponentSize(dstComponentPrimitiveType), 1U); @@ -122,7 +122,7 @@ static void System_arraycopy(JNIEnv* env, jclass, jobject javaSrc, jint srcPos, } default: LOG(FATAL) << "Unknown array type: " << PrettyTypeOf(srcArray); - return; + UNREACHABLE(); } } // If one of the arrays holds a primitive type the other array must hold the exact same type. diff --git a/runtime/quick_exception_handler.h b/runtime/quick_exception_handler.h index b93769cb97..cf1ecbf29a 100644 --- a/runtime/quick_exception_handler.h +++ b/runtime/quick_exception_handler.h @@ -40,6 +40,7 @@ class QuickExceptionHandler { ~QuickExceptionHandler() { LOG(FATAL) << "UNREACHABLE"; // Expected to take long jump. + UNREACHABLE(); } void FindCatch(const ThrowLocation& throw_location, mirror::Throwable* exception, diff --git a/runtime/stack.cc b/runtime/stack.cc index 0cdc984e87..0adf0313ff 100644 --- a/runtime/stack.cc +++ b/runtime/stack.cc @@ -127,7 +127,8 @@ mirror::Object* StackVisitor::GetThisObject() const { } } else if (m->IsOptimized()) { // TODO: Implement, currently only used for exceptions when jdwp is enabled. 
- LOG(WARNING) << "StackVisitor::GetThisObject is unimplemented with the optimizing compiler"; + UNIMPLEMENTED(WARNING) + << "StackVisitor::GetThisObject is unimplemented with the optimizing compiler"; return nullptr; } else { const DexFile::CodeItem* code_item = m->GetCodeItem(); diff --git a/runtime/stack_map.h b/runtime/stack_map.h index b1c46a9771..a58ecab17d 100644 --- a/runtime/stack_map.h +++ b/runtime/stack_map.h @@ -286,7 +286,7 @@ class CodeInfo { } } LOG(FATAL) << "Unreachable"; - return StackMap(MemoryRegion()); + UNREACHABLE(); } StackMap GetStackMapForNativePcOffset(uint32_t native_pc_offset) { @@ -298,7 +298,7 @@ class CodeInfo { } } LOG(FATAL) << "Unreachable"; - return StackMap(MemoryRegion()); + UNREACHABLE(); } private: diff --git a/runtime/thread.cc b/runtime/thread.cc index 9c04133e3a..da82c766f0 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -1971,6 +1971,7 @@ void Thread::QuickDeliverException() { exception_handler.UpdateInstrumentationStack(); exception_handler.DoLongJump(); LOG(FATAL) << "UNREACHABLE"; + UNREACHABLE(); } Context* Thread::GetLongJumpContext() { diff --git a/runtime/verifier/register_line.h b/runtime/verifier/register_line.h index c7fd369472..8f7823ac09 100644 --- a/runtime/verifier/register_line.h +++ b/runtime/verifier/register_line.h @@ -118,9 +118,7 @@ class RegisterLine { void FillWithGarbage() { memset(&line_, 0xf1, num_regs_ * sizeof(uint16_t)); - while (!monitors_.empty()) { - monitors_.pop_back(); - } + monitors_.clear(); reg_to_lock_depths_.clear(); } diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt index 468e7a6ee1..4002fbf7a5 100644 --- a/test/800-smali/expected.txt +++ b/test/800-smali/expected.txt @@ -1,2 +1,3 @@ b/17790197 +FloatBadArgReg Done! diff --git a/test/800-smali/smali/FloatBadArgReg.smali b/test/800-smali/smali/FloatBadArgReg.smali new file mode 100644 index 0000000000..719ba093d5 --- /dev/null +++ b/test/800-smali/smali/FloatBadArgReg.smali @@ -0,0 +1,16 @@ +.class public LFloatBadArgReg; + +.super Ljava/lang/Object; + +.method public static getInt(I)I + .registers 2 + const/4 v0, 0x0 + if-ne v0, v0, :after + float-to-int v0, v0 + :exit + add-int/2addr v0, v1 + return v0 + :after + move v1, v0 + goto :exit +.end method diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java index 0ef3a9d195..c86470ce67 100644 --- a/test/800-smali/src/Main.java +++ b/test/800-smali/src/Main.java @@ -49,6 +49,8 @@ public class Main { testCases = new LinkedList<TestCase>(); testCases.add(new TestCase("b/17790197", "B17790197", "getInt", null, null, 100)); + testCases.add(new TestCase("FloatBadArgReg", "FloatBadArgReg", "getInt", + new Object[]{100}, null, 100)); } public void runTests() { diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk index 5b95beb96e..2de4d5ca73 100644 --- a/test/Android.run-test.mk +++ b/test/Android.run-test.mk @@ -67,7 +67,10 @@ TEST_ART_RUN_TEST_BUILD_RULES := # General rules to build and run a run-test. 
TARGET_TYPES := host target -PREBUILD_TYPES := prebuild +PREBUILD_TYPES := +ifeq ($(ART_TEST_RUN_TEST_PREBUILD),true) + PREBUILD_TYPES += prebuild +endif ifeq ($(ART_TEST_RUN_TEST_NO_PREBUILD),true) PREBUILD_TYPES += no-prebuild endif @@ -117,8 +120,12 @@ endif ifeq ($(ART_TEST_RUN_TEST_NDEBUG),true) RUN_TYPES += ndebug endif -ADDRESS_SIZES_TARGET := $(ART_PHONY_TEST_TARGET_SUFFIX) $(2ND_ART_PHONY_TEST_TARGET_SUFFIX) -ADDRESS_SIZES_HOST := $(ART_PHONY_TEST_HOST_SUFFIX) $(2ND_ART_PHONY_TEST_HOST_SUFFIX) +ADDRESS_SIZES_TARGET := $(ART_PHONY_TEST_TARGET_SUFFIX) +ADDRESS_SIZES_HOST := $(ART_PHONY_TEST_HOST_SUFFIX) +ifeq ($(ART_TEST_RUN_TEST_2ND_ARCH),true) + ADDRESS_SIZES_TARGET += $(2ND_ART_PHONY_TEST_TARGET_SUFFIX) + ADDRESS_SIZES_HOST += $(2ND_ART_PHONY_TEST_HOST_SUFFIX) +endif ALL_ADDRESS_SIZES := 64 32 # List all run test names with number arguments agreeing with the comment above.