diff options
54 files changed, 1563 insertions, 673 deletions
diff --git a/Android.mk b/Android.mk index 86d29b2055..3e5e44a6ce 100644 --- a/Android.mk +++ b/Android.mk @@ -40,8 +40,6 @@ clean-oat: clean-oat-host clean-oat-target .PHONY: clean-oat-host clean-oat-host: - rm -rf $(ART_NATIVETEST_OUT) - rm -rf $(ART_TEST_OUT) rm -f $(HOST_CORE_IMG_OUT) rm -f $(HOST_CORE_OAT_OUT) rm -f $(HOST_OUT_JAVA_LIBRARIES)/$(ART_HOST_ARCH)/*.odex @@ -58,7 +56,10 @@ ifdef TARGET_2ND_ARCH endif rm -rf $(DEXPREOPT_PRODUCT_DIR_FULL_PATH) rm -f $(TARGET_OUT_UNSTRIPPED)/system/framework/*.odex - rm -f $(TARGET_OUT_UNSTRIPPED)/system/framework/*.oat + rm -f $(TARGET_OUT_UNSTRIPPED)/system/framework/*/*.oat + rm -f $(TARGET_OUT_UNSTRIPPED)/system/framework/*/*.art + rm -f $(TARGET_OUT)/framework/*/*.oat + rm -f $(TARGET_OUT)/framework/*/*.art rm -f $(TARGET_OUT_APPS)/*.odex rm -f $(TARGET_OUT_INTERMEDIATES)/JAVA_LIBRARIES/*_intermediates/javalib.odex rm -f $(TARGET_OUT_INTERMEDIATES)/APPS/*_intermediates/*.odex @@ -108,7 +109,7 @@ include $(art_path)/sigchainlib/Android.mk ART_HOST_DEPENDENCIES := \ $(ART_HOST_EXECUTABLES) \ $(HOST_OUT_JAVA_LIBRARIES)/core-libart-hostdex.jar \ - $(HOST_LIBRARY_PATH)/libjavacore$(ART_HOST_SHLIB_EXTENSION) + $(ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) ART_TARGET_DEPENDENCIES := \ $(ART_TARGET_EXECUTABLES) \ $(TARGET_OUT_JAVA_LIBRARIES)/core-libart.jar \ @@ -365,7 +366,7 @@ build-art-target: $(ART_TARGET_EXECUTABLES) $(ART_TARGET_GTEST_EXECUTABLES) $(TA ifeq ($(HOST_PREFER_32_BIT),true) art-host: $(HOST_OUT_EXECUTABLES)/art $(HOST_OUT)/bin/dalvikvm32 $(HOST_OUT)/lib/libart.so $(HOST_OUT)/bin/dex2oat $(HOST_CORE_IMG_OUT) $(HOST_OUT)/lib/libjavacore.so $(HOST_OUT)/bin/dalvikvm else -art-host: $(HOST_OUT_EXECUTABLES)/art $(HOST_OUT)/bin/dalvikvm64 $(HOST_OUT)/bin/dalvikvm32 $(HOST_OUT)/lib/libart.so $(HOST_OUT)/bin/dex2oat $(HOST_CORE_IMG_OUT) $(HOST_OUT)/lib/libjavacore.so $(HOST_OUT)/bin/dalvikvm +art-host: $(HOST_OUT_EXECUTABLES)/art $(HOST_OUT)/bin/dalvikvm64 $(HOST_OUT)/bin/dalvikvm32 $(HOST_OUT)/lib/libart.so $(HOST_OUT)/bin/dex2oat $(HOST_CORE_IMG_OUT) $(HOST_OUT)/lib/libjavacore.so $(HOST_OUT)/lib64/libjavacore.so $(HOST_OUT)/bin/dalvikvm endif .PHONY: art-host-debug diff --git a/build/Android.common.mk b/build/Android.common.mk index 150b404d5b..39a734d2ed 100644 --- a/build/Android.common.mk +++ b/build/Android.common.mk @@ -59,7 +59,6 @@ ifeq ($(HOST_PREFER_32_BIT),true) 2ND_ART_HOST_ARCH := 2ND_HOST_ARCH := ART_HOST_LIBRARY_PATH := $(HOST_LIBRARY_PATH) - 2ND_ART_HOST_LIBRARY_PATH := ART_HOST_OUT_SHARED_LIBRARIES := $(2ND_HOST_OUT_SHARED_LIBRARIES) 2ND_ART_HOST_OUT_SHARED_LIBRARIES := else @@ -71,7 +70,6 @@ else 2ND_ART_HOST_ARCH := x86 2ND_HOST_ARCH := x86 ART_HOST_LIBRARY_PATH := $(HOST_LIBRARY_PATH) - 2ND_ART_HOST_LIBRARY_PATH := $(HOST_LIBRARY_PATH)32 ART_HOST_OUT_SHARED_LIBRARIES := $(HOST_OUT_SHARED_LIBRARIES) 2ND_ART_HOST_OUT_SHARED_LIBRARIES := $(2ND_HOST_OUT_SHARED_LIBRARIES) endif diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk index 7a0b46850d..e4676561c7 100644 --- a/build/Android.gtest.mk +++ b/build/Android.gtest.mk @@ -228,7 +228,7 @@ define define-art-gtest-rule-host gtest_exe := $$(HOST_OUT_EXECUTABLES)/$(1)$$($(2)ART_PHONY_TEST_HOST_SUFFIX) # Dependencies for all host gtests. 
gtest_deps := $$(HOST_CORE_DEX_LOCATIONS) \ - $$($(2)ART_HOST_LIBRARY_PATH)/libjavacore$$(ART_HOST_SHLIB_EXTENSION) + $$($(2)ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$$(ART_HOST_SHLIB_EXTENSION) .PHONY: $$(gtest_rule) diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h index f4e28e6c0e..398c7f641f 100644 --- a/compiler/dex/mir_graph.h +++ b/compiler/dex/mir_graph.h @@ -27,6 +27,7 @@ #include "mir_method_info.h" #include "utils/arena_bit_vector.h" #include "utils/growable_array.h" +#include "reg_location.h" #include "reg_storage.h" namespace art { @@ -492,39 +493,6 @@ class ChildBlockIterator { }; /* - * Whereas a SSA name describes a definition of a Dalvik vreg, the RegLocation describes - * the type of an SSA name (and, can also be used by code generators to record where the - * value is located (i.e. - physical register, frame, spill, etc.). For each SSA name (SReg) - * there is a RegLocation. - * A note on SSA names: - * o SSA names for Dalvik vRegs v0..vN will be assigned 0..N. These represent the "vN_0" - * names. Negative SSA names represent special values not present in the Dalvik byte code. - * For example, SSA name -1 represents an invalid SSA name, and SSA name -2 represents the - * the Method pointer. SSA names < -2 are reserved for future use. - * o The vN_0 names for non-argument Dalvik should in practice never be used (as they would - * represent the read of an undefined local variable). The first definition of the - * underlying Dalvik vReg will result in a vN_1 name. - * - * FIXME: The orig_sreg field was added as a workaround for llvm bitcode generation. With - * the latest restructuring, we should be able to remove it and rely on s_reg_low throughout. - */ -struct RegLocation { - RegLocationType location:3; - unsigned wide:1; - unsigned defined:1; // Do we know the type? - unsigned is_const:1; // Constant, value in mir_graph->constant_values[]. - unsigned fp:1; // Floating point? - unsigned core:1; // Non-floating point? - unsigned ref:1; // Something GC cares about. - unsigned high_word:1; // High word of pair? - unsigned home:1; // Does this represent the home location? - RegStorage reg; // Encoded physical registers. - int16_t s_reg_low; // SSA name for low Dalvik word. - int16_t orig_sreg; // TODO: remove after Bitcode gen complete - // and consolidate usage w/ s_reg_low. -}; - -/* * Collection of information describing an invoke, and the destination of * the subsequent MOVE_RESULT (if applicable). Collected as a unit to enable * more efficient invoke code generation. diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h index b0865f1c3f..ea7f439cfb 100644 --- a/compiler/dex/quick/arm64/arm64_lir.h +++ b/compiler/dex/quick/arm64/arm64_lir.h @@ -179,6 +179,8 @@ constexpr RegStorage rs_wLR(RegStorage::kValid | rwLR); // RegisterLocation templates return values (following the hard-float calling convention). 
const RegLocation arm_loc_c_return = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, rs_w0, INVALID_SREG, INVALID_SREG}; +const RegLocation arm_loc_c_return_ref = + {kLocPhysReg, 0, 0, 0, 0, 0, 1, 0, 1, rs_x0, INVALID_SREG, INVALID_SREG}; const RegLocation arm_loc_c_return_wide = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, rs_x0, INVALID_SREG, INVALID_SREG}; const RegLocation arm_loc_c_return_float = diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc index cfdf926fba..3e0b3cf314 100644 --- a/compiler/dex/quick/arm64/call_arm64.cc +++ b/compiler/dex/quick/arm64/call_arm64.cc @@ -132,7 +132,7 @@ void Arm64Mir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset, // Load the displacement from the switch table RegStorage disp_reg = AllocTemp(); - LoadBaseIndexed(table_base, As64BitReg(key_reg), As64BitReg(disp_reg), 2, k32); + LoadBaseIndexed(table_base, As64BitReg(key_reg), disp_reg, 2, k32); // Get base branch address. RegStorage branch_reg = AllocTempWide(); @@ -195,7 +195,7 @@ void Arm64Mir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) { // TUNING: How much performance we get when we inline this? // Since we've already flush all register. FlushAllRegs(); - LoadValueDirectFixed(rl_src, rs_w0); + LoadValueDirectFixed(rl_src, rs_x0); // = TargetRefReg(kArg0) LockCallTemps(); // Prepare for explicit register usage LIR* null_check_branch = nullptr; if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) { @@ -243,7 +243,7 @@ void Arm64Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) { // TUNING: How much performance we get when we inline this? // Since we've already flush all register. FlushAllRegs(); - LoadValueDirectFixed(rl_src, rs_w0); // Get obj + LoadValueDirectFixed(rl_src, rs_x0); // Get obj LockCallTemps(); // Prepare for explicit register usage LIR* null_check_branch = nullptr; if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) { @@ -291,12 +291,12 @@ void Arm64Mir2Lir::GenMoveException(RegLocation rl_dest) { */ void Arm64Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) { RegStorage reg_card_base = AllocTempWide(); - RegStorage reg_card_no = AllocTemp(); + RegStorage reg_card_no = AllocTempWide(); // Needs to be wide as addr is ref=64b LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL); LoadWordDisp(rs_xSELF, Thread::CardTableOffset<8>().Int32Value(), reg_card_base); OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift); // TODO(Arm64): generate "strb wB, [xB, wC, uxtw]" rather than "strb wB, [xB, xC]"? - StoreBaseIndexed(reg_card_base, As64BitReg(reg_card_no), As32BitReg(reg_card_base), + StoreBaseIndexed(reg_card_base, reg_card_no, As32BitReg(reg_card_base), 0, kUnsignedByte); LIR* target = NewLIR0(kPseudoTargetLabel); branch_over->target = target; diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h index a9340a5ccf..f71713fc96 100644 --- a/compiler/dex/quick/arm64/codegen_arm64.h +++ b/compiler/dex/quick/arm64/codegen_arm64.h @@ -24,13 +24,8 @@ namespace art { -class Arm64Mir2Lir : public Mir2Lir { +class Arm64Mir2Lir FINAL : public Mir2Lir { protected: - // If we detect a size error, FATAL out. - static constexpr bool kFailOnSizeError = false && kIsDebugBuild; - // If we detect a size error, report to LOG. - static constexpr bool kReportSizeError = false && kIsDebugBuild; - // TODO: consolidate 64-bit target support. 
class InToRegStorageMapper { public: @@ -102,7 +97,19 @@ class Arm64Mir2Lir : public Mir2Lir { int offset, int check_value, LIR* target) OVERRIDE; // Required for target - register utilities. - RegStorage TargetReg(SpecialTargetRegister reg); + RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE; + RegStorage TargetReg(SpecialTargetRegister symbolic_reg, bool is_wide) OVERRIDE { + RegStorage reg = TargetReg(symbolic_reg); + if (is_wide) { + return (reg.Is64Bit()) ? reg : As64BitReg(reg); + } else { + return (reg.Is32Bit()) ? reg : As32BitReg(reg); + } + } + RegStorage TargetRefReg(SpecialTargetRegister symbolic_reg) OVERRIDE { + RegStorage reg = TargetReg(symbolic_reg); + return (reg.Is64Bit() ? reg : As64BitReg(reg)); + } RegStorage GetArgMappingToPhysicalReg(int arg_num); RegLocation GetReturnAlt(); RegLocation GetReturnWideAlt(); diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc index 56fb2dd018..18a4e8f2a5 100644 --- a/compiler/dex/quick/arm64/int_arm64.cc +++ b/compiler/dex/quick/arm64/int_arm64.cc @@ -181,6 +181,8 @@ LIR* Arm64Mir2Lir::OpRegCopyNoInsert(RegStorage r_dest, RegStorage r_src) { if (LIKELY(dest_is_fp == src_is_fp)) { if (LIKELY(!dest_is_fp)) { + DCHECK_EQ(r_dest.Is64Bit(), r_src.Is64Bit()); + // Core/core copy. // Copies involving the sp register require a different instruction. opcode = UNLIKELY(A64_REG_IS_SP(r_dest.GetReg())) ? kA64Add4RRdT : kA64Mov2rr; @@ -210,14 +212,14 @@ LIR* Arm64Mir2Lir::OpRegCopyNoInsert(RegStorage r_dest, RegStorage r_src) { if (r_dest.IsDouble()) { opcode = kA64Fmov2Sx; } else { - DCHECK(r_src.IsSingle()); + r_src = Check32BitReg(r_src); opcode = kA64Fmov2sw; } } else { if (r_src.IsDouble()) { opcode = kA64Fmov2xS; } else { - DCHECK(r_dest.Is32Bit()); + r_dest = Check32BitReg(r_dest); opcode = kA64Fmov2ws; } } @@ -655,7 +657,7 @@ void Arm64Mir2Lir::GenIntToLong(RegLocation rl_dest, RegLocation rl_src) { rl_src = LoadValue(rl_src, kCoreReg); rl_result = EvalLocWide(rl_dest, kCoreReg, true); - NewLIR4(WIDE(kA64Sbfm4rrdd), rl_result.reg.GetReg(), rl_src.reg.GetReg(), 0, 31); + NewLIR4(WIDE(kA64Sbfm4rrdd), rl_result.reg.GetReg(), As64BitReg(rl_src.reg).GetReg(), 0, 31); StoreValueWide(rl_dest, rl_result); } diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc index 6105837f79..dcb0050a80 100644 --- a/compiler/dex/quick/arm64/target_arm64.cc +++ b/compiler/dex/quick/arm64/target_arm64.cc @@ -88,7 +88,7 @@ RegLocation Arm64Mir2Lir::LocCReturn() { } RegLocation Arm64Mir2Lir::LocCReturnRef() { - return arm_loc_c_return; + return arm_loc_c_return_ref; } RegLocation Arm64Mir2Lir::LocCReturnWide() { @@ -1097,7 +1097,7 @@ int Arm64Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, // Instead of allocating a new temp, simply reuse one of the registers being used // for argument passing. - RegStorage temp = TargetReg(kArg3); + RegStorage temp = TargetReg(kArg3, false); // Now load the argument VR and store to the outs. Load32Disp(TargetReg(kSp), current_src_offset, temp); diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc index e2484101df..ca78e5be72 100644 --- a/compiler/dex/quick/arm64/utility_arm64.cc +++ b/compiler/dex/quick/arm64/utility_arm64.cc @@ -732,7 +732,7 @@ LIR* Arm64Mir2Lir::OpRegRegImm64(OpKind op, RegStorage r_dest, RegStorage r_src1 return NewLIR4(opcode | wide, r_dest.GetReg(), r_src1.GetReg(), abs_value >> 12, 1); } else { log_imm = -1; - alt_opcode = (neg) ? 
kA64Add4RRre : kA64Sub4RRre; + alt_opcode = (op == kOpAdd) ? kA64Add4RRre : kA64Sub4RRre; info = EncodeExtend(is_wide ? kA64Uxtx : kA64Uxtw, 0); } break; @@ -891,9 +891,8 @@ LIR* Arm64Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegSto LIR* load; int expected_scale = 0; ArmOpcode opcode = kA64Brk1d; - DCHECK(r_base.Is64Bit()); - // TODO: need a cleaner handling of index registers here and throughout. - r_index = Check32BitReg(r_index); + r_base = Check64BitReg(r_base); + r_index = Check64BitReg(r_index); if (r_dest.IsFloat()) { if (r_dest.IsDouble()) { @@ -928,17 +927,21 @@ LIR* Arm64Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegSto expected_scale = 2; break; case kUnsignedHalf: + r_dest = Check32BitReg(r_dest); opcode = kA64Ldrh4wXxd; expected_scale = 1; break; case kSignedHalf: + r_dest = Check32BitReg(r_dest); opcode = kA64Ldrsh4rXxd; expected_scale = 1; break; case kUnsignedByte: + r_dest = Check32BitReg(r_dest); opcode = kA64Ldrb3wXx; break; case kSignedByte: + r_dest = Check32BitReg(r_dest); opcode = kA64Ldrsb3rXx; break; default: @@ -968,9 +971,8 @@ LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegSt LIR* store; int expected_scale = 0; ArmOpcode opcode = kA64Brk1d; - DCHECK(r_base.Is64Bit()); - // TODO: need a cleaner handling of index registers here and throughout. - r_index = Check32BitReg(r_index); + r_base = Check64BitReg(r_base); + r_index = Check64BitReg(r_index); if (r_src.IsFloat()) { if (r_src.IsDouble()) { @@ -1006,11 +1008,13 @@ LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegSt break; case kUnsignedHalf: case kSignedHalf: + r_src = Check32BitReg(r_src); opcode = kA64Strh4wXxd; expected_scale = 1; break; case kUnsignedByte: case kSignedByte: + r_src = Check32BitReg(r_src); opcode = kA64Strb3wXx; break; default: diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc index f31b670164..e571b3a407 100644 --- a/compiler/dex/quick/codegen_util.cc +++ b/compiler/dex/quick/codegen_util.cc @@ -1184,7 +1184,7 @@ void Mir2Lir::LoadCodeAddress(const MethodReference& target_method, InvokeType t // resolve these invokes to the same method, so we don't care which one we record here. data_target->operands[2] = type; } - LIR* load_pc_rel = OpPcRelLoad(TargetReg(symbolic_reg), data_target); + LIR* load_pc_rel = OpPcRelLoad(TargetRefReg(symbolic_reg), data_target); AppendLIR(load_pc_rel); DCHECK_NE(cu_->instruction_set, kMips) << reinterpret_cast<void*>(data_target); } @@ -1200,7 +1200,7 @@ void Mir2Lir::LoadMethodAddress(const MethodReference& target_method, InvokeType // resolve these invokes to the same method, so we don't care which one we record here. 
data_target->operands[2] = type; } - LIR* load_pc_rel = OpPcRelLoad(TargetReg(symbolic_reg), data_target); + LIR* load_pc_rel = OpPcRelLoad(TargetRefReg(symbolic_reg), data_target); AppendLIR(load_pc_rel); DCHECK_NE(cu_->instruction_set, kMips) << reinterpret_cast<void*>(data_target); } diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc index 04a23cf133..2c59055243 100644 --- a/compiler/dex/quick/gen_common.cc +++ b/compiler/dex/quick/gen_common.cc @@ -127,14 +127,17 @@ void Mir2Lir::GenArrayBoundsCheck(int index, RegStorage length) { m2l_->ResetDefTracking(); GenerateTargetLabel(kPseudoThrowTarget); - m2l_->OpRegCopy(m2l_->TargetReg(kArg1), length_); - m2l_->LoadConstant(m2l_->TargetReg(kArg0), index_); + RegStorage arg1_32 = m2l_->TargetReg(kArg1, false); + RegStorage arg0_32 = m2l_->TargetReg(kArg0, false); + + m2l_->OpRegCopy(arg1_32, length_); + m2l_->LoadConstant(arg0_32, index_); if (m2l_->cu_->target64) { m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(8, pThrowArrayBounds), - m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true); + arg0_32, arg1_32, true); } else { m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(4, pThrowArrayBounds), - m2l_->TargetReg(kArg0), m2l_->TargetReg(kArg1), true); + arg0_32, arg1_32, true); } } @@ -473,7 +476,7 @@ void Mir2Lir::GenFilledNewArray(CallInfo* info) { switch (cu_->instruction_set) { case kThumb2: case kArm64: - r_val = TargetReg(kLr); + r_val = TargetReg(kLr, false); break; case kX86: case kX86_64: @@ -597,10 +600,10 @@ void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, bool is_long_or_double, // May do runtime call so everything to home locations. FlushAllRegs(); // Using fixed register to sync with possible call to runtime support. - RegStorage r_method = TargetReg(kArg1); + RegStorage r_method = TargetRefReg(kArg1); LockTemp(r_method); LoadCurrMethodDirect(r_method); - r_base = TargetReg(kArg0); + r_base = TargetRefReg(kArg0); LockTemp(r_base); LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base, kNotVolatile); @@ -901,12 +904,12 @@ void Mir2Lir::GenArrayObjPut(int opt_flags, RegLocation rl_array, RegLocation rl void Mir2Lir::GenConstClass(uint32_t type_idx, RegLocation rl_dest) { RegLocation rl_method = LoadCurrMethod(); - DCHECK(!cu_->target64 || rl_method.reg.Is64Bit()); + CheckRegLocation(rl_method); RegStorage res_reg = AllocTempRef(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); if (!cu_->compiler_driver->CanAccessTypeWithoutChecks(cu_->method_idx, - *cu_->dex_file, - type_idx)) { + *cu_->dex_file, + type_idx)) { // Call out to helper which resolves type and verifies access. // Resolved type returned in kRet0. 
if (cu_->target64) { @@ -991,15 +994,15 @@ void Mir2Lir::GenConstString(uint32_t string_idx, RegLocation rl_dest) { DCHECK(!IsTemp(rl_method.reg)); r_method = rl_method.reg; } else { - r_method = TargetReg(kArg2); + r_method = TargetRefReg(kArg2); LoadCurrMethodDirect(r_method); } LoadRefDisp(r_method, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(), - TargetReg(kArg0), kNotVolatile); + TargetRefReg(kArg0), kNotVolatile); // Might call out to helper, which will return resolved string in kRet0 - LoadRefDisp(TargetReg(kArg0), offset_of_string, TargetReg(kRet0), kNotVolatile); - LIR* fromfast = OpCmpImmBranch(kCondEq, TargetReg(kRet0), 0, NULL); + LoadRefDisp(TargetRefReg(kArg0), offset_of_string, TargetRefReg(kRet0), kNotVolatile); + LIR* fromfast = OpCmpImmBranch(kCondEq, TargetRefReg(kRet0), 0, NULL); LIR* cont = NewLIR0(kPseudoTargetLabel); { @@ -1189,8 +1192,9 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know FlushAllRegs(); // May generate a call - use explicit registers LockCallTemps(); - LoadCurrMethodDirect(TargetReg(kArg1)); // kArg1 <= current Method* - RegStorage class_reg = TargetReg(kArg2); // kArg2 will hold the Class* + RegStorage method_reg = TargetRefReg(kArg1); + LoadCurrMethodDirect(method_reg); // kArg1 <= current Method* + RegStorage class_reg = TargetRefReg(kArg2); // kArg2 will hold the Class* if (needs_access_check) { // Check we have access to type_idx and if not throw IllegalAccessError, // returns Class* in kArg0 @@ -1205,12 +1209,12 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref } else if (use_declaring_class) { LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref - LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(), + LoadRefDisp(method_reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), class_reg, kNotVolatile); } else { // Load dex cache entry into class_reg (kArg2) LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref - LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), + LoadRefDisp(method_reg, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), class_reg, kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile); @@ -1224,7 +1228,7 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know } else { CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx, true); } - OpRegCopy(TargetReg(kArg2), TargetReg(kRet0)); // Align usage with fast path + OpRegCopy(TargetRefReg(kArg2), TargetRefReg(kRet0)); // Align usage with fast path LoadValueDirectFixed(rl_src, TargetReg(kArg0)); /* reload Ref */ // Rejoin code paths LIR* hop_target = NewLIR0(kPseudoTargetLabel); @@ -1232,7 +1236,7 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know } } /* kArg0 is ref, kArg2 is class. If ref==null, use directly as bool result */ - RegLocation rl_result = GetReturn(kRefReg); + RegLocation rl_result = GetReturn(kCoreReg); if (cu_->instruction_set == kMips) { // On MIPS rArg0 != rl_result, place false in result if branch is taken. 
LoadConstant(rl_result.reg, 0); @@ -1241,7 +1245,7 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know /* load object->klass_ */ DCHECK_EQ(mirror::Object::ClassOffset().Int32Value(), 0); - LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1), + LoadRefDisp(TargetRefReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetRefReg(kArg1), kNotVolatile); /* kArg0 is ref, kArg1 is ref->klass_, kArg2 is class */ LIR* branchover = NULL; @@ -1339,26 +1343,27 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ FlushAllRegs(); // May generate a call - use explicit registers LockCallTemps(); - LoadCurrMethodDirect(TargetReg(kArg1)); // kArg1 <= current Method* - RegStorage class_reg = TargetReg(kArg2); // kArg2 will hold the Class* + RegStorage method_reg = TargetRefReg(kArg1); + LoadCurrMethodDirect(method_reg); // kArg1 <= current Method* + RegStorage class_reg = TargetRefReg(kArg2); // kArg2 will hold the Class* if (needs_access_check) { // Check we have access to type_idx and if not throw IllegalAccessError, // returns Class* in kRet0 // InitializeTypeAndVerifyAccess(idx, method) if (cu_->target64) { - CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeTypeAndVerifyAccess), - type_idx, TargetReg(kArg1), true); + CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(8, pInitializeTypeAndVerifyAccess), + type_idx, true); } else { - CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess), - type_idx, TargetReg(kArg1), true); + CallRuntimeHelperImm(QUICK_ENTRYPOINT_OFFSET(4, pInitializeTypeAndVerifyAccess), + type_idx, true); } - OpRegCopy(class_reg, TargetReg(kRet0)); // Align usage with fast path + OpRegCopy(class_reg, TargetRefReg(kRet0)); // Align usage with fast path } else if (use_declaring_class) { - LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(), + LoadRefDisp(method_reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), class_reg, kNotVolatile); } else { // Load dex cache entry into class_reg (kArg2) - LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), + LoadRefDisp(method_reg, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), class_reg, kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile); @@ -1383,12 +1388,12 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ // InitializeTypeFromCode(idx, method) if (m2l_->cu_->target64) { m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(8, pInitializeType), type_idx_, - m2l_->TargetReg(kArg1), true); + m2l_->TargetRefReg(kArg1), true); } else { m2l_->CallRuntimeHelperImmReg(QUICK_ENTRYPOINT_OFFSET(4, pInitializeType), type_idx_, - m2l_->TargetReg(kArg1), true); + m2l_->TargetRefReg(kArg1), true); } - m2l_->OpRegCopy(class_reg_, m2l_->TargetReg(kRet0)); // Align usage with fast path + m2l_->OpRegCopy(class_reg_, m2l_->TargetRefReg(kRet0)); // Align usage with fast path m2l_->OpUnconditionalBranch(cont_); } @@ -1401,7 +1406,7 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ } } // At this point, class_reg (kArg2) has class - LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref + LoadValueDirectFixed(rl_src, TargetRefReg(kArg0)); // kArg0 <= ref // Slow path for the case where the classes are not equal. 
In this case we need // to call a helper function to do the check. @@ -1435,7 +1440,7 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ if (type_known_abstract) { // Easier case, run slow path if target is non-null (slow path will load from target) - LIR* branch = OpCmpImmBranch(kCondNe, TargetReg(kArg0), 0, NULL); + LIR* branch = OpCmpImmBranch(kCondNe, TargetReg(kArg0), 0, nullptr); LIR* cont = NewLIR0(kPseudoTargetLabel); AddSlowPath(new (arena_) SlowPath(this, branch, cont, true)); } else { @@ -1444,13 +1449,13 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ // slow path if the classes are not equal. /* Null is OK - continue */ - LIR* branch1 = OpCmpImmBranch(kCondEq, TargetReg(kArg0), 0, NULL); + LIR* branch1 = OpCmpImmBranch(kCondEq, TargetReg(kArg0), 0, nullptr); /* load object->klass_ */ DCHECK_EQ(mirror::Object::ClassOffset().Int32Value(), 0); - LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1), - kNotVolatile); + LoadRefDisp(TargetRefReg(kArg0), mirror::Object::ClassOffset().Int32Value(), + TargetRefReg(kArg1), kNotVolatile); - LIR* branch2 = OpCmpBranch(kCondNe, TargetReg(kArg1), class_reg, NULL); + LIR* branch2 = OpCmpBranch(kCondNe, TargetRefReg(kArg1), class_reg, nullptr); LIR* cont = NewLIR0(kPseudoTargetLabel); // Add the slow path that will not perform load since this is already done. diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc index 569c97f3ae..bf51d28be3 100644 --- a/compiler/dex/quick/gen_invoke.cc +++ b/compiler/dex/quick/gen_invoke.cc @@ -185,11 +185,11 @@ void Mir2Lir::CallRuntimeHelperImmRegLocation(ThreadOffset<pointer_size> helper_ RegLocation arg1, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); if (arg1.wide == 0) { - LoadValueDirectFixed(arg1, TargetReg(kArg1)); + LoadValueDirectFixed(arg1, TargetReg(kArg1, arg1)); } else { RegStorage r_tmp; if (cu_->target64) { - r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg()); + r_tmp = TargetReg(kArg1, true); } else { if (cu_->instruction_set == kMips) { // skip kArg1 for stack alignment. 
@@ -211,7 +211,8 @@ template <size_t pointer_size> void Mir2Lir::CallRuntimeHelperRegLocationImm(ThreadOffset<pointer_size> helper_offset, RegLocation arg0, int arg1, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); - LoadValueDirectFixed(arg0, TargetReg(kArg0)); + DCHECK(!arg0.wide); + LoadValueDirectFixed(arg0, TargetReg(kArg0, arg0)); LoadConstant(TargetReg(kArg1), arg1); ClobberCallerSave(); CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc); @@ -223,7 +224,7 @@ template <size_t pointer_size> void Mir2Lir::CallRuntimeHelperImmReg(ThreadOffset<pointer_size> helper_offset, int arg0, RegStorage arg1, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); - OpRegCopy(TargetReg(kArg1), arg1); + OpRegCopy(TargetReg(kArg1, arg1.Is64Bit()), arg1); LoadConstant(TargetReg(kArg0), arg0); ClobberCallerSave(); CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc); @@ -276,7 +277,7 @@ void Mir2Lir::CallRuntimeHelperRegMethodRegLocation(ThreadOffset<pointer_size> h OpRegCopy(TargetReg(kArg0), arg0); } LoadCurrMethodDirect(TargetReg(kArg1)); - LoadValueDirectFixed(arg2, TargetReg(kArg2)); + LoadValueDirectFixed(arg2, TargetReg(kArg2, arg2)); ClobberCallerSave(); CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc); } @@ -288,80 +289,103 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocation(ThreadOffset<pointer_size> RegLocation arg0, RegLocation arg1, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); - if (arg0.wide == 0) { - LoadValueDirectFixed(arg0, arg0.fp ? TargetReg(kFArg0) : TargetReg(kArg0)); + if (cu_->instruction_set == kArm64) { + RegStorage arg0_reg = TargetReg((arg0.fp) ? kFArg0 : kArg0, arg0); + + RegStorage arg1_reg; + if (arg1.fp == arg0.fp) { + arg1_reg = TargetReg((arg1.fp) ? kFArg1 : kArg1, arg1); + } else { + arg1_reg = TargetReg((arg1.fp) ? kFArg0 : kArg0, arg1); + } + + if (arg0.wide == 0) { + LoadValueDirectFixed(arg0, arg0_reg); + } else { + LoadValueDirectWideFixed(arg0, arg0_reg); + } + if (arg1.wide == 0) { - if (cu_->instruction_set == kMips) { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg2) : TargetReg(kArg1)); - } else if (cu_->instruction_set == kArm64) { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg1)); - } else if (cu_->instruction_set == kX86_64) { - if (arg0.fp) { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg0)); + LoadValueDirectFixed(arg1, arg1_reg); + } else { + LoadValueDirectWideFixed(arg1, arg1_reg); + } + } else { + if (arg0.wide == 0) { + LoadValueDirectFixed(arg0, arg0.fp ? TargetReg(kFArg0) : TargetReg(kArg0)); + if (arg1.wide == 0) { + if (cu_->instruction_set == kMips) { + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg2) : TargetReg(kArg1)); + } else if (cu_->instruction_set == kArm64) { + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg1)); + } else if (cu_->instruction_set == kX86_64) { + if (arg0.fp) { + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg0)); + } else { + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg0) : TargetReg(kArg1)); + } } else { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg0) : TargetReg(kArg1)); + LoadValueDirectFixed(arg1, TargetReg(kArg1)); } } else { - LoadValueDirectFixed(arg1, TargetReg(kArg1)); + if (cu_->instruction_set == kMips) { + RegStorage r_tmp; + if (arg1.fp) { + r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg2), TargetReg(kFArg3)); + } else { + // skip kArg1 for stack alignment. 
+ r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)); + } + LoadValueDirectWideFixed(arg1, r_tmp); + } else { + RegStorage r_tmp; + if (cu_->target64) { + r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg()); + } else { + r_tmp = RegStorage::MakeRegPair(TargetReg(kArg1), TargetReg(kArg2)); + } + LoadValueDirectWideFixed(arg1, r_tmp); + } } } else { - if (cu_->instruction_set == kMips) { - RegStorage r_tmp; - if (arg1.fp) { - r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg2), TargetReg(kFArg3)); + RegStorage r_tmp; + if (arg0.fp) { + if (cu_->target64) { + r_tmp = RegStorage::FloatSolo64(TargetReg(kFArg0).GetReg()); } else { - // skip kArg1 for stack alignment. - r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)); + r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg0), TargetReg(kFArg1)); } - LoadValueDirectWideFixed(arg1, r_tmp); } else { - RegStorage r_tmp; if (cu_->target64) { - r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg()); + r_tmp = RegStorage::Solo64(TargetReg(kArg0).GetReg()); } else { - r_tmp = RegStorage::MakeRegPair(TargetReg(kArg1), TargetReg(kArg2)); + r_tmp = RegStorage::MakeRegPair(TargetReg(kArg0), TargetReg(kArg1)); } - LoadValueDirectWideFixed(arg1, r_tmp); - } - } - } else { - RegStorage r_tmp; - if (arg0.fp) { - if (cu_->target64) { - r_tmp = RegStorage::FloatSolo64(TargetReg(kFArg0).GetReg()); - } else { - r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg0), TargetReg(kFArg1)); - } - } else { - if (cu_->target64) { - r_tmp = RegStorage::Solo64(TargetReg(kArg0).GetReg()); - } else { - r_tmp = RegStorage::MakeRegPair(TargetReg(kArg0), TargetReg(kArg1)); - } - } - LoadValueDirectWideFixed(arg0, r_tmp); - if (arg1.wide == 0) { - if (cu_->target64) { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg1)); - } else { - LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg2) : TargetReg(kArg2)); } - } else { - RegStorage r_tmp; - if (arg1.fp) { + LoadValueDirectWideFixed(arg0, r_tmp); + if (arg1.wide == 0) { if (cu_->target64) { - r_tmp = RegStorage::FloatSolo64(TargetReg(kFArg1).GetReg()); + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg1)); } else { - r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg2), TargetReg(kFArg3)); + LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg2) : TargetReg(kArg2)); } } else { - if (cu_->target64) { - r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg()); + RegStorage r_tmp; + if (arg1.fp) { + if (cu_->target64) { + r_tmp = RegStorage::FloatSolo64(TargetReg(kFArg1).GetReg()); + } else { + r_tmp = RegStorage::MakeRegPair(TargetReg(kFArg2), TargetReg(kFArg3)); + } } else { - r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)); + if (cu_->target64) { + r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg()); + } else { + r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)); + } } + LoadValueDirectWideFixed(arg1, r_tmp); } - LoadValueDirectWideFixed(arg1, r_tmp); } } ClobberCallerSave(); @@ -381,16 +405,16 @@ void Mir2Lir::CopyToArgumentRegs(RegStorage arg0, RegStorage arg1) { if (IsSameReg(arg1, TargetReg(kArg0))) { if (IsSameReg(arg0, TargetReg(kArg1))) { // Swap kArg0 and kArg1 with kArg2 as temp. 
- OpRegCopy(TargetArgReg(kArg2, arg1.Is64Bit()), arg1); - OpRegCopy(TargetArgReg(kArg0, arg0.Is64Bit()), arg0); - OpRegCopy(TargetArgReg(kArg1, arg1.Is64Bit()), TargetReg(kArg2)); + OpRegCopy(TargetReg(kArg2, arg1.Is64Bit()), arg1); + OpRegCopy(TargetReg(kArg0, arg0.Is64Bit()), arg0); + OpRegCopy(TargetReg(kArg1, arg1.Is64Bit()), TargetReg(kArg2, arg1.Is64Bit())); } else { - OpRegCopy(TargetArgReg(kArg1, arg1.Is64Bit()), arg1); - OpRegCopy(TargetArgReg(kArg0, arg0.Is64Bit()), arg0); + OpRegCopy(TargetReg(kArg1, arg1.Is64Bit()), arg1); + OpRegCopy(TargetReg(kArg0, arg0.Is64Bit()), arg0); } } else { - OpRegCopy(TargetArgReg(kArg0, arg0.Is64Bit()), arg0); - OpRegCopy(TargetArgReg(kArg1, arg1.Is64Bit()), arg1); + OpRegCopy(TargetReg(kArg0, arg0.Is64Bit()), arg0); + OpRegCopy(TargetReg(kArg1, arg1.Is64Bit()), arg1); } } @@ -421,9 +445,9 @@ template <size_t pointer_size> void Mir2Lir::CallRuntimeHelperImmMethodRegLocation(ThreadOffset<pointer_size> helper_offset, int arg0, RegLocation arg2, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); - LoadValueDirectFixed(arg2, TargetReg(kArg2)); + LoadValueDirectFixed(arg2, TargetReg(kArg2, arg2)); LoadCurrMethodDirect(TargetReg(kArg1)); - LoadConstant(TargetReg(kArg0), arg0); + LoadConstant(TargetReg(kArg0, arg0), arg0); ClobberCallerSave(); CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc); } @@ -449,13 +473,13 @@ void Mir2Lir::CallRuntimeHelperImmRegLocationRegLocation(ThreadOffset<pointer_si RegStorage r_tgt = CallHelperSetup(helper_offset); DCHECK_EQ(static_cast<unsigned int>(arg1.wide), 0U); // The static_cast works around an // instantiation bug in GCC. - LoadValueDirectFixed(arg1, TargetReg(kArg1)); + LoadValueDirectFixed(arg1, TargetReg(kArg1, arg1)); if (arg2.wide == 0) { - LoadValueDirectFixed(arg2, TargetReg(kArg2)); + LoadValueDirectFixed(arg2, TargetReg(kArg2, arg2)); } else { RegStorage r_tmp; if (cu_->target64) { - r_tmp = RegStorage::Solo64(TargetReg(kArg2).GetReg()); + r_tmp = TargetReg(kArg2, true); } else { r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3)); } @@ -474,12 +498,9 @@ void Mir2Lir::CallRuntimeHelperRegLocationRegLocationRegLocation(ThreadOffset<po RegLocation arg2, bool safepoint_pc) { RegStorage r_tgt = CallHelperSetup(helper_offset); - DCHECK_EQ(static_cast<unsigned int>(arg0.wide), 0U); - LoadValueDirectFixed(arg0, TargetReg(kArg0)); - DCHECK_EQ(static_cast<unsigned int>(arg1.wide), 0U); - LoadValueDirectFixed(arg1, TargetReg(kArg1)); - DCHECK_EQ(static_cast<unsigned int>(arg1.wide), 0U); - LoadValueDirectFixed(arg2, TargetReg(kArg2)); + LoadValueDirectFixed(arg0, TargetReg(kArg0, arg0)); + LoadValueDirectFixed(arg1, TargetReg(kArg1, arg1)); + LoadValueDirectFixed(arg2, TargetReg(kArg2, arg2)); ClobberCallerSave(); CallHelper<pointer_size>(r_tgt, helper_offset, safepoint_pc); } @@ -502,13 +523,13 @@ void Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { */ RegLocation rl_src = rl_method; rl_src.location = kLocPhysReg; - rl_src.reg = TargetReg(kArg0); + rl_src.reg = TargetRefReg(kArg0); rl_src.home = false; MarkLive(rl_src); StoreValue(rl_method, rl_src); // If Method* has been promoted, explicitly flush if (rl_method.location == kLocPhysReg) { - StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0), kNotVolatile); + StoreRefDisp(TargetReg(kSp), 0, rl_src.reg, kNotVolatile); } if (cu_->num_ins == 0) { @@ -615,15 +636,16 @@ static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info, return -1; } } else { + RegStorage arg0_ref = cg->TargetRefReg(kArg0); switch 
(state) { case 0: // Get the current Method* [sets kArg0] // TUNING: we can save a reg copy if Method* has been promoted. - cg->LoadCurrMethodDirect(cg->TargetReg(kArg0)); + cg->LoadCurrMethodDirect(arg0_ref); break; case 1: // Get method->dex_cache_resolved_methods_ - cg->LoadRefDisp(cg->TargetReg(kArg0), + cg->LoadRefDisp(arg0_ref, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(), - cg->TargetReg(kArg0), + arg0_ref, kNotVolatile); // Set up direct code if known. if (direct_code != 0) { @@ -637,15 +659,15 @@ static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info, break; case 2: // Grab target method* CHECK_EQ(cu->dex_file, target_method.dex_file); - cg->LoadRefDisp(cg->TargetReg(kArg0), + cg->LoadRefDisp(arg0_ref, ObjArray::OffsetOfElement(target_method.dex_method_index).Int32Value(), - cg->TargetReg(kArg0), + arg0_ref, kNotVolatile); break; case 3: // Grab the code from the method* if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { if (direct_code == 0) { - cg->LoadWordDisp(cg->TargetReg(kArg0), + cg->LoadWordDisp(arg0_ref, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value(), cg->TargetReg(kInvokeTgt)); } @@ -678,13 +700,13 @@ static int NextVCallInsn(CompilationUnit* cu, CallInfo* info, switch (state) { case 0: { // Get "this" [set kArg1] RegLocation rl_arg = info->args[0]; - cg->LoadValueDirectFixed(rl_arg, cg->TargetReg(kArg1)); + cg->LoadValueDirectFixed(rl_arg, cg->TargetRefReg(kArg1)); break; } case 1: // Is "this" null? [use kArg1] - cg->GenNullCheck(cg->TargetReg(kArg1), info->opt_flags); + cg->GenNullCheck(cg->TargetRefReg(kArg1), info->opt_flags); // get this->klass_ [use kArg1, set kInvokeTgt] - cg->LoadRefDisp(cg->TargetReg(kArg1), mirror::Object::ClassOffset().Int32Value(), + cg->LoadRefDisp(cg->TargetRefReg(kArg1), mirror::Object::ClassOffset().Int32Value(), cg->TargetReg(kInvokeTgt), kNotVolatile); cg->MarkPossibleNullPointerException(info->opt_flags); @@ -697,12 +719,12 @@ static int NextVCallInsn(CompilationUnit* cu, CallInfo* info, case 3: // Get target method [use kInvokeTgt, set kArg0] cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), ObjArray::OffsetOfElement(method_idx).Int32Value(), - cg->TargetReg(kArg0), + cg->TargetRefReg(kArg0), kNotVolatile); break; case 4: // Get the compiled code address [uses kArg0, sets kInvokeTgt] if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { - cg->LoadWordDisp(cg->TargetReg(kArg0), + cg->LoadWordDisp(cg->TargetRefReg(kArg0), mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value(), cg->TargetReg(kInvokeTgt)); break; @@ -736,13 +758,13 @@ static int NextInterfaceCallInsn(CompilationUnit* cu, CallInfo* info, int state, break; case 1: { // Get "this" [set kArg1] RegLocation rl_arg = info->args[0]; - cg->LoadValueDirectFixed(rl_arg, cg->TargetReg(kArg1)); + cg->LoadValueDirectFixed(rl_arg, cg->TargetRefReg(kArg1)); break; } case 2: // Is "this" null? [use kArg1] - cg->GenNullCheck(cg->TargetReg(kArg1), info->opt_flags); + cg->GenNullCheck(cg->TargetRefReg(kArg1), info->opt_flags); // Get this->klass_ [use kArg1, set kInvokeTgt] - cg->LoadRefDisp(cg->TargetReg(kArg1), mirror::Object::ClassOffset().Int32Value(), + cg->LoadRefDisp(cg->TargetRefReg(kArg1), mirror::Object::ClassOffset().Int32Value(), cg->TargetReg(kInvokeTgt), kNotVolatile); cg->MarkPossibleNullPointerException(info->opt_flags); @@ -757,12 +779,12 @@ static int NextInterfaceCallInsn(CompilationUnit* cu, CallInfo* info, int state, // NOTE: native pointer. 
cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), ObjArray::OffsetOfElement(method_idx % ClassLinker::kImtSize).Int32Value(), - cg->TargetReg(kArg0), + cg->TargetRefReg(kArg0), kNotVolatile); break; case 5: // Get the compiled code address [use kArg0, set kInvokeTgt] if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { - cg->LoadWordDisp(cg->TargetReg(kArg0), + cg->LoadWordDisp(cg->TargetRefReg(kArg0), mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value(), cg->TargetReg(kInvokeTgt)); break; diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc index bfb77fc222..1cddeb9771 100644 --- a/compiler/dex/quick/gen_loadstore.cc +++ b/compiler/dex/quick/gen_loadstore.cc @@ -192,7 +192,7 @@ void Mir2Lir::StoreValue(RegLocation rl_dest, RegLocation rl_src) { IsPromoted(rl_src.reg) || (rl_dest.location == kLocPhysReg)) { // Src is live/promoted or Dest has assigned reg. - rl_dest = EvalLoc(rl_dest, kAnyReg, false); + rl_dest = EvalLoc(rl_dest, rl_dest.ref || rl_src.ref ? kRefReg : kAnyReg, false); OpRegCopy(rl_dest.reg, rl_src.reg); } else { // Just re-assign the registers. Dest gets Src's regs @@ -201,7 +201,7 @@ void Mir2Lir::StoreValue(RegLocation rl_dest, RegLocation rl_src) { } } else { // Load Src either into promoted Dest or temps allocated for Dest - rl_dest = EvalLoc(rl_dest, kAnyReg, false); + rl_dest = EvalLoc(rl_dest, rl_dest.ref ? kRefReg : kAnyReg, false); LoadValueDirect(rl_src, rl_dest.reg); } diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h index 9912101eb1..9a62255f5d 100644 --- a/compiler/dex/quick/mir_to_lir-inl.h +++ b/compiler/dex/quick/mir_to_lir-inl.h @@ -253,6 +253,19 @@ inline art::Mir2Lir::RegisterInfo* Mir2Lir::GetRegInfo(RegStorage reg) { return res; } +inline void Mir2Lir::CheckRegLocation(RegLocation rl) const { + if (kFailOnSizeError || kReportSizeError) { + CheckRegLocationImpl(rl, kFailOnSizeError, kReportSizeError); + } +} + +inline void Mir2Lir::CheckRegStorage(RegStorage rs, WidenessCheck wide, RefCheck ref, FPCheck fp) + const { + if (kFailOnSizeError || kReportSizeError) { + CheckRegStorageImpl(rs, wide, ref, fp, kFailOnSizeError, kReportSizeError); + } +} + } // namespace art #endif // ART_COMPILER_DEX_QUICK_MIR_TO_LIR_INL_H_ diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc index 5d68187d8b..984e8ea5f8 100644 --- a/compiler/dex/quick/mir_to_lir.cc +++ b/compiler/dex/quick/mir_to_lir.cc @@ -1267,4 +1267,55 @@ LIR* Mir2Lir::LIRSlowPath::GenerateTargetLabel(int opcode) { return target; } + +void Mir2Lir::CheckRegStorageImpl(RegStorage rs, WidenessCheck wide, RefCheck ref, FPCheck fp, + bool fail, bool report) + const { + if (rs.Valid()) { + if (ref == RefCheck::kCheckRef) { + if (cu_->target64 && !rs.Is64Bit()) { + if (fail) { + CHECK(false) << "Reg storage not 64b for ref."; + } else if (report) { + LOG(WARNING) << "Reg storage not 64b for ref."; + } + } + } + if (wide == WidenessCheck::kCheckWide) { + if (!rs.Is64Bit()) { + if (fail) { + CHECK(false) << "Reg storage not 64b for wide."; + } else if (report) { + LOG(WARNING) << "Reg storage not 64b for wide."; + } + } + } + // A tighter check would be nice, but for now soft-float will not check float at all. 
+ if (fp == FPCheck::kCheckFP && cu_->instruction_set != kArm) { + if (!rs.IsFloat()) { + if (fail) { + CHECK(false) << "Reg storage not float for fp."; + } else if (report) { + LOG(WARNING) << "Reg storage not float for fp."; + } + } + } else if (fp == FPCheck::kCheckNotFP) { + if (rs.IsFloat()) { + if (fail) { + CHECK(false) << "Reg storage float for not-fp."; + } else if (report) { + LOG(WARNING) << "Reg storage float for not-fp."; + } + } + } + } +} + +void Mir2Lir::CheckRegLocationImpl(RegLocation rl, bool fail, bool report) const { + // Regrettably can't use the fp part of rl, as that is not really indicative of where a value + // will be stored. + CheckRegStorageImpl(rl.reg, rl.wide ? WidenessCheck::kCheckWide : WidenessCheck::kCheckNotWide, + rl.ref ? RefCheck::kCheckRef : RefCheck::kCheckNotRef, FPCheck::kIgnoreFP, fail, report); +} + } // namespace art diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index 171e871393..0c00df39f8 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -21,6 +21,7 @@ #include "compiled_method.h" #include "dex/compiler_enums.h" #include "dex/compiler_ir.h" +#include "dex/reg_location.h" #include "dex/reg_storage.h" #include "dex/backend.h" #include "dex/quick/resource_mask.h" @@ -124,7 +125,6 @@ struct CompilationUnit; struct InlineMethod; struct MIR; struct LIR; -struct RegLocation; struct RegisterInfo; class DexFileMethodInliner; class MIRGraph; @@ -237,6 +237,9 @@ COMPILE_ASSERT(!IsLargeFrame(kSmallFrameSize, kX86_64), class Mir2Lir : public Backend { public: + static constexpr bool kFailOnSizeError = true && kIsDebugBuild; + static constexpr bool kReportSizeError = true && kIsDebugBuild; + /* * Auxiliary information describing the location of data embedded in the Dalvik * byte code stream. @@ -1171,7 +1174,43 @@ class Mir2Lir : public Backend { virtual void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) = 0; // Required for target - register utilities. + + /** + * @brief Portable way of getting special registers from the backend. + * @param reg Enumeration describing the purpose of the register. + * @return Return the #RegStorage corresponding to the given purpose @p reg. + * @note This function is currently allowed to return any suitable view of the registers + * (e.g. this could be 64-bit solo or 32-bit solo for 64-bit backends). + */ virtual RegStorage TargetReg(SpecialTargetRegister reg) = 0; + + /** + * @brief Portable way of getting special registers from the backend. + * @param reg Enumeration describing the purpose of the register. + * @param is_wide Whether the view should be 64-bit (rather than 32-bit). + * @return Return the #RegStorage corresponding to the given purpose @p reg. + */ + virtual RegStorage TargetReg(SpecialTargetRegister reg, bool is_wide) { + return TargetReg(reg); + } + + /** + * @brief Portable way of getting a special register for storing a reference. + * @see TargetReg() + */ + virtual RegStorage TargetRefReg(SpecialTargetRegister reg) { + return TargetReg(reg); + } + + // Get a reg storage corresponding to the wide & ref flags of the reg location. 
+ virtual RegStorage TargetReg(SpecialTargetRegister reg, RegLocation loc) { + if (loc.ref) { + return TargetRefReg(reg); + } else { + return TargetReg(reg, loc.wide); + } + } + virtual RegStorage GetArgMappingToPhysicalReg(int arg_num) = 0; virtual RegLocation GetReturnAlt() = 0; virtual RegLocation GetReturnWideAlt() = 0; @@ -1567,6 +1606,45 @@ class Mir2Lir : public Backend { */ virtual void GenConst(RegLocation rl_dest, int value); + enum class WidenessCheck { // private + kIgnoreWide, + kCheckWide, + kCheckNotWide + }; + + enum class RefCheck { // private + kIgnoreRef, + kCheckRef, + kCheckNotRef + }; + + enum class FPCheck { // private + kIgnoreFP, + kCheckFP, + kCheckNotFP + }; + + /** + * Check whether a reg storage seems well-formed, that is, if a reg storage is valid, + * that it has the expected form for the flags. + * A flag value of 0 means ignore. A flag value of -1 means false. A flag value of 1 means true. + */ + void CheckRegStorageImpl(RegStorage rs, WidenessCheck wide, RefCheck ref, FPCheck fp, bool fail, + bool report) + const; + + /** + * Check whether a reg location seems well-formed, that is, if a reg storage is encoded, + * that it has the expected size. + */ + void CheckRegLocationImpl(RegLocation rl, bool fail, bool report) const; + + // See CheckRegStorageImpl. Will print or fail depending on kFailOnSizeError and + // kReportSizeError. + void CheckRegStorage(RegStorage rs, WidenessCheck wide, RefCheck ref, FPCheck fp) const; + // See CheckRegLocationImpl. + void CheckRegLocation(RegLocation rl) const; + public: // TODO: add accessors for these. LIR* literal_list_; // Constants. diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc index 81dabd448e..38370ad889 100644 --- a/compiler/dex/quick/ralloc_util.cc +++ b/compiler/dex/quick/ralloc_util.cc @@ -420,24 +420,28 @@ RegStorage Mir2Lir::AllocTempWide() { RegStorage high_reg = AllocTemp(); res = RegStorage::MakeRegPair(low_reg, high_reg); } + CheckRegStorage(res, WidenessCheck::kCheckWide, RefCheck::kIgnoreRef, FPCheck::kCheckNotFP); return res; } RegStorage Mir2Lir::AllocTempRef() { RegStorage res = AllocTempBody(*reg_pool_->ref_regs_, reg_pool_->next_ref_reg_, true); DCHECK(!res.IsPair()); + CheckRegStorage(res, WidenessCheck::kCheckNotWide, RefCheck::kCheckRef, FPCheck::kCheckNotFP); return res; } RegStorage Mir2Lir::AllocTempSingle() { RegStorage res = AllocTempBody(reg_pool_->sp_regs_, ®_pool_->next_sp_reg_, true); DCHECK(res.IsSingle()) << "Reg: 0x" << std::hex << res.GetRawBits(); + CheckRegStorage(res, WidenessCheck::kCheckNotWide, RefCheck::kCheckNotRef, FPCheck::kIgnoreFP); return res; } RegStorage Mir2Lir::AllocTempDouble() { RegStorage res = AllocTempBody(reg_pool_->dp_regs_, ®_pool_->next_dp_reg_, true); DCHECK(res.IsDouble()) << "Reg: 0x" << std::hex << res.GetRawBits(); + CheckRegStorage(res, WidenessCheck::kCheckWide, RefCheck::kCheckNotRef, FPCheck::kIgnoreFP); return res; } @@ -474,13 +478,15 @@ RegStorage Mir2Lir::AllocLiveReg(int s_reg, int reg_class, bool wide) { RegStorage reg; if (reg_class == kRefReg) { reg = FindLiveReg(*reg_pool_->ref_regs_, s_reg); + CheckRegStorage(reg, WidenessCheck::kCheckNotWide, RefCheck::kCheckRef, FPCheck::kCheckNotFP); } if (!reg.Valid() && ((reg_class == kAnyReg) || (reg_class == kFPReg))) { reg = FindLiveReg(wide ? reg_pool_->dp_regs_ : reg_pool_->sp_regs_, s_reg); } if (!reg.Valid() && (reg_class != kFPReg)) { if (cu_->target64) { - reg = FindLiveReg(wide ? 
reg_pool_->core64_regs_ : reg_pool_->core_regs_, s_reg); + reg = FindLiveReg(wide || reg_class == kRefReg ? reg_pool_->core64_regs_ : + reg_pool_->core_regs_, s_reg); } else { reg = FindLiveReg(reg_pool_->core_regs_, s_reg); } @@ -525,6 +531,9 @@ RegStorage Mir2Lir::AllocLiveReg(int s_reg, int reg_class, bool wide) { ClobberSReg(s_reg + 1); } } + CheckRegStorage(reg, WidenessCheck::kIgnoreWide, + reg_class == kRefReg ? RefCheck::kCheckRef : RefCheck::kIgnoreRef, + FPCheck::kIgnoreFP); return reg; } @@ -996,7 +1005,7 @@ RegLocation Mir2Lir::UpdateLoc(RegLocation loc) { if (loc.location != kLocPhysReg) { DCHECK((loc.location == kLocDalvikFrame) || (loc.location == kLocCompilerTemp)); - RegStorage reg = AllocLiveReg(loc.s_reg_low, kAnyReg, false); + RegStorage reg = AllocLiveReg(loc.s_reg_low, loc.ref ? kRefReg : kAnyReg, false); if (reg.Valid()) { bool match = true; RegisterInfo* info = GetRegInfo(reg); @@ -1010,6 +1019,7 @@ RegLocation Mir2Lir::UpdateLoc(RegLocation loc) { FreeTemp(reg); } } + CheckRegLocation(loc); } return loc; } @@ -1044,6 +1054,7 @@ RegLocation Mir2Lir::UpdateLocWide(RegLocation loc) { FreeTemp(reg); } } + CheckRegLocation(loc); } return loc; } @@ -1073,6 +1084,7 @@ RegLocation Mir2Lir::EvalLocWide(RegLocation loc, int reg_class, bool update) { MarkWide(loc.reg); MarkLive(loc); } + CheckRegLocation(loc); return loc; } @@ -1086,10 +1098,16 @@ RegLocation Mir2Lir::EvalLocWide(RegLocation loc, int reg_class, bool update) { loc.location = kLocPhysReg; MarkLive(loc); } + CheckRegLocation(loc); return loc; } RegLocation Mir2Lir::EvalLoc(RegLocation loc, int reg_class, bool update) { + // Narrow reg_class if the loc is a ref. + if (loc.ref && reg_class == kAnyReg) { + reg_class = kRefReg; + } + if (loc.wide) { return EvalLocWide(loc, reg_class, update); } @@ -1106,17 +1124,20 @@ RegLocation Mir2Lir::EvalLoc(RegLocation loc, int reg_class, bool update) { loc.reg = new_reg; MarkLive(loc); } + CheckRegLocation(loc); return loc; } DCHECK_NE(loc.s_reg_low, INVALID_SREG); loc.reg = AllocTypedTemp(loc.fp, reg_class); + CheckRegLocation(loc); if (update) { loc.location = kLocPhysReg; MarkLive(loc); } + CheckRegLocation(loc); return loc; } @@ -1338,6 +1359,7 @@ RegLocation Mir2Lir::GetReturnWide(RegisterClass reg_class) { Clobber(res.reg); LockTemp(res.reg); MarkWide(res.reg); + CheckRegLocation(res); return res; } @@ -1354,6 +1376,7 @@ RegLocation Mir2Lir::GetReturn(RegisterClass reg_class) { } else { LockTemp(res.reg); } + CheckRegLocation(res); return res; } diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index b15591b413..64b4af86a2 100644 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -1883,54 +1883,42 @@ void X86Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { */ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); for (int i = 0; i < cu_->num_ins; i++) { - PromotionMap* v_map = &promotion_map_[start_vreg + i]; - RegStorage reg = RegStorage::InvalidReg(); // get reg corresponding to input - reg = GetArgMappingToPhysicalReg(i); + RegStorage reg = GetArgMappingToPhysicalReg(i); + RegLocation* t_loc = &ArgLocs[i]; if (reg.Valid()) { - // If arriving in register - bool need_flush = true; - RegLocation* t_loc = &ArgLocs[i]; - if ((v_map->core_location == kLocPhysReg) && !t_loc->fp) { - OpRegCopy(RegStorage::Solo32(v_map->core_reg), reg); - need_flush = false; - } else if ((v_map->fp_location == kLocPhysReg) && t_loc->fp) { - OpRegCopy(RegStorage::Solo32(v_map->FpReg), reg); - 
need_flush = false; - } else { - need_flush = true; - } + // If arriving in register. - // For wide args, force flush if not fully promoted - if (t_loc->wide) { - PromotionMap* p_map = v_map + (t_loc->high_word ? -1 : +1); - // Is only half promoted? - need_flush |= (p_map->core_location != v_map->core_location) || - (p_map->fp_location != v_map->fp_location); - } - if (need_flush) { - if (t_loc->wide && t_loc->fp) { - StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64, kNotVolatile); - // Increment i to skip the next one - i++; - } else if (t_loc->wide && !t_loc->fp) { - StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64, kNotVolatile); - // Increment i to skip the next one - i++; + // We have already updated the arg location with promoted info + // so we can be based on it. + if (t_loc->location == kLocPhysReg) { + // Just copy it. + OpRegCopy(t_loc->reg, reg); + } else { + // Needs flush. + if (t_loc->ref) { + StoreRefDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, kNotVolatile); } else { - Store32Disp(TargetReg(kSp), SRegOffset(start_vreg + i), reg); + StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, t_loc->wide ? k64 : k32, + kNotVolatile); } } } else { - // If arriving in frame & promoted - if (v_map->core_location == kLocPhysReg) { - Load32Disp(TargetReg(kSp), SRegOffset(start_vreg + i), RegStorage::Solo32(v_map->core_reg)); - } - if (v_map->fp_location == kLocPhysReg) { - Load32Disp(TargetReg(kSp), SRegOffset(start_vreg + i), RegStorage::Solo32(v_map->FpReg)); + // If arriving in frame & promoted. + if (t_loc->location == kLocPhysReg) { + if (t_loc->ref) { + LoadRefDisp(TargetReg(kSp), SRegOffset(start_vreg + i), t_loc->reg, kNotVolatile); + } else { + LoadBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), t_loc->reg, + t_loc->wide ? k64 : k32, kNotVolatile); + } } } + if (t_loc->wide) { + // Increment i to skip the next one. + i++; + } } } diff --git a/compiler/dex/reg_location.h b/compiler/dex/reg_location.h new file mode 100644 index 0000000000..38f59dac5f --- /dev/null +++ b/compiler/dex/reg_location.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_DEX_REG_LOCATION_H_ +#define ART_COMPILER_DEX_REG_LOCATION_H_ + +#include "reg_storage.h" + +namespace art { + + +/* + * Whereas a SSA name describes a definition of a Dalvik vreg, the RegLocation describes + * the type of an SSA name (and, can also be used by code generators to record where the + * value is located (i.e. - physical register, frame, spill, etc.). For each SSA name (SReg) + * there is a RegLocation. + * A note on SSA names: + * o SSA names for Dalvik vRegs v0..vN will be assigned 0..N. These represent the "vN_0" + * names. Negative SSA names represent special values not present in the Dalvik byte code. + * For example, SSA name -1 represents an invalid SSA name, and SSA name -2 represents the + * the Method pointer. 
SSA names < -2 are reserved for future use. + * o The vN_0 names for non-argument Dalvik should in practice never be used (as they would + * represent the read of an undefined local variable). The first definition of the + * underlying Dalvik vReg will result in a vN_1 name. + * + * FIXME: The orig_sreg field was added as a workaround for llvm bitcode generation. With + * the latest restructuring, we should be able to remove it and rely on s_reg_low throughout. + */ +struct RegLocation { + RegLocationType location:3; + unsigned wide:1; + unsigned defined:1; // Do we know the type? + unsigned is_const:1; // Constant, value in mir_graph->constant_values[]. + unsigned fp:1; // Floating point? + unsigned core:1; // Non-floating point? + unsigned ref:1; // Something GC cares about. + unsigned high_word:1; // High word of pair? + unsigned home:1; // Does this represent the home location? + RegStorage reg; // Encoded physical registers. + int16_t s_reg_low; // SSA name for low Dalvik word. + int16_t orig_sreg; // TODO: remove after Bitcode gen complete + // and consolidate usage w/ s_reg_low. +}; + +} // namespace art + +#endif // ART_COMPILER_DEX_REG_LOCATION_H_ diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc index 8f4eddbea3..25b489ba79 100644 --- a/compiler/jni/jni_compiler_test.cc +++ b/compiler/jni/jni_compiler_test.cc @@ -1284,13 +1284,6 @@ TEST_F(JniCompilerTest, WithoutImplementation) { EXPECT_TRUE(env_->ExceptionCheck() == JNI_TRUE); } -template <typename U, typename V> V convert(U in) { - DCHECK_LE(sizeof(U), sizeof(V)); - union { U u; V v; } tmp; - tmp.u = in; - return tmp.v; -} - void Java_MyClassNatives_stackArgsIntsFirst(JNIEnv* env, jclass klass, jint i1, jint i2, jint i3, jint i4, jint i5, jint i6, jint i7, jint i8, jint i9, jint i10, jfloat f1, jfloat f2, jfloat f3, jfloat f4, @@ -1307,25 +1300,25 @@ void Java_MyClassNatives_stackArgsIntsFirst(JNIEnv* env, jclass klass, jint i1, EXPECT_EQ(i9, 9); EXPECT_EQ(i10, 10); - jint i11 = convert<jfloat, jint>(f1); + jint i11 = bit_cast<jfloat, jint>(f1); EXPECT_EQ(i11, 11); - jint i12 = convert<jfloat, jint>(f2); + jint i12 = bit_cast<jfloat, jint>(f2); EXPECT_EQ(i12, 12); - jint i13 = convert<jfloat, jint>(f3); + jint i13 = bit_cast<jfloat, jint>(f3); EXPECT_EQ(i13, 13); - jint i14 = convert<jfloat, jint>(f4); + jint i14 = bit_cast<jfloat, jint>(f4); EXPECT_EQ(i14, 14); - jint i15 = convert<jfloat, jint>(f5); + jint i15 = bit_cast<jfloat, jint>(f5); EXPECT_EQ(i15, 15); - jint i16 = convert<jfloat, jint>(f6); + jint i16 = bit_cast<jfloat, jint>(f6); EXPECT_EQ(i16, 16); - jint i17 = convert<jfloat, jint>(f7); + jint i17 = bit_cast<jfloat, jint>(f7); EXPECT_EQ(i17, 17); - jint i18 = convert<jfloat, jint>(f8); + jint i18 = bit_cast<jfloat, jint>(f8); EXPECT_EQ(i18, 18); - jint i19 = convert<jfloat, jint>(f9); + jint i19 = bit_cast<jfloat, jint>(f9); EXPECT_EQ(i19, 19); - jint i20 = convert<jfloat, jint>(f10); + jint i20 = bit_cast<jfloat, jint>(f10); EXPECT_EQ(i20, 20); } @@ -1345,16 +1338,16 @@ TEST_F(JniCompilerTest, StackArgsIntsFirst) { jint i9 = 9; jint i10 = 10; - jfloat f1 = convert<jint, jfloat>(11); - jfloat f2 = convert<jint, jfloat>(12); - jfloat f3 = convert<jint, jfloat>(13); - jfloat f4 = convert<jint, jfloat>(14); - jfloat f5 = convert<jint, jfloat>(15); - jfloat f6 = convert<jint, jfloat>(16); - jfloat f7 = convert<jint, jfloat>(17); - jfloat f8 = convert<jint, jfloat>(18); - jfloat f9 = convert<jint, jfloat>(19); - jfloat f10 = convert<jint, jfloat>(20); + jfloat f1 = bit_cast<jint, 
jfloat>(11); + jfloat f2 = bit_cast<jint, jfloat>(12); + jfloat f3 = bit_cast<jint, jfloat>(13); + jfloat f4 = bit_cast<jint, jfloat>(14); + jfloat f5 = bit_cast<jint, jfloat>(15); + jfloat f6 = bit_cast<jint, jfloat>(16); + jfloat f7 = bit_cast<jint, jfloat>(17); + jfloat f8 = bit_cast<jint, jfloat>(18); + jfloat f9 = bit_cast<jint, jfloat>(19); + jfloat f10 = bit_cast<jint, jfloat>(20); env_->CallStaticVoidMethod(jklass_, jmethod_, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10); @@ -1376,25 +1369,25 @@ void Java_MyClassNatives_stackArgsFloatsFirst(JNIEnv* env, jclass klass, jfloat EXPECT_EQ(i9, 9); EXPECT_EQ(i10, 10); - jint i11 = convert<jfloat, jint>(f1); + jint i11 = bit_cast<jfloat, jint>(f1); EXPECT_EQ(i11, 11); - jint i12 = convert<jfloat, jint>(f2); + jint i12 = bit_cast<jfloat, jint>(f2); EXPECT_EQ(i12, 12); - jint i13 = convert<jfloat, jint>(f3); + jint i13 = bit_cast<jfloat, jint>(f3); EXPECT_EQ(i13, 13); - jint i14 = convert<jfloat, jint>(f4); + jint i14 = bit_cast<jfloat, jint>(f4); EXPECT_EQ(i14, 14); - jint i15 = convert<jfloat, jint>(f5); + jint i15 = bit_cast<jfloat, jint>(f5); EXPECT_EQ(i15, 15); - jint i16 = convert<jfloat, jint>(f6); + jint i16 = bit_cast<jfloat, jint>(f6); EXPECT_EQ(i16, 16); - jint i17 = convert<jfloat, jint>(f7); + jint i17 = bit_cast<jfloat, jint>(f7); EXPECT_EQ(i17, 17); - jint i18 = convert<jfloat, jint>(f8); + jint i18 = bit_cast<jfloat, jint>(f8); EXPECT_EQ(i18, 18); - jint i19 = convert<jfloat, jint>(f9); + jint i19 = bit_cast<jfloat, jint>(f9); EXPECT_EQ(i19, 19); - jint i20 = convert<jfloat, jint>(f10); + jint i20 = bit_cast<jfloat, jint>(f10); EXPECT_EQ(i20, 20); } @@ -1414,16 +1407,16 @@ TEST_F(JniCompilerTest, StackArgsFloatsFirst) { jint i9 = 9; jint i10 = 10; - jfloat f1 = convert<jint, jfloat>(11); - jfloat f2 = convert<jint, jfloat>(12); - jfloat f3 = convert<jint, jfloat>(13); - jfloat f4 = convert<jint, jfloat>(14); - jfloat f5 = convert<jint, jfloat>(15); - jfloat f6 = convert<jint, jfloat>(16); - jfloat f7 = convert<jint, jfloat>(17); - jfloat f8 = convert<jint, jfloat>(18); - jfloat f9 = convert<jint, jfloat>(19); - jfloat f10 = convert<jint, jfloat>(20); + jfloat f1 = bit_cast<jint, jfloat>(11); + jfloat f2 = bit_cast<jint, jfloat>(12); + jfloat f3 = bit_cast<jint, jfloat>(13); + jfloat f4 = bit_cast<jint, jfloat>(14); + jfloat f5 = bit_cast<jint, jfloat>(15); + jfloat f6 = bit_cast<jint, jfloat>(16); + jfloat f7 = bit_cast<jint, jfloat>(17); + jfloat f8 = bit_cast<jint, jfloat>(18); + jfloat f9 = bit_cast<jint, jfloat>(19); + jfloat f10 = bit_cast<jint, jfloat>(20); env_->CallStaticVoidMethod(jklass_, jmethod_, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10); @@ -1444,25 +1437,25 @@ void Java_MyClassNatives_stackArgsMixed(JNIEnv* env, jclass klass, jint i1, jflo EXPECT_EQ(i9, 9); EXPECT_EQ(i10, 10); - jint i11 = convert<jfloat, jint>(f1); + jint i11 = bit_cast<jfloat, jint>(f1); EXPECT_EQ(i11, 11); - jint i12 = convert<jfloat, jint>(f2); + jint i12 = bit_cast<jfloat, jint>(f2); EXPECT_EQ(i12, 12); - jint i13 = convert<jfloat, jint>(f3); + jint i13 = bit_cast<jfloat, jint>(f3); EXPECT_EQ(i13, 13); - jint i14 = convert<jfloat, jint>(f4); + jint i14 = bit_cast<jfloat, jint>(f4); EXPECT_EQ(i14, 14); - jint i15 = convert<jfloat, jint>(f5); + jint i15 = bit_cast<jfloat, jint>(f5); EXPECT_EQ(i15, 15); - jint i16 = convert<jfloat, jint>(f6); + jint i16 = bit_cast<jfloat, jint>(f6); EXPECT_EQ(i16, 16); - jint i17 = convert<jfloat, jint>(f7); + jint i17 = 
bit_cast<jfloat, jint>(f7); EXPECT_EQ(i17, 17); - jint i18 = convert<jfloat, jint>(f8); + jint i18 = bit_cast<jfloat, jint>(f8); EXPECT_EQ(i18, 18); - jint i19 = convert<jfloat, jint>(f9); + jint i19 = bit_cast<jfloat, jint>(f9); EXPECT_EQ(i19, 19); - jint i20 = convert<jfloat, jint>(f10); + jint i20 = bit_cast<jfloat, jint>(f10); EXPECT_EQ(i20, 20); } @@ -1482,16 +1475,16 @@ TEST_F(JniCompilerTest, StackArgsMixed) { jint i9 = 9; jint i10 = 10; - jfloat f1 = convert<jint, jfloat>(11); - jfloat f2 = convert<jint, jfloat>(12); - jfloat f3 = convert<jint, jfloat>(13); - jfloat f4 = convert<jint, jfloat>(14); - jfloat f5 = convert<jint, jfloat>(15); - jfloat f6 = convert<jint, jfloat>(16); - jfloat f7 = convert<jint, jfloat>(17); - jfloat f8 = convert<jint, jfloat>(18); - jfloat f9 = convert<jint, jfloat>(19); - jfloat f10 = convert<jint, jfloat>(20); + jfloat f1 = bit_cast<jint, jfloat>(11); + jfloat f2 = bit_cast<jint, jfloat>(12); + jfloat f3 = bit_cast<jint, jfloat>(13); + jfloat f4 = bit_cast<jint, jfloat>(14); + jfloat f5 = bit_cast<jint, jfloat>(15); + jfloat f6 = bit_cast<jint, jfloat>(16); + jfloat f7 = bit_cast<jint, jfloat>(17); + jfloat f8 = bit_cast<jint, jfloat>(18); + jfloat f9 = bit_cast<jint, jfloat>(19); + jfloat f10 = bit_cast<jint, jfloat>(20); env_->CallStaticVoidMethod(jklass_, jmethod_, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, i6, f6, i7, f7, i8, f8, i9, f9, i10, f10); diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc index c3a322caee..cc995f72a1 100644 --- a/compiler/optimizing/builder.cc +++ b/compiler/optimizing/builder.cc @@ -226,7 +226,7 @@ HBasicBlock* HGraphBuilder::FindBlockStartingAt(int32_t index) const { } template<typename T> -void HGraphBuilder::Binop_32x(const Instruction& instruction, Primitive::Type type) { +void HGraphBuilder::Binop_23x(const Instruction& instruction, Primitive::Type type) { HInstruction* first = LoadLocal(instruction.VRegB(), type); HInstruction* second = LoadLocal(instruction.VRegC(), type); current_block_->AddInstruction(new (arena_) T(type, first, second)); @@ -501,22 +501,22 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_ } case Instruction::ADD_INT: { - Binop_32x<HAdd>(instruction, Primitive::kPrimInt); + Binop_23x<HAdd>(instruction, Primitive::kPrimInt); break; } case Instruction::ADD_LONG: { - Binop_32x<HAdd>(instruction, Primitive::kPrimLong); + Binop_23x<HAdd>(instruction, Primitive::kPrimLong); break; } case Instruction::SUB_INT: { - Binop_32x<HSub>(instruction, Primitive::kPrimInt); + Binop_23x<HSub>(instruction, Primitive::kPrimInt); break; } case Instruction::SUB_LONG: { - Binop_32x<HSub>(instruction, Primitive::kPrimLong); + Binop_23x<HSub>(instruction, Primitive::kPrimLong); break; } @@ -573,6 +573,11 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_ UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction()); break; + case Instruction::CMP_LONG: { + Binop_23x<HCompare>(instruction, Primitive::kPrimLong); + break; + } + case Instruction::NOP: break; diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h index 0852a26c55..ee32ca80ac 100644 --- a/compiler/optimizing/builder.h +++ b/compiler/optimizing/builder.h @@ -73,7 +73,7 @@ class HGraphBuilder : public ValueObject { bool InitializeParameters(uint16_t number_of_parameters); template<typename T> - void Binop_32x(const Instruction& instruction, Primitive::Type type); + void Binop_23x(const Instruction& instruction, Primitive::Type type); 
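[Editor's note — illustrative sketch, not part of the patch.] The new HCompare node and the VisitCompare implementations added for ARM, x86 and x86-64 all follow the Dex cmp-long convention stated in nodes.h: the result is 0 when the inputs are equal, 1 when the first is greater, and -1 when it is less. On the 32-bit back ends the long operands live in register pairs, so the generated code compares the high words as signed values and, only when those are equal, the low words as unsigned values. The following standalone C++ sketch (function names are the editor's own, chosen only for illustration) restates both points:

  #include <cstdint>

  // Semantics of HCompare on longs, per the Dex cmp-long definition:
  // -1 if a < b, 0 if a == b, 1 if a > b.
  int CompareLongs(int64_t a, int64_t b) {
    if (a == b) return 0;
    return a < b ? -1 : 1;
  }

  // Equivalent pairwise comparison used by the 32-bit code generators:
  // high words compared signed, low words compared unsigned.
  int CompareLongPairs(int32_t a_hi, uint32_t a_lo, int32_t b_hi, uint32_t b_lo) {
    if (a_hi != b_hi) return a_hi < b_hi ? -1 : 1;  // signed compare
    if (a_lo != b_lo) return a_lo < b_lo ? -1 : 1;  // unsigned compare
    return 0;
  }
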
template<typename T> void Binop_12x(const Instruction& instruction, Primitive::Type type); @@ -84,11 +84,8 @@ class HGraphBuilder : public ValueObject { template<typename T> void Binop_22s(const Instruction& instruction, bool reverse); - template<typename T> - void If_22t(const Instruction& instruction, int32_t dex_offset); - - template<typename T> - void If_21t(const Instruction& instruction, int32_t dex_offset); + template<typename T> void If_21t(const Instruction& instruction, int32_t dex_offset); + template<typename T> void If_22t(const Instruction& instruction, int32_t dex_offset); void BuildReturn(const Instruction& instruction, Primitive::Type type); diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 83621e0f72..ae2f03080e 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -90,6 +90,7 @@ class CodeGenerator : public ArenaObject { virtual void SetupBlockedRegisters(bool* blocked_registers) const = 0; virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0; virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0; + virtual InstructionSet GetInstructionSet() const = 0; void RecordPcInfo(uint32_t dex_pc) { struct PcInfo pc_info; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index ec3c81533f..d87c14b4db 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -905,6 +905,48 @@ void InstructionCodeGeneratorARM::VisitNot(HNot* instruction) { locations->InAt(0).AsArm().AsCoreRegister(), ShifterOperand(1)); } +void LocationsBuilderARM::VisitCompare(HCompare* compare) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(compare); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister()); + compare->SetLocations(locations); +} + +void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { + Label greater, done; + LocationSummary* locations = compare->GetLocations(); + switch (compare->InputAt(0)->GetType()) { + case Primitive::kPrimLong: { + Register output = locations->Out().AsArm().AsCoreRegister(); + ArmManagedRegister left = locations->InAt(0).AsArm(); + ArmManagedRegister right = locations->InAt(1).AsArm(); + Label less, greater, done; + __ cmp(left.AsRegisterPairHigh(), + ShifterOperand(right.AsRegisterPairHigh())); // Signed compare. + __ b(&less, LT); + __ b(&greater, GT); + __ cmp(left.AsRegisterPairLow(), + ShifterOperand(right.AsRegisterPairLow())); // Unsigned compare. 
+ __ LoadImmediate(output, 0); + __ b(&done, EQ); + __ b(&less, CC); + + __ Bind(&greater); + __ LoadImmediate(output, 1); + __ b(&done); + + __ Bind(&less); + __ LoadImmediate(output, -1); + + __ Bind(&done); + break; + } + default: + LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType(); + } +} + void LocationsBuilderARM::VisitPhi(HPhi* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) { diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 712a24cf67..c46c1b131c 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -171,6 +171,10 @@ class CodeGeneratorARM : public CodeGenerator { return &move_resolver_; } + virtual InstructionSet GetInstructionSet() const OVERRIDE { + return InstructionSet::kArm; + } + private: // Helper method to move a 32bits value between two locations. void Move32(Location destination, Location source); diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index f624f3ce90..572d494719 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -81,12 +81,23 @@ ManagedRegister CodeGeneratorX86::AllocateFreeRegister(Primitive::Type type, bool* blocked_registers) const { switch (type) { case Primitive::kPrimLong: { - size_t reg = AllocateFreeRegisterInternal( - GetBlockedRegisterPairs(blocked_registers), kNumberOfRegisterPairs); + bool* blocked_register_pairs = GetBlockedRegisterPairs(blocked_registers); + size_t reg = AllocateFreeRegisterInternal(blocked_register_pairs, kNumberOfRegisterPairs); X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(static_cast<RegisterPair>(reg)); blocked_registers[pair.AsRegisterPairLow()] = true; blocked_registers[pair.AsRegisterPairHigh()] = true; + // Block all other register pairs that share a register with `pair`. + for (int i = 0; i < kNumberOfRegisterPairs; i++) { + X86ManagedRegister current = + X86ManagedRegister::FromRegisterPair(static_cast<RegisterPair>(i)); + if (current.AsRegisterPairLow() == pair.AsRegisterPairLow() + || current.AsRegisterPairLow() == pair.AsRegisterPairHigh() + || current.AsRegisterPairHigh() == pair.AsRegisterPairLow() + || current.AsRegisterPairHigh() == pair.AsRegisterPairHigh()) { + blocked_register_pairs[i] = true; + } + } return pair; } @@ -901,6 +912,46 @@ void InstructionCodeGeneratorX86::VisitNot(HNot* instruction) { __ xorl(out.AsX86().AsCpuRegister(), Immediate(1)); } +void LocationsBuilderX86::VisitCompare(HCompare* compare) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(compare); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister()); + compare->SetLocations(locations); +} + +void InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) { + Label greater, done; + LocationSummary* locations = compare->GetLocations(); + switch (compare->InputAt(0)->GetType()) { + case Primitive::kPrimLong: { + Label less, greater, done; + Register output = locations->Out().AsX86().AsCpuRegister(); + X86ManagedRegister left = locations->InAt(0).AsX86(); + X86ManagedRegister right = locations->InAt(1).AsX86(); + __ cmpl(left.AsRegisterPairHigh(), right.AsRegisterPairHigh()); + __ j(kLess, &less); // Signed compare. 
+ __ j(kGreater, &greater); // Signed compare. + __ cmpl(left.AsRegisterPairLow(), right.AsRegisterPairLow()); + __ movl(output, Immediate(0)); + __ j(kEqual, &done); + __ j(kBelow, &less); // Unsigned compare. + + __ Bind(&greater); + __ movl(output, Immediate(1)); + __ jmp(&done); + + __ Bind(&less); + __ movl(output, Immediate(-1)); + + __ Bind(&done); + break; + } + default: + LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType(); + } +} + void LocationsBuilderX86::VisitPhi(HPhi* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index acc670e09b..8a8216a56d 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -173,6 +173,10 @@ class CodeGeneratorX86 : public CodeGenerator { return &move_resolver_; } + virtual InstructionSet GetInstructionSet() const OVERRIDE { + return InstructionSet::kX86; + } + private: // Helper method to move a 32bits value between two locations. void Move32(Location destination, Location source); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 283f1f5e57..dc1d6164b1 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -228,7 +228,9 @@ void CodeGeneratorX86_64::Move(Location destination, Location source) { } } -void CodeGeneratorX86_64::Move(HInstruction* instruction, Location location, HInstruction* move_for) { +void CodeGeneratorX86_64::Move(HInstruction* instruction, + Location location, + HInstruction* move_for) { if (instruction->AsIntConstant() != nullptr) { Immediate imm(instruction->AsIntConstant()->GetValue()); if (location.IsRegister()) { @@ -383,7 +385,7 @@ void LocationsBuilderX86_64::VisitCondition(HCondition* comp) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(comp); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetOut(Location::RequiresRegister()); comp->SetLocations(locations); } @@ -444,6 +446,39 @@ void InstructionCodeGeneratorX86_64::VisitGreaterThanOrEqual(HGreaterThanOrEqual VisitCondition(comp); } +void LocationsBuilderX86_64::VisitCompare(HCompare* compare) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(compare); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister()); + compare->SetLocations(locations); +} + +void InstructionCodeGeneratorX86_64::VisitCompare(HCompare* compare) { + Label greater, done; + LocationSummary* locations = compare->GetLocations(); + switch (compare->InputAt(0)->GetType()) { + case Primitive::kPrimLong: + __ cmpq(locations->InAt(0).AsX86_64().AsCpuRegister(), + locations->InAt(1).AsX86_64().AsCpuRegister()); + break; + default: + LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType(); + } + + __ movl(locations->Out().AsX86_64().AsCpuRegister(), Immediate(0)); + __ j(kEqual, &done); + __ j(kGreater, &greater); + + __ movl(locations->Out().AsX86_64().AsCpuRegister(), Immediate(-1)); + __ jmp(&done); + + __ Bind(&greater); + __ movl(locations->Out().AsX86_64().AsCpuRegister(), Immediate(1)); + + __ Bind(&done); 
+} + void LocationsBuilderX86_64::VisitIntConstant(HIntConstant* constant) { // TODO: Support constant locations. LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(constant); @@ -463,7 +498,7 @@ void LocationsBuilderX86_64::VisitLongConstant(HLongConstant* constant) { } void InstructionCodeGeneratorX86_64::VisitLongConstant(HLongConstant* constant) { - // Will be generated at use site. + codegen_->Move(constant, constant->GetLocations()->Out(), nullptr); } void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) { @@ -812,10 +847,13 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { if (source.IsRegister()) { if (destination.IsRegister()) { __ movq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister()); - } else { - DCHECK(destination.IsStackSlot()); + } else if (destination.IsStackSlot()) { __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister()); + } else { + DCHECK(destination.IsDoubleStackSlot()); + __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), + source.AsX86_64().AsCpuRegister()); } } else if (source.IsStackSlot()) { if (destination.IsRegister()) { @@ -826,18 +864,27 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex())); __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); } + } else if (source.IsDoubleStackSlot()) { + if (destination.IsRegister()) { + __ movq(destination.AsX86_64().AsX86_64().AsCpuRegister(), + Address(CpuRegister(RSP), source.GetStackIndex())); + } else { + DCHECK(destination.IsDoubleStackSlot()); + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex())); + __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); + } } else { LOG(FATAL) << "Unimplemented"; } } -void ParallelMoveResolverX86_64::Exchange(CpuRegister reg, int mem) { +void ParallelMoveResolverX86_64::Exchange32(CpuRegister reg, int mem) { __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem)); - __ movl(Address(CpuRegister(RSP), mem), CpuRegister(reg)); - __ movl(CpuRegister(reg), CpuRegister(TMP)); + __ movl(Address(CpuRegister(RSP), mem), reg); + __ movl(reg, CpuRegister(TMP)); } -void ParallelMoveResolverX86_64::Exchange(int mem1, int mem2) { +void ParallelMoveResolverX86_64::Exchange32(int mem1, int mem2) { ScratchRegisterScope ensure_scratch( this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); @@ -850,6 +897,25 @@ void ParallelMoveResolverX86_64::Exchange(int mem1, int mem2) { CpuRegister(ensure_scratch.GetRegister())); } +void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) { + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem)); + __ movq(Address(CpuRegister(RSP), mem), reg); + __ movq(reg, CpuRegister(TMP)); +} + +void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) { + ScratchRegisterScope ensure_scratch( + this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); + + int stack_offset = ensure_scratch.IsSpilled() ? 
kX86_64WordSize : 0; + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); + __ movq(CpuRegister(ensure_scratch.GetRegister()), + Address(CpuRegister(RSP), mem2 + stack_offset)); + __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); + __ movq(Address(CpuRegister(RSP), mem1 + stack_offset), + CpuRegister(ensure_scratch.GetRegister())); +} + void ParallelMoveResolverX86_64::EmitSwap(size_t index) { MoveOperands* move = moves_.Get(index); Location source = move->GetSource(); @@ -858,11 +924,17 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) { if (source.IsRegister() && destination.IsRegister()) { __ xchgq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister()); } else if (source.IsRegister() && destination.IsStackSlot()) { - Exchange(source.AsX86_64().AsCpuRegister(), destination.GetStackIndex()); + Exchange32(source.AsX86_64().AsCpuRegister(), destination.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsRegister()) { - Exchange(destination.AsX86_64().AsCpuRegister(), source.GetStackIndex()); + Exchange32(destination.AsX86_64().AsCpuRegister(), source.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsStackSlot()) { - Exchange(destination.GetStackIndex(), source.GetStackIndex()); + Exchange32(destination.GetStackIndex(), source.GetStackIndex()); + } else if (source.IsRegister() && destination.IsDoubleStackSlot()) { + Exchange64(source.AsX86_64().AsCpuRegister(), destination.GetStackIndex()); + } else if (source.IsDoubleStackSlot() && destination.IsRegister()) { + Exchange64(destination.AsX86_64().AsCpuRegister(), source.GetStackIndex()); + } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) { + Exchange64(destination.GetStackIndex(), source.GetStackIndex()); } else { LOG(FATAL) << "Unimplemented"; } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index f07df292e0..d347a4f121 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -69,8 +69,10 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolver { X86_64Assembler* GetAssembler() const; private: - void Exchange(CpuRegister reg, int mem); - void Exchange(int mem1, int mem2); + void Exchange32(CpuRegister reg, int mem); + void Exchange32(int mem1, int mem2); + void Exchange64(CpuRegister reg, int mem); + void Exchange64(int mem1, int mem2); CodeGeneratorX86_64* const codegen_; @@ -170,6 +172,10 @@ class CodeGeneratorX86_64 : public CodeGenerator { virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE; + virtual InstructionSet GetInstructionSet() const OVERRIDE { + return InstructionSet::kX86_64; + } + private: // Helper method to move a value between two locations. 
void Move(Location destination, Location source); diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index a49ce64a2d..f033e2e22b 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -108,9 +108,11 @@ class HGraphVisualizerPrinter : public HGraphVisitor { } else { codegen_.DumpCoreRegister(output_, location.reg().RegId()); } - } else { - DCHECK(location.IsStackSlot()); + } else if (location.IsStackSlot()) { output_ << location.GetStackIndex() << "(sp)"; + } else { + DCHECK(location.IsDoubleStackSlot()); + output_ << "2x" << location.GetStackIndex() << "(sp)"; } } diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 503f31d990..92920845c3 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -414,6 +414,7 @@ class HBasicBlock : public ArenaObject { M(ReturnVoid) \ M(StoreLocal) \ M(Sub) \ + M(Compare) \ #define FORWARD_DECLARATION(type) class H##type; @@ -986,6 +987,22 @@ class HGreaterThanOrEqual : public HCondition { }; +// Instruction to check how two inputs compare to each other. +// Result is 0 if input0 == input1, 1 if input0 > input1, or -1 if input0 < input1. +class HCompare : public HBinaryOperation { + public: + HCompare(Primitive::Type type, HInstruction* first, HInstruction* second) + : HBinaryOperation(Primitive::kPrimInt, first, second) { + DCHECK_EQ(type, first->GetType()); + DCHECK_EQ(type, second->GetType()); + } + + DECLARE_INSTRUCTION(Compare); + + private: + DISALLOW_COPY_AND_ASSIGN(HCompare); +}; + // A local in the graph. Corresponds to a Dex register. class HLocal : public HTemplateInstruction<0> { public: diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index 1f4cb41582..68130dd5fc 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -55,7 +55,7 @@ bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph, it.Advance()) { HInstruction* current = it.Current(); if (current->NeedsEnvironment()) return false; - if (current->GetType() == Primitive::kPrimLong) return false; + if (current->GetType() == Primitive::kPrimLong && instruction_set != kX86_64) return false; if (current->GetType() == Primitive::kPrimFloat) return false; if (current->GetType() == Primitive::kPrimDouble) return false; } @@ -139,7 +139,7 @@ void RegisterAllocator::AllocateRegistersInternal() { current->SetFrom(position + 1); current->SetRegister(output.reg().RegId()); BlockRegister(output, position, position + 1, instruction->GetType()); - } else if (output.IsStackSlot()) { + } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) { current->SetSpillSlot(output.GetStackIndex()); } for (size_t i = 0; i < instruction->InputCount(); ++i) { @@ -430,7 +430,7 @@ bool RegisterAllocator::IsBlocked(int reg) const { // we spill `current` instead. 
bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) { size_t first_register_use = current->FirstRegisterUse(); - if (current->FirstRegisterUse() == kNoLifetime) { + if (first_register_use == kNoLifetime) { AllocateSpillSlotFor(current); return false; } @@ -559,6 +559,10 @@ LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) } } +static bool NeedTwoSpillSlot(Primitive::Type type) { + return type == Primitive::kPrimLong || type == Primitive::kPrimDouble; +} + void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) { LiveInterval* parent = interval->GetParent(); @@ -581,6 +585,43 @@ void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) { } size_t end = last_sibling->GetEnd(); + if (NeedTwoSpillSlot(parent->GetType())) { + AllocateTwoSpillSlots(parent, end); + } else { + AllocateOneSpillSlot(parent, end); + } +} + +void RegisterAllocator::AllocateTwoSpillSlots(LiveInterval* parent, size_t end) { + // Find an available spill slot. + size_t slot = 0; + for (size_t e = spill_slots_.Size(); slot < e; ++slot) { + // We check if it is less rather than less or equal because the parallel move + // resolver does not work when a single spill slot needs to be exchanged with + // a double spill slot. The strict comparison avoids needing to exchange these + // locations at the same lifetime position. + if (spill_slots_.Get(slot) < parent->GetStart() + && (slot == (e - 1) || spill_slots_.Get(slot + 1) < parent->GetStart())) { + break; + } + } + + if (slot == spill_slots_.Size()) { + // We need a new spill slot. + spill_slots_.Add(end); + spill_slots_.Add(end); + } else if (slot == spill_slots_.Size() - 1) { + spill_slots_.Put(slot, end); + spill_slots_.Add(end); + } else { + spill_slots_.Put(slot, end); + spill_slots_.Put(slot + 1, end); + } + + parent->SetSpillSlot(slot * kVRegSize); +} + +void RegisterAllocator::AllocateOneSpillSlot(LiveInterval* parent, size_t end) { // Find an available spill slot. size_t slot = 0; for (size_t e = spill_slots_.Size(); slot < e; ++slot) { @@ -604,7 +645,11 @@ static Location ConvertToLocation(LiveInterval* interval) { return Location::RegisterLocation(ManagedRegister(interval->GetRegister())); } else { DCHECK(interval->GetParent()->HasSpillSlot()); - return Location::StackSlot(interval->GetParent()->GetSpillSlot()); + if (NeedTwoSpillSlot(interval->GetType())) { + return Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()); + } else { + return Location::StackSlot(interval->GetParent()->GetSpillSlot()); + } } } @@ -750,7 +795,9 @@ void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { // We spill eagerly, so move must be at definition. InsertMoveAfter(interval->GetDefinedBy(), Location::RegisterLocation(ManagedRegister(interval->GetRegister())), - Location::StackSlot(interval->GetParent()->GetSpillSlot())); + NeedTwoSpillSlot(interval->GetType()) + ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) + : Location::StackSlot(interval->GetParent()->GetSpillSlot())); } UsePosition* use = current->GetFirstUse(); diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h index e63122ffed..7d4cd1a862 100644 --- a/compiler/optimizing/register_allocator.h +++ b/compiler/optimizing/register_allocator.h @@ -93,6 +93,8 @@ class RegisterAllocator { // Allocate a spill slot for the given interval. 
void AllocateSpillSlotFor(LiveInterval* interval); + void AllocateOneSpillSlot(LiveInterval* interval, size_t end); + void AllocateTwoSpillSlots(LiveInterval* interval, size_t end); // Connect adjacent siblings within blocks. void ConnectSiblings(LiveInterval* interval); diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index 41d1529ef5..4d5d613015 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -949,6 +949,14 @@ void X86_64Assembler::andl(CpuRegister dst, const Immediate& imm) { } +void X86_64Assembler::andq(CpuRegister reg, const Immediate& imm) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + CHECK(imm.is_int32()); // andq only supports 32b immediate. + EmitRex64(reg); + EmitComplex(4, Operand(reg), imm); +} + + void X86_64Assembler::orl(CpuRegister dst, CpuRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -972,6 +980,14 @@ void X86_64Assembler::xorl(CpuRegister dst, CpuRegister src) { } +void X86_64Assembler::xorq(CpuRegister dst, CpuRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitRex64(dst, src); + EmitUint8(0x33); + EmitOperand(dst.LowBits(), Operand(src)); +} + + void X86_64Assembler::xorq(CpuRegister dst, const Immediate& imm) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); CHECK(imm.is_int32()); // xorq only supports 32b immediate. diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 9aa5a54df4..7514854829 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -391,12 +391,14 @@ class X86_64Assembler FINAL : public Assembler { void andl(CpuRegister dst, const Immediate& imm); void andl(CpuRegister dst, CpuRegister src); + void andq(CpuRegister dst, const Immediate& imm); void orl(CpuRegister dst, const Immediate& imm); void orl(CpuRegister dst, CpuRegister src); void xorl(CpuRegister dst, CpuRegister src); void xorq(CpuRegister dst, const Immediate& imm); + void xorq(CpuRegister dst, CpuRegister src); void addl(CpuRegister dst, CpuRegister src); void addl(CpuRegister reg, const Immediate& imm); diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc index 280f6d050d..3387f914b6 100644 --- a/dex2oat/dex2oat.cc +++ b/dex2oat/dex2oat.cc @@ -998,7 +998,7 @@ static int dex2oat(int argc, char** argv) { } else if (option == "--no-profile-file") { // No profile } else if (option.starts_with("--top-k-profile-threshold=")) { - ParseDouble(option.data(), '=', 10.0, 90.0, &top_k_profile_threshold); + ParseDouble(option.data(), '=', 0.0, 100.0, &top_k_profile_threshold); } else if (option == "--print-pass-names") { PassDriverMEOpts::PrintPassNames(); } else if (option.starts_with("--disable-passes=")) { @@ -1166,7 +1166,6 @@ static int dex2oat(int argc, char** argv) { CheckExplicitCheckOptions(instruction_set, &explicit_null_checks, &explicit_so_checks, &explicit_suspend_checks); - LOG(INFO) << "init compiler options for explicit null: " << explicit_null_checks; CompilerOptions compiler_options(compiler_filter, huge_method_threshold, large_method_threshold, diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index b012bc1cc1..1d8cf9b4cd 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -896,6 +896,14 @@ DISASSEMBLER_ENTRY(cmp, case 0x99: opcode << "cdq"; break; + case 0x9B: + if (instr[1] == 0xDF && instr[2] == 0xE0) { + opcode << 
"fstsw\tax"; + instr += 2; + } else { + opcode << StringPrintf("unknown opcode '%02X'", *instr); + } + break; case 0xAF: opcode << (prefix[2] == 0x66 ? "scasw" : "scasl"); break; @@ -942,11 +950,25 @@ DISASSEMBLER_ENTRY(cmp, break; case 0xCC: opcode << "int 3"; break; case 0xD9: - static const char* d9_opcodes[] = {"flds", "unknown-d9", "fsts", "fstps", "fldenv", "fldcw", "fnstenv", "fnstcw"}; - modrm_opcodes = d9_opcodes; - store = true; - has_modrm = true; - reg_is_opcode = true; + if (instr[1] == 0xF8) { + opcode << "fprem"; + instr++; + } else { + static const char* d9_opcodes[] = {"flds", "unknown-d9", "fsts", "fstps", "fldenv", "fldcw", + "fnstenv", "fnstcw"}; + modrm_opcodes = d9_opcodes; + store = true; + has_modrm = true; + reg_is_opcode = true; + } + break; + case 0xDA: + if (instr[1] == 0xE9) { + opcode << "fucompp"; + instr++; + } else { + opcode << StringPrintf("unknown opcode '%02X'", *instr); + } break; case 0xDB: static const char* db_opcodes[] = {"fildl", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db"}; diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index 83a683d185..4939610e60 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -127,7 +127,7 @@ // Ugly compile-time check, but we only have the preprocessor. #if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 8) -#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM64) size not as expected." +#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected." #endif .endm @@ -1007,7 +1007,92 @@ ENTRY art_quick_resolution_trampoline DELIVER_PENDING_EXCEPTION END art_quick_resolution_trampoline -UNIMPLEMENTED art_quick_generic_jni_trampoline + /* + * Called to do a generic JNI down-call + */ +ENTRY art_quick_generic_jni_trampoline + SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME + str r0, [sp, #0] // Store native ArtMethod* to bottom of stack. + + // Save rSELF + mov r11, rSELF + // Save SP , so we can have static CFI info. r10 is saved in ref_and_args. + mov r10, sp + .cfi_def_cfa_register r10 + + sub sp, sp, #5120 + + // prepare for artQuickGenericJniTrampoline call + // (Thread*, SP) + // r0 r1 <= C calling convention + // rSELF r10 <= where they are + + mov r0, rSELF // Thread* + mov r1, r10 + blx artQuickGenericJniTrampoline // (Thread*, sp) + + // The C call will have registered the complete save-frame on success. + // The result of the call is: + // r0: pointer to native code, 0 on error. + // r1: pointer to the bottom of the used area of the alloca, can restore stack till there. + + // Check for error = 0. + cbz r0, .Lentry_error + + // Release part of the alloca. + mov sp, r1 + + // Save the code pointer + mov r12, r0 + + // Load parameters from frame into registers. + pop {r0-r3} + + // Softfloat. + // TODO: Change to hardfloat when supported. + + blx r12 // native call. + + // result sign extension is handled in C code + // prepare for artQuickGenericJniEndTrampoline call + // (Thread*, result, result_f) + // r0 r1,r2 r3,stack <= C calling convention + // r11 r0,r1 r0,r1 <= where they are + sub sp, sp, #12 // Stack alignment. + + push {r1} + mov r3, r0 + mov r2, r1 + mov r1, r0 + mov r0, r11 + + blx artQuickGenericJniEndTrampoline + + // Tear down the alloca. + mov sp, r10 + .cfi_def_cfa_register sp + + // Restore self pointer. + mov r9, r11 + + // Pending exceptions possible. 
+ ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_ + cbnz r2, .Lexception_in_native + + // Tear down the callee-save frame. + RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME + + bx lr // ret + +.Lentry_error: + mov sp, r10 + .cfi_def_cfa_register sp + mov r9, r11 +.Lexception_in_native: + RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME + DELIVER_PENDING_EXCEPTION + +END art_quick_generic_jni_trampoline .extern artQuickToInterpreterBridge ENTRY art_quick_to_interpreter_bridge diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index e088751392..7907b6ee13 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -1485,33 +1485,34 @@ ENTRY art_quick_generic_jni_trampoline mov x1, xFP bl artQuickGenericJniTrampoline // (Thread*, sp) - // Get the updated pointer. This is the bottom of the frame _with_ handle scope. - ldr xFP, [sp] - add x9, sp, #8 + // The C call will have registered the complete save-frame on success. + // The result of the call is: + // x0: pointer to native code, 0 on error. + // x1: pointer to the bottom of the used area of the alloca, can restore stack till there. - cmp x0, #0 - b.mi .Lentry_error // Check for error, negative value. + // Check for error = 0. + cbz x0, .Lentry_error - // release part of the alloca. - add x9, x9, x0 + // Release part of the alloca. + mov sp, x1 - // Get the code pointer - ldr xIP0, [x9, #0] + // Save the code pointer + mov xIP0, x0 // Load parameters from frame into registers. // TODO Check with artQuickGenericJniTrampoline. // Also, check again APPCS64 - the stack arguments are interleaved. - ldp x0, x1, [x9, #8] - ldp x2, x3, [x9, #24] - ldp x4, x5, [x9, #40] - ldp x6, x7, [x9, #56] + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] - ldp d0, d1, [x9, #72] - ldp d2, d3, [x9, #88] - ldp d4, d5, [x9, #104] - ldp d6, d7, [x9, #120] + ldp d0, d1, [sp, #64] + ldp d2, d3, [sp, #80] + ldp d4, d5, [sp, #96] + ldp d6, d7, [sp, #112] - add sp, x9, #136 + add sp, sp, #128 blr xIP0 // native call. @@ -1520,13 +1521,11 @@ ENTRY art_quick_generic_jni_trampoline // result sign extension is handled in C code // prepare for artQuickGenericJniEndTrampoline call - // (Thread*, SP, result, result_f) - // x0 x1 x2 x3 <= C calling convention - mov x5, x0 // Save return value + // (Thread*, result, result_f) + // x0 x1 x2 <= C calling convention + mov x1, x0 // Result (from saved) mov x0, xSELF // Thread register - mov x1, xFP // Stack pointer - mov x2, x5 // Result (from saved) - fmov x3, d0 // d0 will contain floating point result, but needs to go into x3 + fmov x2, d0 // d0 will contain floating point result, but needs to go into x2 bl artQuickGenericJniEndTrampoline diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S index 0326f9edef..24b9e465e8 100644 --- a/runtime/arch/x86/quick_entrypoints_x86.S +++ b/runtime/arch/x86/quick_entrypoints_x86.S @@ -1127,8 +1127,7 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline // This also stores the native ArtMethod reference at the bottom of the stack. movl %esp, %ebp // save SP at callee-save frame - movl %esp, %edi - CFI_DEF_CFA_REGISTER(edi) + CFI_DEF_CFA_REGISTER(ebp) subl LITERAL(5120), %esp // prepare for artQuickGenericJniTrampoline call // (Thread*, SP) @@ -1141,46 +1140,39 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline pushl %fs:THREAD_SELF_OFFSET // Pass Thread::Current(). SETUP_GOT_NOSAVE // Clobbers ebx. 
call PLT_SYMBOL(artQuickGenericJniTrampoline) // (Thread*, sp) - // Drop call stack. - addl LITERAL(16), %esp - // At the bottom of the alloca we now have the name pointer to the method=bottom of callee-save - // get the adjusted frame pointer - popl %ebp + // The C call will have registered the complete save-frame on success. + // The result of the call is: + // eax: pointer to native code, 0 on error. + // edx: pointer to the bottom of the used area of the alloca, can restore stack till there. - // Check for error, negative value. + // Check for error = 0. test %eax, %eax - js .Lentry_error + jz .Lentry_error - // release part of the alloca, get the code pointer - addl %eax, %esp - popl %eax + // Release part of the alloca. + movl %edx, %esp // On x86 there are no registers passed, so nothing to pop here. // Native call. call *%eax - // Pop native stack, but keep the space that was reserved cookie. - movl %ebp, %esp - subl LITERAL(16), %esp // Alignment. - // result sign extension is handled in C code // prepare for artQuickGenericJniEndTrampoline call - // (Thread*, SP, result, result_f) - // (esp) 4(esp) 8(esp) 16(esp) <= C calling convention - // fs:... ebp eax:edx xmm0 <= where they are + // (Thread*, result, result_f) + // (esp) 4(esp) 12(esp) <= C calling convention + // fs:... eax:edx xmm0 <= where they are - subl LITERAL(8), %esp // Pass float result. + subl LITERAL(20), %esp // Padding & pass float result. movsd %xmm0, (%esp) pushl %edx // Pass int result. pushl %eax - pushl %ebp // Pass SP (to ArtMethod). pushl %fs:THREAD_SELF_OFFSET // Pass Thread::Current(). call PLT_SYMBOL(artQuickGenericJniEndTrampoline) // Tear down the alloca. - movl %edi, %esp + movl %ebp, %esp CFI_DEF_CFA_REGISTER(esp) // Pending exceptions possible. @@ -1204,7 +1196,7 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline punpckldq %xmm1, %xmm0 ret .Lentry_error: - movl %edi, %esp + movl %ebp, %esp CFI_DEF_CFA_REGISTER(esp) .Lexception_in_native: RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 668fb882c7..8fa947c9b3 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -1167,11 +1167,9 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq %xmm5, 56(%rsp) movq %xmm6, 64(%rsp) movq %xmm7, 72(%rsp) - // Store native ArtMethod* to bottom of stack. - movq %rdi, 0(%rsp) - movq %rsp, %rbp // save SP at callee-save frame - movq %rsp, %rbx - CFI_DEF_CFA_REGISTER(rbx) + movq %rdi, 0(%rsp) // Store native ArtMethod* to bottom of stack. + movq %rsp, %rbp // save SP at (old) callee-save frame + CFI_DEF_CFA_REGISTER(rbp) // // reserve a lot of space // @@ -1198,17 +1196,17 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq %rbp, %rsi call PLT_SYMBOL(artQuickGenericJniTrampoline) // (Thread*, sp) - // At the bottom of the alloca we now have the name pointer to the method=bottom of callee-save - // get the adjusted frame pointer - popq %rbp + // The C call will have registered the complete save-frame on success. + // The result of the call is: + // %rax: pointer to native code, 0 on error. + // %rdx: pointer to the bottom of the used area of the alloca, can restore stack till there. - // Check for error, negative value. + // Check for error = 0. test %rax, %rax - js .Lentry_error + jz .Lentry_error - // release part of the alloca, get the code pointer - addq %rax, %rsp - popq %rax + // Release part of the alloca. 
+ movq %rdx, %rsp // pop from the register-passing alloca region // what's the right layout? @@ -1228,21 +1226,22 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq 48(%rsp), %xmm6 movq 56(%rsp), %xmm7 addq LITERAL(64), %rsp // floating-point done + // native call - call *%rax // Stack should be aligned 16B without the return addr? + call *%rax + // result sign extension is handled in C code // prepare for artQuickGenericJniEndTrampoline call - // (Thread*, SP, result, result_f) - // rdi rsi rdx rcx <= C calling convention - // gs:... rbp rax xmm0 <= where they are + // (Thread*, result, result_f) + // rdi rsi rdx <= C calling convention + // gs:... rax xmm0 <= where they are movq %gs:THREAD_SELF_OFFSET, %rdi - movq %rbp, %rsi - movq %rax, %rdx - movq %xmm0, %rcx + movq %rax, %rsi + movq %xmm0, %rdx call PLT_SYMBOL(artQuickGenericJniEndTrampoline) // Tear down the alloca. - movq %rbx, %rsp + movq %rbp, %rsp CFI_DEF_CFA_REGISTER(rsp) // Pending exceptions possible. @@ -1280,7 +1279,7 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq %rax, %xmm0 ret .Lentry_error: - movq %rbx, %rsp + movq %rbp, %rsp CFI_DEF_CFA_REGISTER(rsp) .Lexception_in_native: // TODO: the handle scope contains the this pointer which is used by the debugger for exception diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 7a144b6d72..6fb962452e 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -185,8 +185,8 @@ class QuickArgumentVisitor { case 3: return (5 * GetBytesPerGprSpillLocation(kRuntimeISA)); case 4: return (6 * GetBytesPerGprSpillLocation(kRuntimeISA)); default: - LOG(FATAL) << "Unexpected GPR index: " << gpr_index; - return 0; + LOG(FATAL) << "Unexpected GPR index: " << gpr_index; + return 0; } } #else @@ -209,16 +209,15 @@ class QuickArgumentVisitor { return *reinterpret_cast<uintptr_t*>(lr); } - QuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, - const char* shorty, uint32_t shorty_len) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) : - is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len), - gpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset), - fpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset), - stack_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize - + StackArgumentStartFromShorty(is_static, shorty, shorty_len)), - gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid), - is_split_long_or_double_(false) { } + QuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, const char* shorty, + uint32_t shorty_len) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) : + is_static_(is_static), shorty_(shorty), shorty_len_(shorty_len), + gpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset), + fpr_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset), + stack_args_(reinterpret_cast<byte*>(sp) + kQuickCalleeSaveFrame_RefAndArgs_FrameSize + + StackArgumentStartFromShorty(is_static, shorty, shorty_len)), + gpr_index_(0), fpr_index_(0), stack_index_(0), cur_type_(Primitive::kPrimVoid), + is_split_long_or_double_(false) {} virtual ~QuickArgumentVisitor() {} @@ -388,9 +387,12 @@ class QuickArgumentVisitor { } } + protected: const bool is_static_; const char* const shorty_; const uint32_t shorty_len_; + + private: 
byte* const gpr_args_; // Address of GPR arguments in callee save frame. byte* const fpr_args_; // Address of FPR arguments in callee save frame. byte* const stack_args_; // Address of stack arguments in caller's frame. @@ -409,7 +411,7 @@ class BuildQuickShadowFrameVisitor FINAL : public QuickArgumentVisitor { BuildQuickShadowFrameVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, const char* shorty, uint32_t shorty_len, ShadowFrame* sf, size_t first_arg_reg) : - QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {} + QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sf_(sf), cur_reg_(first_arg_reg) {} void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE; @@ -420,7 +422,7 @@ class BuildQuickShadowFrameVisitor FINAL : public QuickArgumentVisitor { DISALLOW_COPY_AND_ASSIGN(BuildQuickShadowFrameVisitor); }; -void BuildQuickShadowFrameVisitor::Visit() { +void BuildQuickShadowFrameVisitor::Visit() { Primitive::Type type = GetParamPrimitiveType(); switch (type) { case Primitive::kPrimLong: // Fall-through. @@ -465,13 +467,14 @@ extern "C" uint64_t artQuickToInterpreterBridge(mirror::ArtMethod* method, Threa return 0; } else { DCHECK(!method->IsNative()) << PrettyMethod(method); - const char* old_cause = self->StartAssertNoThreadSuspension("Building interpreter shadow frame"); + const char* old_cause = self->StartAssertNoThreadSuspension( + "Building interpreter shadow frame"); const DexFile::CodeItem* code_item = method->GetCodeItem(); DCHECK(code_item != nullptr) << PrettyMethod(method); uint16_t num_regs = code_item->registers_size_; void* memory = alloca(ShadowFrame::ComputeSize(num_regs)); - ShadowFrame* shadow_frame(ShadowFrame::Create(num_regs, NULL, // No last shadow coming from quick. - method, 0, memory)); + // No last shadow coming from quick. + ShadowFrame* shadow_frame(ShadowFrame::Create(num_regs, nullptr, method, 0, memory)); size_t first_arg_reg = code_item->registers_size_ - code_item->ins_size_; uint32_t shorty_len = 0; const char* shorty = method->GetShorty(&shorty_len); @@ -512,7 +515,7 @@ class BuildQuickArgumentVisitor FINAL : public QuickArgumentVisitor { BuildQuickArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, const char* shorty, uint32_t shorty_len, ScopedObjectAccessUnchecked* soa, std::vector<jvalue>* args) : - QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa), args_(args) {} + QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa), args_(args) {} void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE; @@ -584,7 +587,8 @@ extern "C" uint64_t artQuickProxyInvokeHandler(mirror::ArtMethod* proxy_method, const char* old_cause = self->StartAssertNoThreadSuspension("Adding to IRT proxy object arguments"); // Register the top of the managed stack, making stack crawlable. - DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) << PrettyMethod(proxy_method); + DCHECK_EQ(sp->AsMirrorPtr(), proxy_method) + << PrettyMethod(proxy_method); self->SetTopOfStack(sp, 0); DCHECK_EQ(proxy_method->GetFrameSizeInBytes(), Runtime::Current()->GetCalleeSaveMethod(Runtime::kRefsAndArgs)->GetFrameSizeInBytes()) @@ -600,7 +604,7 @@ extern "C" uint64_t artQuickProxyInvokeHandler(mirror::ArtMethod* proxy_method, // Placing arguments into args vector and remove the receiver. 
mirror::ArtMethod* non_proxy_method = proxy_method->GetInterfaceMethodIfProxy(); CHECK(!non_proxy_method->IsStatic()) << PrettyMethod(proxy_method) << " " - << PrettyMethod(non_proxy_method); + << PrettyMethod(non_proxy_method); std::vector<jvalue> args; uint32_t shorty_len = 0; const char* shorty = proxy_method->GetShorty(&shorty_len); @@ -632,7 +636,7 @@ class RememberForGcArgumentVisitor FINAL : public QuickArgumentVisitor { RememberForGcArgumentVisitor(StackReference<mirror::ArtMethod>* sp, bool is_static, const char* shorty, uint32_t shorty_len, ScopedObjectAccessUnchecked* soa) : - QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa) {} + QuickArgumentVisitor(sp, is_static, shorty, shorty_len), soa_(soa) {} void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE; @@ -641,7 +645,8 @@ class RememberForGcArgumentVisitor FINAL : public QuickArgumentVisitor { private: ScopedObjectAccessUnchecked* const soa_; // References which we must update when exiting in case the GC moved the objects. - std::vector<std::pair<jobject, StackReference<mirror::Object>*>> references_; + std::vector<std::pair<jobject, StackReference<mirror::Object>*> > references_; + DISALLOW_COPY_AND_ASSIGN(RememberForGcArgumentVisitor); }; @@ -663,7 +668,6 @@ void RememberForGcArgumentVisitor::FixupReferences() { } } - // Lazily resolve a method for quick. Called by stub code. extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called, mirror::Object* receiver, @@ -740,7 +744,6 @@ extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called, is_range = false; } dex_method_idx = (is_range) ? instr->VRegB_3rc() : instr->VRegB_35c(); - } else { invoke_type = kStatic; dex_file = called->GetDexFile(); @@ -825,8 +828,6 @@ extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called, return code; } - - /* * This class uses a couple of observations to unite the different calling conventions through * a few constants. @@ -867,7 +868,7 @@ extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called, * entry in the HandleScope (nullptr if necessary). * */ -template <class T> class BuildGenericJniFrameStateMachine { +template<class T> class BuildNativeCallFrameStateMachine { public: #if defined(__arm__) // TODO: These are all dummy values! @@ -912,7 +913,7 @@ template <class T> class BuildGenericJniFrameStateMachine { static constexpr size_t kRegistersNeededForLong = 2; static constexpr size_t kRegistersNeededForDouble = 2; - static constexpr bool kMultiRegistersAligned = false; // x86 not using regs, anyways + static constexpr bool kMultiRegistersAligned = false; // x86 not using regs, anyways static constexpr bool kMultiRegistersWidened = false; static constexpr bool kAlignLongOnStack = false; static constexpr bool kAlignDoubleOnStack = false; @@ -932,34 +933,34 @@ template <class T> class BuildGenericJniFrameStateMachine { #endif public: - explicit BuildGenericJniFrameStateMachine(T* delegate) : gpr_index_(kNumNativeGprArgs), - fpr_index_(kNumNativeFprArgs), - stack_entries_(0), - delegate_(delegate) { + explicit BuildNativeCallFrameStateMachine(T* delegate) + : gpr_index_(kNumNativeGprArgs), + fpr_index_(kNumNativeFprArgs), + stack_entries_(0), + delegate_(delegate) { // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff // the next register is even; counting down is just to make the compiler happy... 
CHECK_EQ(kNumNativeGprArgs % 2, 0U); CHECK_EQ(kNumNativeFprArgs % 2, 0U); } - virtual ~BuildGenericJniFrameStateMachine() {} + virtual ~BuildNativeCallFrameStateMachine() {} bool HavePointerGpr() { return gpr_index_ > 0; } - void AdvancePointer(void* val) { + void AdvancePointer(const void* val) { if (HavePointerGpr()) { gpr_index_--; PushGpr(reinterpret_cast<uintptr_t>(val)); } else { - stack_entries_++; // TODO: have a field for pointer length as multiple of 32b + stack_entries_++; // TODO: have a field for pointer length as multiple of 32b PushStack(reinterpret_cast<uintptr_t>(val)); gpr_index_ = 0; } } - bool HaveHandleScopeGpr() { return gpr_index_ > 0; } @@ -976,7 +977,6 @@ template <class T> class BuildGenericJniFrameStateMachine { } } - bool HaveIntGpr() { return gpr_index_ > 0; } @@ -992,7 +992,6 @@ template <class T> class BuildGenericJniFrameStateMachine { } } - bool HaveLongGpr() { return gpr_index_ >= kRegistersNeededForLong + (LongGprNeedsPadding() ? 1 : 0); } @@ -1039,30 +1038,22 @@ template <class T> class BuildGenericJniFrameStateMachine { } } - bool HaveFloatFpr() { return fpr_index_ > 0; } - template <typename U, typename V> V convert(U in) { - CHECK_LE(sizeof(U), sizeof(V)); - union { U u; V v; } tmp; - tmp.u = in; - return tmp.v; - } - void AdvanceFloat(float val) { if (kNativeSoftFloatAbi) { - AdvanceInt(convert<float, uint32_t>(val)); + AdvanceInt(bit_cast<float, uint32_t>(val)); } else { if (HaveFloatFpr()) { fpr_index_--; if (kRegistersNeededForDouble == 1) { if (kMultiRegistersWidened) { - PushFpr8(convert<double, uint64_t>(val)); + PushFpr8(bit_cast<double, uint64_t>(val)); } else { // No widening, just use the bits. - PushFpr8(convert<float, uint64_t>(val)); + PushFpr8(bit_cast<float, uint64_t>(val)); } } else { PushFpr4(val); @@ -1071,16 +1062,17 @@ template <class T> class BuildGenericJniFrameStateMachine { stack_entries_++; if (kRegistersNeededForDouble == 1 && kMultiRegistersWidened) { // Need to widen before storing: Note the "double" in the template instantiation. - PushStack(convert<double, uintptr_t>(val)); + // Note: We need to jump through those hoops to make the compiler happy. + DCHECK_EQ(sizeof(uintptr_t), sizeof(uint64_t)); + PushStack(static_cast<uintptr_t>(bit_cast<double, uint64_t>(val))); } else { - PushStack(convert<float, uintptr_t>(val)); + PushStack(bit_cast<float, uintptr_t>(val)); } fpr_index_ = 0; } } } - bool HaveDoubleFpr() { return fpr_index_ >= kRegistersNeededForDouble + (DoubleFprNeedsPadding() ? 1 : 0); } @@ -1162,101 +1154,66 @@ template <class T> class BuildGenericJniFrameStateMachine { T* delegate_; // What Push implementation gets called }; -class ComputeGenericJniFrameSize FINAL { +// Computes the sizes of register stacks and call stack area. Handling of references can be extended +// in subclasses. +// +// To handle native pointers, use "L" in the shorty for an object reference, which simulates +// them with handles. +class ComputeNativeCallFrameSize { public: - ComputeGenericJniFrameSize() : num_handle_scope_references_(0), num_stack_entries_(0) {} + ComputeNativeCallFrameSize() : num_stack_entries_(0) {} + + virtual ~ComputeNativeCallFrameSize() {} uint32_t GetStackSize() { return num_stack_entries_ * sizeof(uintptr_t); } - // WARNING: After this, *sp won't be pointing to the method anymore! 
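On a soft-float ABI, AdvanceFloat above forwards the float's raw bit pattern through the integer path; the hunk switches from the local convert<> helper to the shared bit_cast added to runtime/utils.h further down in this change. A self-contained sketch of that forwarding (SoftFloatPlacer and FloatBits are illustrative names):

    #include <cstdint>
    #include <vector>

    // Union-based type pun, equivalent in spirit to the bit_cast added in utils.h.
    inline uint32_t FloatBits(float in) {
      union { float f; uint32_t u; } tmp;
      tmp.f = in;
      return tmp.u;
    }

    struct SoftFloatPlacer {
      std::vector<uintptr_t> gprs;  // stand-in for the integer-register area

      void AdvanceFloat(float val) {
        // Soft-float ABI: the float is passed as its raw 32-bit pattern in an
        // integer register, which is what bit_cast<float, uint32_t>(val) does above.
        gprs.push_back(FloatBits(val));
      }
    };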
- void ComputeLayout(StackReference<mirror::ArtMethod>** m, bool is_static, const char* shorty, - uint32_t shorty_len, void* sp, HandleScope** table, - uint32_t* handle_scope_entries, uintptr_t** start_stack, uintptr_t** start_gpr, - uint32_t** start_fpr, void** code_return, size_t* overall_size) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - ComputeAll(is_static, shorty, shorty_len); - - mirror::ArtMethod* method = (*m)->AsMirrorPtr(); - - uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp); - - // First, fix up the layout of the callee-save frame. - // We have to squeeze in the HandleScope, and relocate the method pointer. - - // "Free" the slot for the method. - sp8 += kPointerSize; // In the callee-save frame we use a full pointer. - - // Under the callee saves put handle scope and new method stack reference. - *handle_scope_entries = num_handle_scope_references_; - - size_t handle_scope_size = HandleScope::SizeOf(num_handle_scope_references_); - size_t scope_and_method = handle_scope_size + sizeof(StackReference<mirror::ArtMethod>); - - sp8 -= scope_and_method; - // Align by kStackAlignment. - sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment)); - - uint8_t* sp8_table = sp8 + sizeof(StackReference<mirror::ArtMethod>); - *table = reinterpret_cast<HandleScope*>(sp8_table); - (*table)->SetNumberOfReferences(num_handle_scope_references_); - - // Add a slot for the method pointer, and fill it. Fix the pointer-pointer given to us. - uint8_t* method_pointer = sp8; - StackReference<mirror::ArtMethod>* new_method_ref = - reinterpret_cast<StackReference<mirror::ArtMethod>*>(method_pointer); - new_method_ref->Assign(method); - *m = new_method_ref; - - // Reference cookie and padding - sp8 -= 8; - // Store HandleScope size - *reinterpret_cast<uint32_t*>(sp8) = static_cast<uint32_t>(handle_scope_size & 0xFFFFFFFF); - - // Next comes the native call stack. + uint8_t* LayoutCallStack(uint8_t* sp8) { sp8 -= GetStackSize(); // Align by kStackAlignment. sp8 = reinterpret_cast<uint8_t*>(RoundDown(reinterpret_cast<uintptr_t>(sp8), kStackAlignment)); - *start_stack = reinterpret_cast<uintptr_t*>(sp8); + return sp8; + } - // put fprs and gprs below + uint8_t* LayoutCallRegisterStacks(uint8_t* sp8, uintptr_t** start_gpr, uint32_t** start_fpr) { // Assumption is OK right now, as we have soft-float arm - size_t fregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeFprArgs; + size_t fregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeFprArgs; sp8 -= fregs * sizeof(uintptr_t); *start_fpr = reinterpret_cast<uint32_t*>(sp8); - size_t iregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeGprArgs; + size_t iregs = BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>::kNumNativeGprArgs; sp8 -= iregs * sizeof(uintptr_t); *start_gpr = reinterpret_cast<uintptr_t*>(sp8); + return sp8; + } - // reserve space for the code pointer - sp8 -= kPointerSize; - *code_return = reinterpret_cast<void*>(sp8); + uint8_t* LayoutNativeCall(uint8_t* sp8, uintptr_t** start_stack, uintptr_t** start_gpr, + uint32_t** start_fpr) { + // Native call stack. + sp8 = LayoutCallStack(sp8); + *start_stack = reinterpret_cast<uintptr_t*>(sp8); - *overall_size = reinterpret_cast<uint8_t*>(sp) - sp8; + // Put fprs and gprs below. 
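LayoutCallStack and LayoutCallRegisterStacks above carve the native call area downward from the current stack pointer and re-align after each reservation. A minimal sketch of the round-down step, assuming a 16-byte stack alignment (the actual kStackAlignment is architecture-dependent):

    #include <cstddef>
    #include <cstdint>

    constexpr uintptr_t kAssumedStackAlignment = 16;  // assumption; arch-dependent in ART

    // Round a pointer down to a power-of-two alignment by clearing the low bits.
    inline uint8_t* AlignDown(uint8_t* sp, uintptr_t alignment) {
      return reinterpret_cast<uint8_t*>(
          reinterpret_cast<uintptr_t>(sp) & ~(alignment - 1));
    }

    inline uint8_t* ReserveAligned(uint8_t* sp, size_t bytes) {
      sp -= bytes;                                   // reserve the region below sp
      return AlignDown(sp, kAssumedStackAlignment);  // then align, as LayoutCallStack does
    }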
+ sp8 = LayoutCallRegisterStacks(sp8, start_gpr, start_fpr); - // The new SP is stored at the end of the alloca, so it can be immediately popped - sp8 = reinterpret_cast<uint8_t*>(sp) - 5 * KB; - *(reinterpret_cast<uint8_t**>(sp8)) = method_pointer; + // Return the new bottom. + return sp8; } - void ComputeHandleScopeOffset() { } // nothing to do, static right now + virtual void WalkHeader(BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {} - void ComputeAll(bool is_static, const char* shorty, uint32_t shorty_len) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize> sm(this); - - // JNIEnv - sm.AdvancePointer(nullptr); + void Walk(const char* shorty, uint32_t shorty_len) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize> sm(this); - // Class object or this as first argument - sm.AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678)); + WalkHeader(&sm); for (uint32_t i = 1; i < shorty_len; ++i) { Primitive::Type cur_type_ = Primitive::GetType(shorty[i]); switch (cur_type_) { case Primitive::kPrimNot: - sm.AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678)); + sm.AdvanceHandleScope( + reinterpret_cast<mirror::Object*>(0x12345678)); break; case Primitive::kPrimBoolean: @@ -1299,50 +1256,135 @@ class ComputeGenericJniFrameSize FINAL { // counting is already done in the superclass } - uintptr_t PushHandle(mirror::Object* /* ptr */) { - num_handle_scope_references_++; + virtual uintptr_t PushHandle(mirror::Object* /* ptr */) { return reinterpret_cast<uintptr_t>(nullptr); } - private: - uint32_t num_handle_scope_references_; + protected: uint32_t num_stack_entries_; }; -// Visits arguments on the stack placing them into a region lower down the stack for the benefit -// of transitioning into native code. -class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { +class ComputeGenericJniFrameSize FINAL : public ComputeNativeCallFrameSize { public: - BuildGenericJniFrameVisitor(StackReference<mirror::ArtMethod>** sp, bool is_static, - const char* shorty, uint32_t shorty_len, Thread* self) : - QuickArgumentVisitor(*sp, is_static, shorty, shorty_len), sm_(this) { - ComputeGenericJniFrameSize fsc; - fsc.ComputeLayout(sp, is_static, shorty, shorty_len, *sp, &handle_scope_, &handle_scope_expected_refs_, - &cur_stack_arg_, &cur_gpr_reg_, &cur_fpr_reg_, &code_return_, - &alloca_used_size_); - handle_scope_number_of_references_ = 0; - cur_hs_entry_ = GetFirstHandleScopeEntry(); + ComputeGenericJniFrameSize() : num_handle_scope_references_(0) {} + + // Lays out the callee-save frame. Assumes that the incorrect frame corresponding to RefsAndArgs + // is at *m = sp. Will update to point to the bottom of the save frame. + // + // Note: assumes ComputeAll() has been run before. + void LayoutCalleeSaveFrame(StackReference<mirror::ArtMethod>** m, void* sp, HandleScope** table, + uint32_t* handle_scope_entries) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + mirror::ArtMethod* method = (*m)->AsMirrorPtr(); - // jni environment is always first argument - sm_.AdvancePointer(self->GetJniEnv()); + uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp); - if (is_static) { - sm_.AdvanceHandleScope((*sp)->AsMirrorPtr()->GetDeclaringClass()); - } - } + // First, fix up the layout of the callee-save frame. + // We have to squeeze in the HandleScope, and relocate the method pointer. 
- void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE; + // "Free" the slot for the method. + sp8 += kPointerSize; // In the callee-save frame we use a full pointer. - void FinalizeHandleScope(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + // Under the callee saves put handle scope and new method stack reference. + *handle_scope_entries = num_handle_scope_references_; - StackReference<mirror::Object>* GetFirstHandleScopeEntry() + size_t handle_scope_size = HandleScope::SizeOf(num_handle_scope_references_); + size_t scope_and_method = handle_scope_size + sizeof(StackReference<mirror::ArtMethod>); + + sp8 -= scope_and_method; + // Align by kStackAlignment. + sp8 = reinterpret_cast<uint8_t*>(RoundDown( + reinterpret_cast<uintptr_t>(sp8), kStackAlignment)); + + uint8_t* sp8_table = sp8 + sizeof(StackReference<mirror::ArtMethod>); + *table = reinterpret_cast<HandleScope*>(sp8_table); + (*table)->SetNumberOfReferences(num_handle_scope_references_); + + // Add a slot for the method pointer, and fill it. Fix the pointer-pointer given to us. + uint8_t* method_pointer = sp8; + StackReference<mirror::ArtMethod>* new_method_ref = + reinterpret_cast<StackReference<mirror::ArtMethod>*>(method_pointer); + new_method_ref->Assign(method); + *m = new_method_ref; + } + + // Adds space for the cookie. Note: may leave stack unaligned. + void LayoutCookie(uint8_t** sp) { + // Reference cookie and padding + *sp -= 8; + } + + // Re-layout the callee-save frame (insert a handle-scope). Then add space for the cookie. + // Returns the new bottom. Note: this may be unaligned. + uint8_t* LayoutJNISaveFrame(StackReference<mirror::ArtMethod>** m, void* sp, HandleScope** table, + uint32_t* handle_scope_entries) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return handle_scope_->GetHandle(0).GetReference(); + // First, fix up the layout of the callee-save frame. + // We have to squeeze in the HandleScope, and relocate the method pointer. + LayoutCalleeSaveFrame(m, sp, table, handle_scope_entries); + + // The bottom of the callee-save frame is now where the method is, *m. + uint8_t* sp8 = reinterpret_cast<uint8_t*>(*m); + + // Add space for cookie. + LayoutCookie(&sp8); + + return sp8; } - jobject GetFirstHandleScopeJObject() + // WARNING: After this, *sp won't be pointing to the method anymore! + uint8_t* ComputeLayout(StackReference<mirror::ArtMethod>** m, bool is_static, const char* shorty, + uint32_t shorty_len, HandleScope** table, uint32_t* handle_scope_entries, + uintptr_t** start_stack, uintptr_t** start_gpr, uint32_t** start_fpr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return handle_scope_->GetHandle(0).ToJObject(); + Walk(shorty, shorty_len); + + // JNI part. + uint8_t* sp8 = LayoutJNISaveFrame(m, reinterpret_cast<void*>(*m), table, handle_scope_entries); + + sp8 = LayoutNativeCall(sp8, start_stack, start_gpr, start_fpr); + + // Return the new bottom. + return sp8; + } + + uintptr_t PushHandle(mirror::Object* /* ptr */) OVERRIDE; + + // Add JNIEnv* and jobj/jclass before the shorty-derived elements. 
+ void WalkHeader(BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm) OVERRIDE + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + + private: + uint32_t num_handle_scope_references_; +}; + +uintptr_t ComputeGenericJniFrameSize::PushHandle(mirror::Object* /* ptr */) { + num_handle_scope_references_++; + return reinterpret_cast<uintptr_t>(nullptr); +} + +void ComputeGenericJniFrameSize::WalkHeader( + BuildNativeCallFrameStateMachine<ComputeNativeCallFrameSize>* sm) { + // JNIEnv + sm->AdvancePointer(nullptr); + + // Class object or this as first argument + sm->AdvanceHandleScope(reinterpret_cast<mirror::Object*>(0x12345678)); +} + +// Class to push values to three separate regions. Used to fill the native call part. Adheres to +// the template requirements of BuildGenericJniFrameStateMachine. +class FillNativeCall { + public: + FillNativeCall(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args) : + cur_gpr_reg_(gpr_regs), cur_fpr_reg_(fpr_regs), cur_stack_arg_(stack_args) {} + + virtual ~FillNativeCall() {} + + void Reset(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args) { + cur_gpr_reg_ = gpr_regs; + cur_fpr_reg_ = fpr_regs; + cur_stack_arg_ = stack_args; } void PushGpr(uintptr_t val) { @@ -1366,46 +1408,110 @@ class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { cur_stack_arg_++; } - uintptr_t PushHandle(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - uintptr_t tmp; - if (ref == nullptr) { - *cur_hs_entry_ = StackReference<mirror::Object>(); - tmp = reinterpret_cast<uintptr_t>(nullptr); - } else { - *cur_hs_entry_ = StackReference<mirror::Object>::FromMirrorPtr(ref); - tmp = reinterpret_cast<uintptr_t>(cur_hs_entry_); + virtual uintptr_t PushHandle(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + LOG(FATAL) << "(Non-JNI) Native call does not use handles."; + return 0U; + } + + private: + uintptr_t* cur_gpr_reg_; + uint32_t* cur_fpr_reg_; + uintptr_t* cur_stack_arg_; +}; + +// Visits arguments on the stack placing them into a region lower down the stack for the benefit +// of transitioning into native code. +class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { + public: + BuildGenericJniFrameVisitor(StackReference<mirror::ArtMethod>** sp, bool is_static, + const char* shorty, uint32_t shorty_len, Thread* self) + : QuickArgumentVisitor(*sp, is_static, shorty, shorty_len), + jni_call_(nullptr, nullptr, nullptr, nullptr), sm_(&jni_call_) { + ComputeGenericJniFrameSize fsc; + uintptr_t* start_gpr_reg; + uint32_t* start_fpr_reg; + uintptr_t* start_stack_arg; + uint32_t handle_scope_entries; + bottom_of_used_area_ = fsc.ComputeLayout(sp, is_static, shorty, shorty_len, &handle_scope_, + &handle_scope_entries, &start_stack_arg, + &start_gpr_reg, &start_fpr_reg); + + handle_scope_->SetNumberOfReferences(handle_scope_entries); + jni_call_.Reset(start_gpr_reg, start_fpr_reg, start_stack_arg, handle_scope_); + + // jni environment is always first argument + sm_.AdvancePointer(self->GetJniEnv()); + + if (is_static) { + sm_.AdvanceHandleScope((*sp)->AsMirrorPtr()->GetDeclaringClass()); } - cur_hs_entry_++; - handle_scope_number_of_references_++; - return tmp; } - // Size of the part of the alloca that we actually need. 
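Walk() above iterates the method shorty once to classify each argument, and WalkHeader() prepends the two implicit JNI arguments (JNIEnv* and the jclass or receiver). A simplified walk using the standard shorty encoding, where index 0 is the return type and 'L', 'J', 'F', 'D' mark reference, long, float and double parameters; the handler interface is an illustration, not the real state machine.

    #include <cstdint>
    #include <cstring>

    struct ArgHandler {               // illustrative delegate, not BuildNativeCallFrameStateMachine
      void Pointer(const void*) {}    // e.g. JNIEnv*
      void Reference() {}             // object reference -> handle-scope slot
      void Int(uint32_t) {}
      void Long(uint64_t) {}
      void Float(float) {}
      void Double(double) {}
    };

    inline void WalkShorty(const char* shorty, ArgHandler* h) {
      h->Pointer(nullptr);            // JNIEnv* is always the first native argument
      h->Reference();                 // then jclass (static) or the receiver
      size_t len = strlen(shorty);
      for (size_t i = 1; i < len; ++i) {  // shorty[0] is the return type
        switch (shorty[i]) {
          case 'L': h->Reference();  break;
          case 'J': h->Long(0);      break;
          case 'F': h->Float(0.0f);  break;
          case 'D': h->Double(0.0);  break;
          default:  h->Int(0);       break;  // Z, B, C, S, I all widen to int
        }
      }
    }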
- size_t GetAllocaUsedSize() { - return alloca_used_size_; + void Visit() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) OVERRIDE; + + void FinalizeHandleScope(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + + StackReference<mirror::Object>* GetFirstHandleScopeEntry() + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return handle_scope_->GetHandle(0).GetReference(); + } + + jobject GetFirstHandleScopeJObject() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return handle_scope_->GetHandle(0).ToJObject(); } - void* GetCodeReturn() { - return code_return_; + void* GetBottomOfUsedArea() { + return bottom_of_used_area_; } private: - uint32_t handle_scope_number_of_references_; - StackReference<mirror::Object>* cur_hs_entry_; + // A class to fill a JNI call. Adds reference/handle-scope management to FillNativeCall. + class FillJniCall FINAL : public FillNativeCall { + public: + FillJniCall(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args, + HandleScope* handle_scope) : FillNativeCall(gpr_regs, fpr_regs, stack_args), + handle_scope_(handle_scope), cur_entry_(0) {} + + uintptr_t PushHandle(mirror::Object* ref) OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + + void Reset(uintptr_t* gpr_regs, uint32_t* fpr_regs, uintptr_t* stack_args, HandleScope* scope) { + FillNativeCall::Reset(gpr_regs, fpr_regs, stack_args); + handle_scope_ = scope; + cur_entry_ = 0U; + } + + void ResetRemainingScopeSlots() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + // Initialize padding entries. + size_t expected_slots = handle_scope_->NumberOfReferences(); + while (cur_entry_ < expected_slots) { + handle_scope_->GetHandle(cur_entry_++).Assign(nullptr); + } + DCHECK_NE(cur_entry_, 0U); + } + + private: + HandleScope* handle_scope_; + size_t cur_entry_; + }; + HandleScope* handle_scope_; - uint32_t handle_scope_expected_refs_; - uintptr_t* cur_gpr_reg_; - uint32_t* cur_fpr_reg_; - uintptr_t* cur_stack_arg_; - // StackReference<mirror::Object>* top_of_handle_scope_; - void* code_return_; - size_t alloca_used_size_; + FillJniCall jni_call_; + void* bottom_of_used_area_; - BuildGenericJniFrameStateMachine<BuildGenericJniFrameVisitor> sm_; + BuildNativeCallFrameStateMachine<FillJniCall> sm_; DISALLOW_COPY_AND_ASSIGN(BuildGenericJniFrameVisitor); }; +uintptr_t BuildGenericJniFrameVisitor::FillJniCall::PushHandle(mirror::Object* ref) { + uintptr_t tmp; + Handle<mirror::Object> h = handle_scope_->GetHandle(cur_entry_); + h.Assign(ref); + tmp = reinterpret_cast<uintptr_t>(h.ToJObject()); + cur_entry_++; + return tmp; +} + void BuildGenericJniFrameVisitor::Visit() { Primitive::Type type = GetParamPrimitiveType(); switch (type) { @@ -1453,14 +1559,8 @@ void BuildGenericJniFrameVisitor::Visit() { } void BuildGenericJniFrameVisitor::FinalizeHandleScope(Thread* self) { - // Initialize padding entries. - while (handle_scope_number_of_references_ < handle_scope_expected_refs_) { - *cur_hs_entry_ = StackReference<mirror::Object>(); - cur_hs_entry_++; - handle_scope_number_of_references_++; - } - handle_scope_->SetNumberOfReferences(handle_scope_expected_refs_); - DCHECK_NE(handle_scope_expected_refs_, 0U); + // Clear out rest of the scope. + jni_call_.ResetRemainingScopeSlots(); // Install HandleScope. self->PushHandleScope(handle_scope_); } @@ -1495,19 +1595,20 @@ void artQuickGenericJniEndJNINonRef(Thread* self, uint32_t cookie, jobject lock) * 1) How many bytes of the alloca can be released, if the value is non-negative. * 2) An error, if the value is negative. 
*/ -extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, StackReference<mirror::ArtMethod>* sp) +extern "C" TwoWordReturn artQuickGenericJniTrampoline(Thread* self, + StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { mirror::ArtMethod* called = sp->AsMirrorPtr(); DCHECK(called->IsNative()) << PrettyMethod(called, true); - - // run the visitor uint32_t shorty_len = 0; const char* shorty = called->GetShorty(&shorty_len); + + // Run the visitor. BuildGenericJniFrameVisitor visitor(&sp, called->IsStatic(), shorty, shorty_len, self); visitor.VisitArguments(); visitor.FinalizeHandleScope(self); - // fix up managed-stack things in Thread + // Fix up managed-stack things in Thread. self->SetTopOfStack(sp, 0); self->VerifyStack(); @@ -1519,7 +1620,7 @@ extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, StackReference<mir if (self->IsExceptionPending()) { self->PopHandleScope(); // A negative value denotes an error. - return -1; + return GetTwoWordFailureValue(); } } else { cookie = JniMethodStart(self); @@ -1550,36 +1651,31 @@ extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, StackReference<mir artQuickGenericJniEndJNINonRef(self, cookie, lock); } - return -1; + return GetTwoWordFailureValue(); } // Note that the native code pointer will be automatically set by artFindNativeMethod(). } - // Store the native code pointer in the stack at the right location. - uintptr_t* code_pointer = reinterpret_cast<uintptr_t*>(visitor.GetCodeReturn()); - *code_pointer = reinterpret_cast<uintptr_t>(nativeCode); - - // 5K reserved, window_size + frame pointer used. - size_t window_size = visitor.GetAllocaUsedSize(); - return (5 * KB) - window_size - kPointerSize; + // Return native code addr(lo) and bottom of alloca address(hi). + return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()), + reinterpret_cast<uintptr_t>(nativeCode)); } /* * Is called after the native JNI code. Responsible for cleanup (handle scope, saved state) and * unlocking. */ -extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, - StackReference<mirror::ArtMethod>* sp, - jvalue result, uint64_t result_f) +extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, jvalue result, uint64_t result_f) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + StackReference<mirror::ArtMethod>* sp = self->GetManagedStack()->GetTopQuickFrame(); uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp); mirror::ArtMethod* called = sp->AsMirrorPtr(); uint32_t cookie = *(sp32 - 1); jobject lock = nullptr; if (called->IsSynchronized()) { - HandleScope* table = reinterpret_cast<HandleScope*>( - reinterpret_cast<uint8_t*>(sp) + sizeof(StackReference<mirror::ArtMethod>)); + HandleScope* table = reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(sp) + + sizeof(StackReference<mirror::ArtMethod>)); lock = table->GetHandle(0).ToJObject(); } @@ -1636,8 +1732,7 @@ static TwoWordReturn artInvokeCommon(uint32_t method_idx, mirror::Object* this_o FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs); const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()->GetDexFile(); uint32_t shorty_len; - const char* shorty = - dex_file->GetMethodShorty(dex_file->GetMethodId(method_idx), &shorty_len); + const char* shorty = dex_file->GetMethodShorty(dex_file->GetMethodId(method_idx), &shorty_len); { // Remember the args in case a GC happens in FindMethodFromCode. 
ScopedObjectAccessUnchecked soa(self->GetJniEnv()); @@ -1657,8 +1752,9 @@ static TwoWordReturn artInvokeCommon(uint32_t method_idx, mirror::Object* this_o const void* code = method->GetEntryPointFromQuickCompiledCode(); // When we return, the caller will branch to this address, so it had better not be 0! - DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: " - << method->GetDexFile()->GetLocation(); + DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) + << " location: " + << method->GetDexFile()->GetLocation(); return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(code), reinterpret_cast<uintptr_t>(method)); @@ -1685,47 +1781,50 @@ EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, false); EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, true); #undef EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL - // See comments in runtime_support_asm.S -extern "C" TwoWordReturn artInvokeInterfaceTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return artInvokeCommon<kInterface, true>(method_idx, this_object, caller_method, self, sp); +extern "C" TwoWordReturn artInvokeInterfaceTrampolineWithAccessCheck( + uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, Thread* self, + StackReference<mirror::ArtMethod>* sp) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return artInvokeCommon<kInterface, true>(method_idx, this_object, + caller_method, self, sp); } - -extern "C" TwoWordReturn artInvokeDirectTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return artInvokeCommon<kDirect, true>(method_idx, this_object, caller_method, self, sp); +extern "C" TwoWordReturn artInvokeDirectTrampolineWithAccessCheck( + uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, Thread* self, + StackReference<mirror::ArtMethod>* sp) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return artInvokeCommon<kDirect, true>(method_idx, this_object, caller_method, + self, sp); } -extern "C" TwoWordReturn artInvokeStaticTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return artInvokeCommon<kStatic, true>(method_idx, this_object, caller_method, self, sp); +extern "C" TwoWordReturn artInvokeStaticTrampolineWithAccessCheck( + uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, Thread* self, + StackReference<mirror::ArtMethod>* sp) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return artInvokeCommon<kStatic, true>(method_idx, this_object, caller_method, + self, sp); } -extern "C" TwoWordReturn artInvokeSuperTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return artInvokeCommon<kSuper, true>(method_idx, this_object, caller_method, self, sp); +extern "C" TwoWordReturn artInvokeSuperTrampolineWithAccessCheck( + uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, Thread* self, + 
StackReference<mirror::ArtMethod>* sp) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return artInvokeCommon<kSuper, true>(method_idx, this_object, caller_method, + self, sp); } -extern "C" TwoWordReturn artInvokeVirtualTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - StackReference<mirror::ArtMethod>* sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - return artInvokeCommon<kVirtual, true>(method_idx, this_object, caller_method, self, sp); +extern "C" TwoWordReturn artInvokeVirtualTrampolineWithAccessCheck( + uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, Thread* self, + StackReference<mirror::ArtMethod>* sp) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return artInvokeCommon<kVirtual, true>(method_idx, this_object, caller_method, + self, sp); } // Determine target of interface dispatch. This object is known non-null. @@ -1769,10 +1868,11 @@ extern "C" TwoWordReturn artInvokeInterfaceTrampoline(mirror::ArtMethod* interfa dex_method_idx = instr->VRegB_3rc(); } - const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache()->GetDexFile(); + const DexFile* dex_file = caller_method->GetDeclaringClass()->GetDexCache() + ->GetDexFile(); uint32_t shorty_len; - const char* shorty = - dex_file->GetMethodShorty(dex_file->GetMethodId(dex_method_idx), &shorty_len); + const char* shorty = dex_file->GetMethodShorty(dex_file->GetMethodId(dex_method_idx), + &shorty_len); { // Remember the args in case a GC happens in FindMethodFromCode. ScopedObjectAccessUnchecked soa(self->GetJniEnv()); @@ -1791,8 +1891,8 @@ extern "C" TwoWordReturn artInvokeInterfaceTrampoline(mirror::ArtMethod* interfa const void* code = method->GetEntryPointFromQuickCompiledCode(); // When we return, the caller will branch to this address, so it had better not be 0! - DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: " - << method->GetDexFile()->GetLocation(); + DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) + << " location: " << method->GetDexFile()->GetLocation(); return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(code), reinterpret_cast<uintptr_t>(method)); diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc index 083f179f38..0624281c20 100644 --- a/runtime/jni_internal.cc +++ b/runtime/jni_internal.cc @@ -684,6 +684,11 @@ class JNI { static void ExceptionDescribe(JNIEnv* env) { ScopedObjectAccess soa(env); + // If we have no exception to describe, pass through. + if (!soa.Self()->GetException(nullptr)) { + return; + } + StackHandleScope<3> hs(soa.Self()); // TODO: Use nullptr instead of null handles? auto old_throw_this_object(hs.NewHandle<mirror::Object>(nullptr)); diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc index d255ec8dff..d50e094627 100644 --- a/runtime/jni_internal_test.cc +++ b/runtime/jni_internal_test.cc @@ -1472,6 +1472,12 @@ TEST_F(JniInternalTest, DeleteWeakGlobalRef) { env_->DeleteWeakGlobalRef(o2); } +TEST_F(JniInternalTest, ExceptionDescribe) { + // This checks how ExceptionDescribe handles call without exception. 
+ env_->ExceptionClear(); + env_->ExceptionDescribe(); +} + TEST_F(JniInternalTest, Throw) { EXPECT_EQ(JNI_ERR, env_->Throw(nullptr)); diff --git a/runtime/utils.h b/runtime/utils.h index 68ea47541b..eb79968e21 100644 --- a/runtime/utils.h +++ b/runtime/utils.h @@ -203,6 +203,19 @@ static inline bool NeedsEscaping(uint16_t ch) { return (ch < ' ' || ch > '~'); } +// Interpret the bit pattern of input (type U) as type V. Requires the size +// of V >= size of U (compile-time checked). +template<typename U, typename V> +static inline V bit_cast(U in) { + COMPILE_ASSERT(sizeof(U) <= sizeof(V), size_of_u_not_le_size_of_v); + union { + U u; + V v; + } tmp; + tmp.u = in; + return tmp.v; +} + std::string PrintableChar(uint16_t ch); // Returns an ASCII string corresponding to the given UTF-8 string. diff --git a/test/013-math2/expected.txt b/test/013-math2/expected.txt index d36c468e84..84fb9e2750 100644 --- a/test/013-math2/expected.txt +++ b/test/013-math2/expected.txt @@ -1 +1,2 @@ a:32003 +b:-31993 diff --git a/test/013-math2/src/Main.java b/test/013-math2/src/Main.java index 2c80c31ad9..7b8c4e49f0 100644 --- a/test/013-math2/src/Main.java +++ b/test/013-math2/src/Main.java @@ -26,7 +26,9 @@ public class Main { // a 16-bit constant a += 32000; - System.out.println("a:" +a); + b -= 32000; + System.out.println("a:" + a); + System.out.println("b:" + b); } public static void main(String args[]) { math_013(); diff --git a/test/405-optimizing-long-allocator/expected.txt b/test/405-optimizing-long-allocator/expected.txt new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/test/405-optimizing-long-allocator/expected.txt diff --git a/test/405-optimizing-long-allocator/info.txt b/test/405-optimizing-long-allocator/info.txt new file mode 100644 index 0000000000..b6b31aeddb --- /dev/null +++ b/test/405-optimizing-long-allocator/info.txt @@ -0,0 +1 @@ +Tests with long for the optimizing compiler's register allocator. diff --git a/test/405-optimizing-long-allocator/src/Main.java b/test/405-optimizing-long-allocator/src/Main.java new file mode 100644 index 0000000000..9fd840b543 --- /dev/null +++ b/test/405-optimizing-long-allocator/src/Main.java @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Note that $opt$ is a marker for the optimizing compiler to ensure +// it compiles these methods. 
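The jni_internal.cc hunk above makes ExceptionDescribe() return early when no exception is pending, which the new JniInternalTest exercises by calling ExceptionClear() followed by ExceptionDescribe(). From native code the call is now safe either way; a typical defensive pattern still checks first:

    #include <jni.h>

    // Logs and clears any pending exception; safe to call whether or not one is pending.
    void DescribeAndClearPendingException(JNIEnv* env) {
      if (env->ExceptionCheck()) {  // optional now that ExceptionDescribe tolerates "no exception"
        env->ExceptionDescribe();   // prints the exception and its stack trace
        env->ExceptionClear();      // leave the thread in a clean state
      }
    }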
+ +public class Main { + public static void main(String[] args) { + + expectEquals(4, $opt$TestLostCopy()); + expectEquals(-10, $opt$TestTwoLive()); + expectEquals(-20, $opt$TestThreeLive()); + expectEquals(5, $opt$TestFourLive()); + expectEquals(10, $opt$TestMultipleLive()); + expectEquals(1, $opt$TestWithBreakAndContinue()); + expectEquals(-15, $opt$testSpillInIf(5, 6, 7)); + expectEquals(-567, $opt$TestAgressiveLive1(1, 2, 3, 4, 5, 6, 7)); + expectEquals(-77, $opt$TestAgressiveLive2(1, 2, 3, 4, 5, 6, 7)); + + expectEquals(-55834574850L, $opt$testSpillInIf(5, 6L << 32, 7L << 32)); + expectEquals(-73014444553L, $opt$TestAgressiveLive1( + 1L << 32, (1L << 32) + 1, 3L << 32, 4L << 32, 5L << 32, 6L << 32, (1L << 32) + 2)); + expectEquals(-124554051632L, $opt$TestAgressiveLive2( + 1L << 32, (1L << 32) + 1, 3L << 32, 4L << 32, 5L << 32, 6L << 32, 7L << 32)); + } + + public static long $opt$TestLostCopy() { + long a = 0; + long b = 0; + do { + b = a; + a++; + } while (a != 5); + return b; + } + + public static long $opt$TestTwoLive() { + long a = 0; + long b = 0; + do { + a++; + b += 3; + } while (a != 5); + return a - b; + } + + public static long $opt$TestThreeLive() { + long a = 0; + long b = 0; + long c = 0; + do { + a++; + b += 3; + c += 2; + } while (a != 5); + return a - b - c; + } + + public static long $opt$TestFourLive() { + long a = 0; + long b = 0; + long c = 0; + long d = 0; + do { + a++; + b += 3; + c += 2; + d++; + } while (a != 5); + return d; + } + + public static long $opt$TestMultipleLive() { + long a = 0; + long b = 0; + long c = 0; + long d = 0; + long e = 0; + long f = 0; + long g = 0; + do { + a++; + b++; + c++; + d++; + e += 3; + f += 2; + g += 2; + } while (a != 5); + return f; + } + + public static long $opt$TestWithBreakAndContinue() { + long a = 0; + long b = 0; + do { + a++; + if (a == 2) { + continue; + } + b++; + if (a == 5) { + break; + } + } while (true); + return a - b; + } + + public static long $opt$testSpillInIf(long a, long b, long c) { + long d = 0; + long e = 0; + if (a == 5) { + b++; + c++; + d += 2; + e += 3; + } + + return a - b - c - d - e; + } + + public static long $opt$TestAgressiveLive1(long a, long b, long c, long d, long e, long f, long g) { + long h = a - b; + long i = c - d; + long j = e - f; + long k = 42 + g - a; + do { + b++; + while (k != 1) { + --k; + ++i; + if (i == 9) { + ++i; + } + j += 5; + } + k = 9; + h++; + } while (h != 5); + return a - b - c - d - e - f - g - h - i - j - k; + } + + public static long $opt$TestAgressiveLive2(long a, long b, long c, long d, long e, long f, long g) { + long h = a - b; + long i = c - d; + long j = e - f; + long k = 42 + g - a; + do { + h++; + } while (h != 5); + return a - b - c - d - e - f - g - h - i - j - k; + } + + public static void expectEquals(long expected, long value) { + if (expected != value) { + throw new Error("Expected: " + expected + ", got: " + value); + } + } +} diff --git a/test/Android.oat.mk b/test/Android.oat.mk index da0ad8d0b5..a560a17608 100644 --- a/test/Android.oat.mk +++ b/test/Android.oat.mk @@ -163,14 +163,14 @@ ART_TEST_HOST_OAT_RULES := # All tests require the host executables, libarttest and the core images. 
ART_TEST_HOST_OAT_DEPENDENCIES := \ $(ART_HOST_EXECUTABLES) \ - $(ART_HOST_LIBRARY_PATH)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ - $(ART_HOST_LIBRARY_PATH)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ + $(ART_HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ + $(ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ $(HOST_CORE_IMG_OUT) ifneq ($(HOST_PREFER_32_BIT),true) ART_TEST_HOST_OAT_DEPENDENCIES += \ - $(2ND_ART_HOST_LIBRARY_PATH)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ - $(2ND_ART_HOST_LIBRARY_PATH)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ + $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ + $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ $(2ND_HOST_CORE_IMG_OUT) endif diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk index 13e967cca5..25bcf0a790 100644 --- a/test/Android.run-test.mk +++ b/test/Android.run-test.mk @@ -109,12 +109,12 @@ TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_EXECUTABLES) $(TARGET_CORE_IMG_OUT) $( # All tests require the host executables and the core images. ART_TEST_HOST_RUN_TEST_DEPENDENCIES := \ $(ART_HOST_EXECUTABLES) \ - $(ART_HOST_LIBRARY_PATH)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ + $(ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ $(HOST_CORE_IMG_OUT) ifneq ($(HOST_PREFER_32_BIT),true) ART_TEST_HOST_RUN_TEST_DEPENDENCIES += \ - $(2ND_ART_HOST_LIBRARY_PATH)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ + $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) \ $(2ND_HOST_CORE_IMG_OUT) endif @@ -47,12 +47,23 @@ PROG_DIR="$(cd "${PROG_NAME%/*}" ; pwd -P)" ANDROID_BUILD_TOP="$(cd "${PROG_DIR}/../../../../" ; pwd -P)/" ANDROID_HOST_OUT=$PROG_DIR/.. ANDROID_DATA=$PWD/android-data$$ +DALVIKVM_EXECUTABLE=$ANDROID_HOST_OUT/bin/dalvikvm + +function find_libdir() { + if [ "$(readlink "$DALVIKVM_EXECUTABLE")" = "dalvikvm64" ]; then + echo "lib64" + else + echo "lib" + fi +} + +LD_LIBRARY_PATH=$ANDROID_HOST_OUT/"$(find_libdir)" mkdir -p $ANDROID_DATA/dalvik-cache/{x86,x86_64} ANDROID_DATA=$ANDROID_DATA \ ANDROID_ROOT=$ANDROID_HOST_OUT \ - LD_LIBRARY_PATH=$ANDROID_HOST_OUT/lib \ - $invoke_with $ANDROID_HOST_OUT/bin/dalvikvm $lib \ + LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + $invoke_with $DALVIKVM_EXECUTABLE $lib \ -Ximage:$ANDROID_HOST_OUT/framework/core.art \ "$@" EXIT_STATUS=$? |