Merge "Replace the bool kDoReadBarrier template parameter with an enum."
diff --git a/Android.mk b/Android.mk
index 4351be9..92339af 100644
--- a/Android.mk
+++ b/Android.mk
@@ -46,9 +46,6 @@
 	rm -f $(ART_TEST_OUT)/*.odex
 	rm -f $(ART_TEST_OUT)/*.oat
 	rm -f $(ART_TEST_OUT)/*.art
-	rm -f $(DALVIK_CACHE_OUT)/*@classes.dex
-	rm -f $(DALVIK_CACHE_OUT)/*.oat
-	rm -f $(DALVIK_CACHE_OUT)/*.art
 	rm -f $(HOST_OUT_JAVA_LIBRARIES)/*.odex
 	rm -f $(HOST_OUT_JAVA_LIBRARIES)/*.oat
 	rm -f $(HOST_OUT_JAVA_LIBRARIES)/*.art
@@ -66,25 +63,31 @@
 	rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/JAVA_LIBRARIES/*_intermediates/javalib.odex
 	rm -f $(2ND_TARGET_OUT_INTERMEDIATES)/APPS/*_intermediates/*.odex
 endif
-	rm -rf /tmp/test-*/dalvik-cache/*@classes.dex
-	rm -rf /tmp/android-data/dalvik-cache/*@classes.dex
+	rm -rf /tmp/test-*/dalvik-cache/*
+	rm -rf /tmp/android-data/dalvik-cache/*
 
 .PHONY: clean-oat-target
 clean-oat-target:
 	adb remount
-	adb shell rm $(ART_NATIVETEST_DIR)/*.odex
-	adb shell rm $(ART_NATIVETEST_DIR)/*.oat
-	adb shell rm $(ART_NATIVETEST_DIR)/*.art
-	adb shell rm $(ART_TEST_DIR)/*.odex
-	adb shell rm $(ART_TEST_DIR)/*.oat
-	adb shell rm $(ART_TEST_DIR)/*.art
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
-	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
-	adb shell rm $(DEXPREOPT_BOOT_JAR_DIR)/*.oat
-	adb shell rm $(DEXPREOPT_BOOT_JAR_DIR)/*.art
-	adb shell rm system/app/*.odex
-	adb shell rm data/run-test/test-*/dalvik-cache/*@classes.dex
+	adb shell rm -f $(ART_NATIVETEST_DIR)/*.odex
+	adb shell rm -f $(ART_NATIVETEST_DIR)/*.oat
+	adb shell rm -f $(ART_NATIVETEST_DIR)/*.art
+	adb shell rm -f $(ART_TEST_DIR)/*.odex
+	adb shell rm -f $(ART_TEST_DIR)/*.oat
+	adb shell rm -f $(ART_TEST_DIR)/*.art
+ifdef TARGET_2ND_ARCH
+	adb shell rm -f $(2ND_ART_NATIVETEST_DIR)/*.odex
+	adb shell rm -f $(2ND_ART_NATIVETEST_DIR)/*.oat
+	adb shell rm -f $(2ND_ART_NATIVETEST_DIR)/*.art
+	adb shell rm -f $(2ND_ART_TEST_DIR)/*.odex
+	adb shell rm -f $(2ND_ART_TEST_DIR)/*.oat
+	adb shell rm -f $(2ND_ART_TEST_DIR)/*.art
+endif
+	adb shell rm -rf $(ART_DALVIK_CACHE_DIR)/*
+	adb shell rm -f $(DEXPREOPT_BOOT_JAR_DIR)/*.oat
+	adb shell rm -f $(DEXPREOPT_BOOT_JAR_DIR)/*.art
+	adb shell rm -f system/app/*.odex
+	adb shell rm -rf data/run-test/test-*/dalvik-cache/*
 
 ifneq ($(art_dont_bother),true)
 
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 36f1be7..651fa66 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -879,7 +879,7 @@
       new (arena_) ArenaBitVector(arena_, cu_->num_dalvik_registers, false, kBitMapLiveIn);
 
   for (mir = bb->first_mir_insn; mir != NULL; mir = mir->next) {
-    uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+    uint64_t df_attributes = GetDataFlowAttributes(mir);
     DecodedInstruction *d_insn = &mir->dalvikInsn;
 
     if (df_attributes & DF_HAS_USES) {
@@ -994,7 +994,7 @@
         static_cast<struct SSARepresentation *>(arena_->Alloc(sizeof(SSARepresentation),
                                                               kArenaAllocDFInfo));
 
-    uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+    uint64_t df_attributes = GetDataFlowAttributes(mir);
 
       // If not a pseudo-op, note non-leaf or can throw
     if (static_cast<int>(mir->dalvikInsn.opcode) <
@@ -1252,7 +1252,7 @@
       use_counts_.Put(s_reg, use_counts_.Get(s_reg) + weight);
     }
     if (!(cu_->disable_opt & (1 << kPromoteCompilerTemps))) {
-      uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+      uint64_t df_attributes = GetDataFlowAttributes(mir);
      // Implicit use of Method*?
       if (df_attributes & DF_UMS) {
         /*
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 8ce4f1f..6857edb 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -621,7 +621,7 @@
     int flags = Instruction::FlagsOf(insn->dalvikInsn.opcode);
     int verify_flags = Instruction::VerifyFlagsOf(insn->dalvikInsn.opcode);
 
-    uint64_t df_flags = oat_data_flow_attributes_[insn->dalvikInsn.opcode];
+    uint64_t df_flags = GetDataFlowAttributes(insn);
     merged_df_flags |= df_flags;
 
     if (df_flags & DF_HAS_DEFS) {
@@ -743,6 +743,17 @@
   }
 }
 
+uint64_t MIRGraph::GetDataFlowAttributes(Instruction::Code opcode) {
+  DCHECK_LT(static_cast<size_t>(opcode),
+            sizeof(oat_data_flow_attributes_) / sizeof(oat_data_flow_attributes_[0]));
+  return oat_data_flow_attributes_[opcode];
+}
+
+uint64_t MIRGraph::GetDataFlowAttributes(MIR* mir) {
+  DCHECK(mir != nullptr);
+  Instruction::Code opcode = mir->dalvikInsn.opcode;
+  return GetDataFlowAttributes(opcode);
+}
+
 // TODO: use a configurable base prefix, and adjust callers to supply pass name.
 /* Dump the CFG into a DOT graph */
 void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suffix) {
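Note: the mir_graph.cc/mir_graph.h changes funnel every read of the data-flow attribute table through a single accessor that range-checks the opcode in debug builds, instead of letting callers index the static array directly. A standalone sketch of the pattern (not ART code; assert stands in for DCHECK):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    class AttributeTable {
     public:
      static uint64_t Get(size_t opcode) {
        // The bounds check now lives in exactly one place.
        assert(opcode < sizeof(attributes_) / sizeof(attributes_[0]));
        return attributes_[opcode];
      }
     private:
      static const uint64_t attributes_[4];  // no longer reachable from call sites
    };
    const uint64_t AttributeTable::attributes_[4] = {0u, 1u, 2u, 4u};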
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 2c125f6..5997e5b 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -851,6 +851,9 @@
    */
   void CountUses(struct BasicBlock* bb);
 
+  static uint64_t GetDataFlowAttributes(Instruction::Code opcode);
+  static uint64_t GetDataFlowAttributes(MIR* mir);
+
   /**
    * @brief Combine BasicBlocks
    * @param the BasicBlock we are considering
@@ -868,7 +871,6 @@
   RegLocation* reg_location_;                         // Map SSA names to location.
   SafeMap<unsigned int, unsigned int> block_id_map_;  // Block collapse lookup cache.
 
-  static const uint64_t oat_data_flow_attributes_[kMirOpLast];
   static const char* extended_mir_op_names_[kMirOpLast - kMirOpFirst];
   static const uint32_t analysis_attributes_[kMirOpLast];
 
@@ -985,6 +987,7 @@
   GrowableArray<MirIFieldLoweringInfo> ifield_lowering_infos_;
   GrowableArray<MirSFieldLoweringInfo> sfield_lowering_infos_;
   GrowableArray<MirMethodLoweringInfo> method_lowering_infos_;
+  static const uint64_t oat_data_flow_attributes_[kMirOpLast];
 
   friend class ClassInitCheckEliminationTest;
   friend class LocalValueNumberingTest;
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 937e258..72c46cc 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -43,11 +43,11 @@
 
   for (mir = bb->first_mir_insn; mir != NULL; mir = mir->next) {
     // Skip pass if BB has MIR without SSA representation.
-    if (mir->ssa_rep == NULL) {
+    if (mir->ssa_rep == nullptr) {
        return;
     }
 
-    uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+    uint64_t df_attributes = GetDataFlowAttributes(mir);
 
     DecodedInstruction *d_insn = &mir->dalvikInsn;
 
@@ -559,7 +559,7 @@
       if (mir->ssa_rep == NULL) {
         continue;
       }
-      uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+      uint64_t df_attributes = GetDataFlowAttributes(mir);
       if (df_attributes & DF_HAS_NULL_CHKS) {
         checkstats_->null_checks++;
         if (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) {
@@ -644,7 +644,7 @@
     MIR* mir = bb->last_mir_insn;
     // Grab the attributes from the paired opcode
     MIR* throw_insn = mir->meta.throw_insn;
-    uint64_t df_attributes = oat_data_flow_attributes_[throw_insn->dalvikInsn.opcode];
+    uint64_t df_attributes = GetDataFlowAttributes(throw_insn);
     bool can_combine = true;
     if (df_attributes & DF_HAS_NULL_CHKS) {
       can_combine &= ((throw_insn->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0);
@@ -796,7 +796,7 @@
       continue;
     }
 
-    uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+    uint64_t df_attributes = GetDataFlowAttributes(mir);
 
     // Might need a null check?
     if (df_attributes & DF_HAS_NULL_CHKS) {
diff --git a/compiler/dex/mir_optimization_test.cc b/compiler/dex/mir_optimization_test.cc
index 40ced70..891d9fb 100644
--- a/compiler/dex/mir_optimization_test.cc
+++ b/compiler/dex/mir_optimization_test.cc
@@ -172,7 +172,7 @@
       mir->offset = 2 * i;  // All insns need to be at least 2 code units long.
       mir->width = 2u;
       mir->optimization_flags = 0u;
-      merged_df_flags |= MIRGraph::oat_data_flow_attributes_[def->opcode];
+      merged_df_flags |= MIRGraph::GetDataFlowAttributes(def->opcode);
     }
     cu_.mir_graph->merged_df_flags_ = merged_df_flags;
 
diff --git a/compiler/dex/portable/mir_to_gbc.cc b/compiler/dex/portable/mir_to_gbc.cc
index 70438ec..576e242 100644
--- a/compiler/dex/portable/mir_to_gbc.cc
+++ b/compiler/dex/portable/mir_to_gbc.cc
@@ -722,7 +722,7 @@
   /* Prep Src and Dest locations */
   int next_sreg = 0;
   int next_loc = 0;
-  uint64_t attrs = mir_graph_->oat_data_flow_attributes_[opcode];
+  uint64_t attrs = MIRGraph::GetDataFlowAttributes(opcode);
   rl_src[0] = rl_src[1] = rl_src[2] = mir_graph_->GetBadLoc();
   if (attrs & DF_UA) {
     if (attrs & DF_A_WIDE) {
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index b030bb4..0596d4f 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1018,8 +1018,8 @@
       vmap_encoder.PushBackUnsigned(fp_vmap_table_[i] + VmapTable::kEntryAdjustment);
     }
   } else {
-    DCHECK_EQ(__builtin_popcount(core_spill_mask_), 0);
-    DCHECK_EQ(__builtin_popcount(fp_spill_mask_), 0);
+    DCHECK_EQ(POPCOUNT(core_spill_mask_), 0);
+    DCHECK_EQ(POPCOUNT(fp_spill_mask_), 0);
     DCHECK_EQ(core_vmap_table_.size(), 0u);
     DCHECK_EQ(fp_vmap_table_.size(), 0u);
     vmap_encoder.PushBackUnsigned(0u);  // Size is 0.
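Note: POPCOUNT is ART's type-generic bit-count helper, substituted throughout this patch for raw __builtin_popcount calls; the builtin takes an unsigned int, so a wider mask would be silently truncated at the call site. A sketch of such a helper, assuming a dispatch-on-width implementation along the lines ART used at the time:

    #include <cstdint>

    // Sketch (assumption): pick the 32- or 64-bit builtin from the operand width.
    template <typename T>
    constexpr int POPCOUNT(T x) {
      return (sizeof(T) <= sizeof(uint32_t))
          ? __builtin_popcount(static_cast<uint32_t>(x))
          : __builtin_popcountll(static_cast<uint64_t>(x));
    }
    static_assert(POPCOUNT(0xF0u) == 4, "four bits set");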
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index df7a7c1..107987e 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -42,7 +42,7 @@
   RegStorage reg_arg_high = wide ? GetArgMappingToPhysicalReg(in_position + 1) :
       RegStorage::InvalidReg();
 
-  int offset = StackVisitor::GetOutVROffset(in_position);
+  int offset = StackVisitor::GetOutVROffset(in_position, cu_->instruction_set);
   if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
     /*
      * When doing a call for x86, it moves the stack pointer in order to push return.
@@ -81,7 +81,7 @@
 }
 
 void Mir2Lir::LoadArgDirect(int in_position, RegLocation rl_dest) {
-  int offset = StackVisitor::GetOutVROffset(in_position);
+  int offset = StackVisitor::GetOutVROffset(in_position, cu_->instruction_set);
   if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
     /*
      * When doing a call for x86, it moves the stack pointer in order to push return.
@@ -286,7 +286,7 @@
   // Prep Src and Dest locations.
   int next_sreg = 0;
   int next_loc = 0;
-  uint64_t attrs = mir_graph_->oat_data_flow_attributes_[opcode];
+  uint64_t attrs = MIRGraph::GetDataFlowAttributes(opcode);
   rl_src[0] = rl_src[1] = rl_src[2] = mir_graph_->GetBadLoc();
   if (attrs & DF_UA) {
     if (attrs & DF_A_WIDE) {
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 39783a2..6455572 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1141,7 +1141,8 @@
 /* Returns sp-relative offset in bytes for a VReg */
 int Mir2Lir::VRegOffset(int v_reg) {
   return StackVisitor::GetVRegOffset(cu_->code_item, core_spill_mask_,
-                                     fp_spill_mask_, frame_size_, v_reg);
+                                     fp_spill_mask_, frame_size_, v_reg,
+                                     cu_->instruction_set);
 }
 
 /* Returns sp-relative offset in bytes for a SReg */
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 4d45055..b972d08 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -889,7 +889,7 @@
 
 void X86Mir2Lir::AnalyzeFPInstruction(int opcode, BasicBlock * bb, MIR *mir) {
   // Look at all the uses, and see if they are double constants.
-  uint64_t attrs = mir_graph_->oat_data_flow_attributes_[opcode];
+  uint64_t attrs = MIRGraph::GetDataFlowAttributes(static_cast<Instruction::Code>(opcode));
   int next_sreg = 0;
   if (attrs & DF_UA) {
     if (attrs & DF_A_WIDE) {
diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc
index 4be0f59..d5c2598 100644
--- a/compiler/dex/vreg_analysis.cc
+++ b/compiler/dex/vreg_analysis.cc
@@ -124,7 +124,7 @@
 bool MIRGraph::InferTypeAndSize(BasicBlock* bb, MIR* mir, bool changed) {
   SSARepresentation *ssa_rep = mir->ssa_rep;
   if (ssa_rep) {
-    uint64_t attrs = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
+    uint64_t attrs = GetDataFlowAttributes(mir);
     const int* uses = ssa_rep->uses;
     const int* defs = ssa_rep->defs;
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 00a239b..604ce1c 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -71,6 +71,11 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
 
+  // AAPCS mandates return values are extended, so no extra extension is needed here.
+  bool RequiresSmallResultTypeExtension() const OVERRIDE {
+    return false;
+  }
+
  protected:
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 92f547c..9fd3265 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -68,6 +68,11 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
 
+  // The AArch64 calling convention leaves the upper bits of a small result undefined.
+  bool RequiresSmallResultTypeExtension() const OVERRIDE {
+    return true;
+  }
+
  protected:
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 4d25d1c..18afd58 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -287,6 +287,8 @@
   FrameOffset ReturnValueSaveLocation() const;
   // Register that holds result if it is integer.
   virtual ManagedRegister IntReturnRegister() = 0;
+  // Whether the compiler needs to ensure zero-/sign-extension of small result types.
+  virtual bool RequiresSmallResultTypeExtension() const = 0;
 
   // Callee save registers to spill prior to native code (which may clobber)
   virtual const std::vector<ManagedRegister>& CalleeSaveRegisters() const = 0;
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 93b1b5a..9f439eb 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -314,7 +314,7 @@
           mr_conv->InterproceduralScratchRegister());
 
   // 10. Fix differences in result widths.
-  if (instruction_set == kX86 || instruction_set == kX86_64) {
+  if (main_jni_conv->RequiresSmallResultTypeExtension()) {
     if (main_jni_conv->GetReturnType() == Primitive::kPrimByte ||
         main_jni_conv->GetReturnType() == Primitive::kPrimShort) {
       __ SignExtend(main_jni_conv->ReturnRegister(),
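Note: this replaces a hard-coded ISA list with a per-calling-convention predicate. The distinction matters because ABIs disagree on who widens a sub-32-bit return value: per the header comments in this patch, ARM's AAPCS makes the callee extend (so its convention answers false), while AArch64, x86, and x86-64 leave it to the caller (so theirs answer true). An illustration of the caller-side hazard the predicate guards against (not ART code):

    #include <cstdint>

    // On AArch64 the low byte of w0 holds a byte-sized result, but bits 8..31
    // are unspecified, so the caller must widen before any 32-bit use.
    int32_t WidenByteResult(int8_t raw) {
      return static_cast<int32_t>(raw);  // compiled as sxtb on AArch64
    }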
diff --git a/compiler/jni/quick/mips/calling_convention_mips.h b/compiler/jni/quick/mips/calling_convention_mips.h
index e33fbad..8d82dce 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.h
+++ b/compiler/jni/quick/mips/calling_convention_mips.h
@@ -71,6 +71,11 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
 
+  // MIPS does not need to extend small return types.
+  bool RequiresSmallResultTypeExtension() const OVERRIDE {
+    return false;
+  }
+
  protected:
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 5b9069c..025eb6d 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -69,6 +69,11 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
 
+  // x86 needs to extend small return types.
+  bool RequiresSmallResultTypeExtension() const OVERRIDE {
+    return true;
+  }
+
  protected:
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index d545774..1ba5353 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -69,6 +69,11 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
 
+  // x86-64 needs to extend small return types.
+  bool RequiresSmallResultTypeExtension() const OVERRIDE {
+    return true;
+  }
+
  protected:
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index effc38e..5c839dd 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -1107,7 +1107,7 @@
   // The offset is off by 8 due to the way the ARM CPUs read PC.
   offset -= 8;
   CHECK_ALIGNED(offset, 4);
-  CHECK(IsInt(CountOneBits(kBranchOffsetMask), offset)) << offset;
+  CHECK(IsInt(POPCOUNT(kBranchOffsetMask), offset)) << offset;
 
   // Properly preserve only the bits supported in the instruction.
   offset >>= 2;
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 1d87eaa..b4bb979 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -467,12 +467,26 @@
 #endif
 }
 
-void Arm64Assembler::SignExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no sign extension necessary for Arm64";
+void Arm64Assembler::SignExtend(ManagedRegister mreg, size_t size) {
+  Arm64ManagedRegister reg = mreg.AsArm64();
+  CHECK(size == 1 || size == 2) << size;
+  CHECK(reg.IsWRegister()) << reg;
+  if (size == 1) {
+    ___ sxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+  } else {
+    ___ sxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+  }
 }
 
-void Arm64Assembler::ZeroExtend(ManagedRegister /*mreg*/, size_t /*size*/) {
-  UNIMPLEMENTED(FATAL) << "no zero extension necessary for Arm64";
+void Arm64Assembler::ZeroExtend(ManagedRegister mreg, size_t size) {
+  Arm64ManagedRegister reg = mreg.AsArm64();
+  CHECK(size == 1 || size == 2) << size;
+  CHECK(reg.IsWRegister()) << reg;
+  if (size == 1) {
+    ___ uxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+  } else {
+    ___ uxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+  }
 }
 
 void Arm64Assembler::VerifyObject(ManagedRegister /*src*/, bool /*could_be_null*/) {
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 45d3a97..9001f8a 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -123,7 +123,7 @@
 
 int32_t MipsAssembler::EncodeBranchOffset(int offset, int32_t inst, bool is_jump) {
   CHECK_ALIGNED(offset, 4);
-  CHECK(IsInt(CountOneBits(kBranchOffsetMask), offset)) << offset;
+  CHECK(IsInt(POPCOUNT(kBranchOffsetMask), offset)) << offset;
 
   // Properly preserve only the bits supported in the instruction.
   offset >>= 2;
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 823b818..cdf26f1 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -1010,8 +1010,8 @@
   }
 
   if (compiler_filter_string == NULL) {
-    if (instruction_set == kX86_64 || instruction_set == kArm64) {
-      // TODO: currently x86-64 and arm64 are only interpreted.
+    if (instruction_set == kX86_64 || instruction_set == kArm64 || instruction_set == kMips) {
+      // TODO: implement/fix compilers for these architectures.
       compiler_filter_string = "interpret-only";
     } else if (image) {
       compiler_filter_string = "speed";
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 1a67952..412a052 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -495,7 +495,8 @@
       } else {
         uint32_t offset = StackVisitor::GetVRegOffset(code_item, oat_method.GetCoreSpillMask(),
                                                       oat_method.GetFpSpillMask(),
-                                                      oat_method.GetFrameSizeInBytes(), reg);
+                                                      oat_method.GetFrameSizeInBytes(), reg,
+                                                      GetInstructionSet());
         os << "[sp + #" << offset << "]";
       }
     }
diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc
index 102e126..0e1b25e 100644
--- a/runtime/arch/arm/context_arm.cc
+++ b/runtime/arch/arm/context_arm.cc
@@ -44,8 +44,8 @@
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
   uint32_t fp_core_spills = method->GetFpSpillMask();
-  size_t spill_count = __builtin_popcount(core_spills);
-  size_t fp_spill_count = __builtin_popcount(fp_core_spills);
+  size_t spill_count = POPCOUNT(core_spills);
+  size_t fp_spill_count = POPCOUNT(fp_core_spills);
   size_t frame_size = method->GetFrameSizeInBytes();
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index 3bbec71..eddaa0b 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -233,9 +233,9 @@
 
   mirror::ArtMethod* method = reinterpret_cast<mirror::ArtMethod*>(sc->arm_r0);
   uint32_t spill_mask = method->GetCoreSpillMask();
-  uint32_t numcores = __builtin_popcount(spill_mask);
+  uint32_t numcores = POPCOUNT(spill_mask);
   uint32_t fp_spill_mask = method->GetFpSpillMask();
-  uint32_t numfps = __builtin_popcount(fp_spill_mask);
+  uint32_t numfps = POPCOUNT(fp_spill_mask);
   uint32_t spill_size = (numcores + numfps) * 4;
   LOG(DEBUG) << "spill size: " << spill_size;
   uint8_t* prevframe = prevsp + spill_size;
diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc
index c96ff60..0890fa9 100644
--- a/runtime/arch/arm64/context_arm64.cc
+++ b/runtime/arch/arm64/context_arm64.cc
@@ -47,8 +47,8 @@
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
   uint32_t fp_core_spills = method->GetFpSpillMask();
-  size_t spill_count = __builtin_popcount(core_spills);
-  size_t fp_spill_count = __builtin_popcount(fp_core_spills);
+  size_t spill_count = POPCOUNT(core_spills);
+  size_t fp_spill_count = POPCOUNT(fp_core_spills);
   size_t frame_size = method->GetFrameSizeInBytes();
 
   if (spill_count > 0) {
diff --git a/runtime/arch/mips/context_mips.cc b/runtime/arch/mips/context_mips.cc
index b957708..0950e71 100644
--- a/runtime/arch/mips/context_mips.cc
+++ b/runtime/arch/mips/context_mips.cc
@@ -43,8 +43,8 @@
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
   uint32_t fp_core_spills = method->GetFpSpillMask();
-  size_t spill_count = __builtin_popcount(core_spills);
-  size_t fp_spill_count = __builtin_popcount(fp_core_spills);
+  size_t spill_count = POPCOUNT(core_spills);
+  size_t fp_spill_count = POPCOUNT(fp_core_spills);
   size_t frame_size = method->GetFrameSizeInBytes();
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 7027b32..8fbca94 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -183,12 +183,12 @@
 }
 
 
-#if defined(__i386__) || defined(__arm__)
+#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
 extern "C" void art_quick_lock_object(void);
 #endif
 
 TEST_F(StubTest, LockObject) {
-#if defined(__i386__) || defined(__arm__)
+#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
   Thread* self = Thread::Current();
   // Create an object
   ScopedObjectAccess soa(self);
@@ -633,4 +633,79 @@
 #endif
 }
 
+
+#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+extern "C" void art_quick_string_compareto(void);
+#endif
+
+TEST_F(StubTest, StringCompareTo) {
+  TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
+
+#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+  // TODO: Check the "Unresolved" allocation stubs
+
+  Thread* self = Thread::Current();
+  ScopedObjectAccess soa(self);
+  // garbage is created during ClassLinker::Init
+
+  // Create some strings
+  // Use array so we can index into it and use a matrix for expected results
+  constexpr size_t string_count = 7;
+  const char* c[string_count] = { "", "", "a", "aa", "ab", "aac", "aac" };
+
+  SirtRef<mirror::String>* s[string_count];
+
+  for (size_t i = 0; i < string_count; ++i) {
+    s[i] = new SirtRef<mirror::String>(soa.Self(), mirror::String::AllocFromModifiedUtf8(soa.Self(),
+                                                                                         c[i]));
+  }
+
+  // TODO: wide characters
+
+  // Matrix of expectations. The first index selects the "this" string, the second the
+  // argument. Note we only check the sign of the result, not its exact value.
+  int32_t expected[string_count][string_count] = {
+      {  0,  0, -1, -1, -1, -1, -1 },  // ""
+      {  0,  0, -1, -1, -1, -1, -1 },  // ""
+      {  1,  1,  0, -1, -1, -1, -1 },  // "a"
+      {  1,  1,  1,  0, -1, -1, -1 },  // "aa"
+      {  1,  1,  1,  1,  0,  1,  1 },  // "ab"
+      {  1,  1,  1,  1, -1,  0,  0 },  // "aac"
+      {  1,  1,  1,  1, -1,  0,  0 }   // "aac"
+  //    ""  ""   a  aa  ab  aac aac
+  };
+
+  // Play with it...
+
+  for (size_t x = 0; x < string_count; ++x) {
+    for (size_t y = 0; y < string_count; ++y) {
+      // Test string_compareto x y
+      size_t result = Invoke3(reinterpret_cast<size_t>(s[x]->get()),
+                              reinterpret_cast<size_t>(s[y]->get()), 0U,
+                              reinterpret_cast<uintptr_t>(&art_quick_string_compareto), self);
+
+      EXPECT_FALSE(self->IsExceptionPending());
+
+      // The result is a 32b signed integer
+      union {
+        size_t r;
+        int32_t i;
+      } conv;
+      conv.r = result;
+      int32_t e = expected[x][y];
+      EXPECT_TRUE(e == 0 ? conv.i == 0 : true) << "x=" << c[x] << " y=" << c[y];
+      EXPECT_TRUE(e < 0 ? conv.i < 0 : true)   << "x=" << c[x] << " y="  << c[y];
+      EXPECT_TRUE(e > 0 ? conv.i > 0 : true)   << "x=" << c[x] << " y=" << c[y];
+    }
+  }
+
+  // Tests done.
+#else
+  LOG(INFO) << "Skipping string_compareto as I don't know how to do that on " << kRuntimeISA;
+  // Force-print to std::cout so it's also outside the logcat.
+  std::cout << "Skipping string_compareto as I don't know how to do that on " << kRuntimeISA <<
+      std::endl;
+#endif
+}
+
 }  // namespace art
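Note: the test checks only the sign of the stub's result because String.compareTo's contract fixes the sign, not the magnitude. The three one-sided EXPECT_TRUE ternaries above implement that; an equivalent, more direct form would be (hypothetical helper, not part of the patch):

    // Normalize to {-1, 0, 1} and compare once.
    static int Sign(int32_t v) { return (v > 0) - (v < 0); }
    // EXPECT_EQ(Sign(expected[x][y]), Sign(conv.i)) << "x=" << c[x] << " y=" << c[y];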
diff --git a/runtime/arch/x86/context_x86.cc b/runtime/arch/x86/context_x86.cc
index 5cf3001..c68d76a 100644
--- a/runtime/arch/x86/context_x86.cc
+++ b/runtime/arch/x86/context_x86.cc
@@ -38,7 +38,7 @@
 void X86Context::FillCalleeSaves(const StackVisitor& fr) {
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
-  size_t spill_count = __builtin_popcount(core_spills);
+  size_t spill_count = POPCOUNT(core_spills);
   DCHECK_EQ(method->GetFpSpillMask(), 0u);
   size_t frame_size = method->GetFrameSizeInBytes();
   if (spill_count > 0) {
diff --git a/runtime/arch/x86_64/context_x86_64.cc b/runtime/arch/x86_64/context_x86_64.cc
index 3f1f86d..29a7065 100644
--- a/runtime/arch/x86_64/context_x86_64.cc
+++ b/runtime/arch/x86_64/context_x86_64.cc
@@ -42,8 +42,8 @@
   mirror::ArtMethod* method = fr.GetMethod();
   uint32_t core_spills = method->GetCoreSpillMask();
   uint32_t fp_core_spills = method->GetFpSpillMask();
-  size_t spill_count = __builtin_popcount(core_spills);
-  size_t fp_spill_count = __builtin_popcount(fp_core_spills);
+  size_t spill_count = POPCOUNT(core_spills);
+  size_t fp_spill_count = POPCOUNT(fp_core_spills);
   size_t frame_size = method->GetFrameSizeInBytes();
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 4fefd20..9ccf6c9 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -661,13 +661,61 @@
 TWO_ARG_DOWNCALL art_quick_handle_fill_data, artHandleFillArrayDataFromCode, RETURN_IF_EAX_ZERO
 
 DEFINE_FUNCTION art_quick_lock_object
-    int3
-    int3
+    testl %edi, %edi                      // Null check object/rdi.
+    jz   .Lslow_lock
+.Lretry_lock:
+    movl LOCK_WORD_OFFSET(%edi), %ecx     // ecx := lock word.
+    test LITERAL(0xC0000000), %ecx        // Test the 2 high bits.
+    jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    test %ecx, %ecx
+    jnz  .Lalready_thin                   // Lock word contains a thin lock.
+    // unlocked case - %edx holds thread id with count of 0
+    xor  %eax, %eax                       // eax == 0 for comparison with lock word in cmpxchg
+    lock cmpxchg  %edx, LOCK_WORD_OFFSET(%edi)
+    jnz  .Lretry_lock                     // cmpxchg failed retry
+    ret
+.Lalready_thin:
+    cmpw %cx, %dx                         // do we hold the lock already?
+    jne  .Lslow_lock
+    addl LITERAL(65536), %ecx             // increment recursion count
+    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
+    jne  .Lslow_lock                      // count overflowed so go slow
+    movl %ecx, LOCK_WORD_OFFSET(%edi)     // update lockword, cmpxchg not necessary as we hold lock
+    ret
+.Lslow_lock:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME
+    movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
+    movq %rsp, %rdx                       // pass SP
+    call PLT_SYMBOL(artLockObjectFromCode)  // artLockObjectFromCode(object, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME    // restore frame up to return address
+    RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_lock_object
 
 DEFINE_FUNCTION art_quick_unlock_object
-    int3
-    int3
+    testl %edi, %edi                      // null check object/edi
+    jz   .Lslow_unlock
+    movl LOCK_WORD_OFFSET(%edi), %ecx     // ecx := lock word
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    test %ecx, %ecx
+    jb   .Lslow_unlock                    // lock word contains a monitor
+    cmpw %cx, %dx                         // does the thread id match?
+    jne  .Lslow_unlock
+    cmpl LITERAL(65536), %ecx
+    jae  .Lrecursive_thin_unlock
+    movl LITERAL(0), LOCK_WORD_OFFSET(%edi)
+    ret
+.Lrecursive_thin_unlock:
+    subl LITERAL(65536), %ecx
+    mov  %ecx, LOCK_WORD_OFFSET(%edi)
+    ret
+.Lslow_unlock:
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME
+    movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
+    movq %rsp, %rdx                       // pass SP
+    call PLT_SYMBOL(artUnlockObjectFromCode)  // artUnlockObjectFromCode(object, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME    // restore frame up to return address
+    RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_unlock_object
 
 DEFINE_FUNCTION art_quick_is_assignable
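Note: the lock-word layout this fast path assumes follows from the constants above: the state lives in the top two bits (the 0xC0000000 tests), the owner thread id in the low 16 bits (the cmpw), and the recursion count starts at bit 16 (the 65536 increment). A C-level sketch of the lock fast path, with __sync_bool_compare_and_swap standing in for lock cmpxchg (the assembly retries the compare-exchange instead of giving up):

    #include <cstdint>

    // Returns true on fast-path success; false means call artLockObjectFromCode.
    bool ThinLockFastPath(uint32_t* lock_word, uint32_t thread_id) {
      uint32_t lw = *lock_word;
      if ((lw & 0xC0000000u) != 0u) return false;   // fat lock or other state: slow path
      if (lw == 0u) {                               // unlocked: claim via CAS, count == 0
        return __sync_bool_compare_and_swap(lock_word, 0u, thread_id);
      }
      if ((lw & 0xFFFFu) != (thread_id & 0xFFFFu)) return false;  // held by another thread
      uint32_t bumped = lw + 65536u;                // recursive acquire: count is at bit 16
      if ((bumped & 0xC0000000u) != 0u) return false;  // count overflowed into state bits
      *lock_word = bumped;                          // already owner, plain store suffices
      return true;
    }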
@@ -1169,5 +1217,47 @@
 UNIMPLEMENTED art_quick_deoptimize
 
 UNIMPLEMENTED art_quick_indexof
-UNIMPLEMENTED art_quick_string_compareto
+
+    /*
+     * String's compareTo.
+     *
+     * On entry:
+     *    rdi:   this string object (known non-null)
+     *    rsi:   comp string object (known non-null)
+     */
+DEFINE_FUNCTION art_quick_string_compareto
+    movl STRING_COUNT_OFFSET(%edi), %r8d
+    movl STRING_COUNT_OFFSET(%esi), %r9d
+    movl STRING_VALUE_OFFSET(%edi), %r10d
+    movl STRING_VALUE_OFFSET(%esi), %r11d
+    movl STRING_OFFSET_OFFSET(%edi), %eax
+    movl STRING_OFFSET_OFFSET(%esi), %ecx
+    /* Build pointers to the start of string data */
+    leal STRING_DATA_OFFSET(%r10d, %eax, 2), %esi
+    leal STRING_DATA_OFFSET(%r11d, %ecx, 2), %edi
+    /* Calculate min length and count diff */
+    movl  %r8d, %ecx
+    movl  %r8d, %eax
+    subl  %r9d, %eax
+    cmovg %r9d, %ecx
+    /*
+     * At this point we have:
+     *   eax: value to return if first part of strings are equal
+     *   ecx: minimum among the lengths of the two strings
+     *   esi: pointer to this string data
+     *   edi: pointer to comp string data
+     */
+    jecxz .Lkeep_length
+    repe cmpsw                    // find nonmatching chars in [%esi] and [%edi], up to length %ecx
+    jne .Lnot_equal
+.Lkeep_length:
+    ret
+    .balign 16
+.Lnot_equal:
+    movzwl  -2(%esi), %eax        // get last compared char from this string
+    movzwl  -2(%edi), %ecx        // get last compared char from comp string
+    subl  %ecx, %eax              // return the difference
+    ret
+END_FUNCTION art_quick_string_compareto
+
 UNIMPLEMENTED art_quick_memcmp16
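Note: the stub compares the common prefix of the two strings with repe cmpsw and, if no code unit differs, returns the length difference — matching String.compareTo over UTF-16 code units. A C++ sketch of the same algorithm, ignoring the String object's value/offset indirection:

    #include <cstdint>

    int32_t CompareToSketch(const uint16_t* lhs, int32_t lhs_len,
                            const uint16_t* rhs, int32_t rhs_len) {
      int32_t min_len = lhs_len < rhs_len ? lhs_len : rhs_len;
      for (int32_t i = 0; i < min_len; ++i) {
        if (lhs[i] != rhs[i]) {
          // First differing code unit decides the order.
          return static_cast<int32_t>(lhs[i]) - static_cast<int32_t>(rhs[i]);
        }
      }
      return lhs_len - rhs_len;  // equal prefix: shorter string orders first
    }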
diff --git a/runtime/base/bit_vector.cc b/runtime/base/bit_vector.cc
index 12c0352..3df5101 100644
--- a/runtime/base/bit_vector.cc
+++ b/runtime/base/bit_vector.cc
@@ -201,7 +201,7 @@
 uint32_t BitVector::NumSetBits() const {
   uint32_t count = 0;
   for (uint32_t word = 0; word < storage_size_; word++) {
-    count += __builtin_popcount(storage_[word]);
+    count += POPCOUNT(storage_[word]);
   }
   return count;
 }
@@ -331,10 +331,10 @@
 
   uint32_t count = 0u;
   for (uint32_t word = 0u; word < word_end; word++) {
-    count += __builtin_popcount(storage[word]);
+    count += POPCOUNT(storage[word]);
   }
   if (partial_word_bits != 0u) {
-    count += __builtin_popcount(storage[word_end] & ~(0xffffffffu << partial_word_bits));
+    count += POPCOUNT(storage[word_end] & ~(0xffffffffu << partial_word_bits));
   }
   return count;
 }
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index c9e3c11..703229c 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -663,12 +663,8 @@
                               actual_image_oat_offset);
     return nullptr;
   }
-  // TODO: this registers the oat file now as we may use the oat_dex_file later and we want the
-  //       intern behavior of RegisterOatFile. However, if we take an early return we could remove
-  //       the oat file.
-  const OatFile* opened_oat_file = RegisterOatFile(oat_file.release());
-  const OatFile::OatDexFile* oat_dex_file = opened_oat_file->GetOatDexFile(dex_location,
-                                                                           &dex_location_checksum);
+  const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location,
+                                                                    &dex_location_checksum);
   if (oat_dex_file == nullptr) {
     *error_msg = StringPrintf("Failed to find oat file at '%s' containing '%s'", oat_location,
                               dex_location);
@@ -682,7 +678,11 @@
                               actual_dex_checksum);
     return nullptr;
   }
-  return oat_dex_file->OpenDexFile(error_msg);
+  const DexFile* dex_file = oat_dex_file->OpenDexFile(error_msg);
+  if (dex_file != nullptr) {
+    RegisterOatFile(oat_file.release());
+  }
+  return dex_file;
 }
 
 class ScopedFlock {
@@ -773,16 +773,15 @@
     error_msgs->push_back(error_msg);
     return nullptr;
   }
-  const OatFile* oat_file = OatFile::Open(oat_location, oat_location, NULL,
-                                          !Runtime::Current()->IsCompiler(),
-                                          &error_msg);
-  if (oat_file == nullptr) {
+  UniquePtr<OatFile> oat_file(OatFile::Open(oat_location, oat_location, NULL,
+                                            !Runtime::Current()->IsCompiler(),
+                                            &error_msg));
+  if (oat_file.get() == nullptr) {
     compound_msg = StringPrintf("\nFailed to open generated oat file '%s': %s",
                                 oat_location, error_msg.c_str());
     error_msgs->push_back(compound_msg);
     return nullptr;
   }
-  oat_file = RegisterOatFile(oat_file);
   const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location,
                                                                     &dex_location_checksum);
   if (oat_dex_file == nullptr) {
@@ -797,6 +796,7 @@
           << "dex_location=" << dex_location << " oat_location=" << oat_location << std::hex
           << " dex_location_checksum=" << dex_location_checksum
           << " DexFile::GetLocationChecksum()=" << result->GetLocationChecksum();
+  RegisterOatFile(oat_file.release());
   return result;
 }
 
@@ -857,32 +857,33 @@
     return nullptr;
   }
   *open_failed = false;
+  const DexFile* dex_file = nullptr;
   uint32_t dex_location_checksum;
   if (!DexFile::GetChecksum(dex_location, &dex_location_checksum, error_msg)) {
     // If no classes.dex found in dex_location, it has been stripped or is corrupt, assume oat is
     // up-to-date. This is the common case in user builds for jar's and apk's in the /system
     // directory.
-    const OatFile* opened_oat_file = oat_file.release();
-    opened_oat_file = RegisterOatFile(opened_oat_file);
-    const OatFile::OatDexFile* oat_dex_file = opened_oat_file->GetOatDexFile(dex_location, NULL);
+    const OatFile::OatDexFile* oat_dex_file = oat_file->GetOatDexFile(dex_location, NULL);
     if (oat_dex_file == nullptr) {
       *error_msg = StringPrintf("Dex checksum mismatch for location '%s' and failed to find oat "
                                 "dex file '%s': %s", oat_file_location.c_str(), dex_location,
                                 error_msg->c_str());
       return nullptr;
     }
-    return oat_dex_file->OpenDexFile(error_msg);
+    dex_file = oat_dex_file->OpenDexFile(error_msg);
+  } else {
+    bool verified = VerifyOatFileChecksums(oat_file.get(), dex_location, dex_location_checksum,
+                                           error_msg);
+    if (!verified) {
+      return nullptr;
+    }
+    dex_file = oat_file->GetOatDexFile(dex_location,
+                                       &dex_location_checksum)->OpenDexFile(error_msg);
   }
-
-  bool verified = VerifyOatFileChecksums(oat_file.get(), dex_location, dex_location_checksum,
-                                         error_msg);
-  if (!verified) {
-    return nullptr;
+  if (dex_file != nullptr) {
+    RegisterOatFile(oat_file.release());
   }
-  const OatFile* opened_oat_file = oat_file.release();
-  opened_oat_file = RegisterOatFile(opened_oat_file);
-  return opened_oat_file->GetOatDexFile(dex_location,
-                                        &dex_location_checksum)->OpenDexFile(error_msg);
+  return dex_file;
 }
 
 const DexFile* ClassLinker::FindDexFileInOatFileFromDexLocation(const char* dex_location,
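Note: all three class_linker.cc hunks apply the same fix: the oat file used to be registered with the runtime before the dex file was known to open, so an early return left behind a registered oat file. The new shape keeps ownership scoped and publishes only on success; in outline (hypothetical signatures, using std::unique_ptr where the patch uses ART's UniquePtr):

    #include <memory>

    struct OatFile;
    struct DexFile;
    DexFile* OpenDexFrom(OatFile* oat);   // may return nullptr (hypothetical)
    void RegisterOatFile(OatFile* oat);   // runtime takes ownership (hypothetical)

    DexFile* OpenAndRegister(std::unique_ptr<OatFile> oat_file) {
      DexFile* dex = OpenDexFrom(oat_file.get());
      if (dex != nullptr) {
        RegisterOatFile(oat_file.release());  // publish only on success
      }
      return dex;  // on failure the unregistered oat file is destroyed here
    }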
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 2b29591..680ffbe 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -34,12 +34,6 @@
 
 // Visits the arguments as saved to the stack by a Runtime::kRefAndArgs callee save frame.
 class QuickArgumentVisitor {
-  // Size of each spilled GPR.
-#ifdef __LP64__
-  static constexpr size_t kBytesPerGprSpillLocation = 8;
-#else
-  static constexpr size_t kBytesPerGprSpillLocation = 4;
-#endif
   // Number of bytes for each out register in the caller method's frame.
   static constexpr size_t kBytesStackArgLocation = 4;
 #if defined(__arm__)
@@ -61,13 +55,12 @@
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
-  static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 8;  // Offset of first GPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 44;  // Offset of return address.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 48;  // Frame size.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
-    return gpr_index * kBytesPerGprSpillLocation;
+    return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
 #elif defined(__aarch64__)
   // The callee save frame is pointed to by SP.
@@ -93,13 +86,12 @@
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr size_t kNumQuickGprArgs = 7;  // 7 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
-  static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 144;  // Offset of first GPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 296;  // Offset of return address.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 304;  // Frame size.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
-    return gpr_index * kBytesPerGprSpillLocation;
+    return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
 #elif defined(__mips__)
   // The callee save frame is pointed to by SP.
@@ -119,13 +111,12 @@
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
-  static constexpr size_t kBytesPerFprSpillLocation = 4;  // FPR spill size is 4 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 60;  // Offset of return address.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 64;  // Frame size.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
-    return gpr_index * kBytesPerGprSpillLocation;
+    return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
 #elif defined(__i386__)
   // The callee save frame is pointed to by SP.
@@ -145,13 +136,12 @@
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
-  static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28;  // Offset of return address.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 32;  // Frame size.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
-    return gpr_index * kBytesPerGprSpillLocation;
+    return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
 #elif defined(__x86_64__)
   // The callee save frame is pointed to by SP.
@@ -184,18 +174,17 @@
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr size_t kNumQuickGprArgs = 5;  // 5 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
-  static constexpr size_t kBytesPerFprSpillLocation = 8;  // FPR spill size is 8 bytes.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168;  // Offset of return address.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_FrameSize = 176;  // Frame size.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     switch (gpr_index) {
-      case 0: return (4 * kBytesPerGprSpillLocation);
-      case 1: return (1 * kBytesPerGprSpillLocation);
-      case 2: return (0 * kBytesPerGprSpillLocation);
-      case 3: return (5 * kBytesPerGprSpillLocation);
-      case 4: return (6 * kBytesPerGprSpillLocation);
+      case 0: return (4 * GetBytesPerGprSpillLocation(kRuntimeISA));
+      case 1: return (1 * GetBytesPerGprSpillLocation(kRuntimeISA));
+      case 2: return (0 * GetBytesPerGprSpillLocation(kRuntimeISA));
+      case 3: return (5 * GetBytesPerGprSpillLocation(kRuntimeISA));
+      case 4: return (6 * GetBytesPerGprSpillLocation(kRuntimeISA));
       default:
         LOG(FATAL) << "Unexpected GPR index: " << gpr_index;
         return 0;
@@ -248,7 +237,7 @@
       Primitive::Type type = GetParamPrimitiveType();
       if (UNLIKELY((type == Primitive::kPrimDouble) || (type == Primitive::kPrimFloat))) {
         if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
-          return fpr_args_ + (fpr_index_ * kBytesPerFprSpillLocation);
+          return fpr_args_ + (fpr_index_ * GetBytesPerFprSpillLocation(kRuntimeISA));
         }
         return stack_args_ + (stack_index_ * kBytesStackArgLocation);
       }
@@ -260,7 +249,7 @@
   }
 
   bool IsSplitLongOrDouble() const {
-    if ((kBytesPerGprSpillLocation == 4) || (kBytesPerFprSpillLocation == 4)) {
+    if ((GetBytesPerGprSpillLocation(kRuntimeISA) == 4) ||
+        (GetBytesPerFprSpillLocation(kRuntimeISA) == 4)) {
       return is_split_long_or_double_;
     } else {
       return false;  // An optimization for when GPR and FPRs are 64bit.
@@ -341,7 +330,7 @@
         case Primitive::kPrimDouble:
         case Primitive::kPrimLong:
           if (kQuickSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
-            is_split_long_or_double_ = (kBytesPerGprSpillLocation == 4) &&
+            is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
                 ((gpr_index_ + 1) == kNumQuickGprArgs);
             Visit();
             if (!kQuickSoftFloatAbi || kNumQuickGprArgs == gpr_index_) {
@@ -354,7 +343,7 @@
             }
             if (gpr_index_ < kNumQuickGprArgs) {
               gpr_index_++;
-              if (kBytesPerGprSpillLocation == 4) {
+              if (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) {
                 if (gpr_index_ < kNumQuickGprArgs) {
                   gpr_index_++;
                 } else if (kQuickSoftFloatAbi) {
@@ -363,12 +352,12 @@
               }
             }
           } else {
-            is_split_long_or_double_ = (kBytesPerFprSpillLocation == 4) &&
+            is_split_long_or_double_ = (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) &&
                 ((fpr_index_ + 1) == kNumQuickFprArgs);
             Visit();
             if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
               fpr_index_++;
-              if (kBytesPerFprSpillLocation == 4) {
+              if (GetBytesPerFprSpillLocation(kRuntimeISA) == 4) {
                 if ((kNumQuickFprArgs != 0) && (fpr_index_ + 1 < kNumQuickFprArgs + 1)) {
                   fpr_index_++;
                 }
@@ -393,12 +382,13 @@
                                              uint32_t shorty_len) {
     if (kQuickSoftFloatAbi) {
       CHECK_EQ(kNumQuickFprArgs, 0U);
-      return (kNumQuickGprArgs * kBytesPerGprSpillLocation) + kBytesPerGprSpillLocation /* ArtMethod* */;
+      return (kNumQuickGprArgs * GetBytesPerGprSpillLocation(kRuntimeISA))
+          + GetBytesPerGprSpillLocation(kRuntimeISA) /* ArtMethod* */;
     } else {
       // For now, there is no reg-spill area for the targets with
       // hard float ABI. So, the offset pointing to the first method's
       // parameter ('this' for non-static methods) should be returned.
-      return kBytesPerGprSpillLocation;  // Skip Method*.
+      return GetBytesPerGprSpillLocation(kRuntimeISA);  // Skip Method*.
     }
   }
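Note: the per-#ifdef kBytesPerGprSpillLocation/kBytesPerFprSpillLocation constants move into the InstructionSet helpers added later in this patch (runtime/instruction_set.cc). Called with the compile-time constant kRuntimeISA, a reasonable compiler can still inline and fold them. Usage sketch, with values read off the switch statements below:

    #include "instruction_set.h"  // declares the helpers and kRuntimeISA

    size_t GprSlotBytes() {
      return GetBytesPerGprSpillLocation(kRuntimeISA);  // 8 on arm64/x86-64, else 4
    }
    size_t FprSlotBytes() {
      return GetBytesPerFprSpillLocation(kRuntimeISA);  // 8 on arm64/x86/x86-64, else 4
    }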
 
diff --git a/runtime/gc/accounting/gc_allocator.h b/runtime/gc/accounting/gc_allocator.h
index 4fe9367..7dd7cca 100644
--- a/runtime/gc/accounting/gc_allocator.h
+++ b/runtime/gc/accounting/gc_allocator.h
@@ -73,7 +73,7 @@
 // GCAllocatorImpl<T> if kMeasureGCMemoryOverhead is true, std::allocator<T> otherwise.
 template <typename T>
 class GcAllocator : public TypeStaticIf<kMeasureGcMemoryOverhead, GcAllocatorImpl<T>,
-                                        std::allocator<T> >::value {
+                                        std::allocator<T> >::type {
 };
 
 }  // namespace accounting
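Note: a straight bug fix. TypeStaticIf is a compile-time type selector in the mold of std::conditional, and the chosen type is exposed as its ::type member, which is what a base-class position needs; ::named ::value, it selected nothing usable. The standard-library analogue:

    #include <type_traits>

    // std::conditional<Cond, A, B>::type is A when Cond is true, otherwise B.
    static_assert(std::is_same<std::conditional<true, int, long>::type, int>::value,
                  "the selector exposes the chosen type via ::type, not ::value");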
diff --git a/runtime/gc/gc_cause.cc b/runtime/gc/gc_cause.cc
index b25f7ff..9e73f14 100644
--- a/runtime/gc/gc_cause.cc
+++ b/runtime/gc/gc_cause.cc
@@ -29,7 +29,9 @@
     case kGcCauseBackground: return "Background";
     case kGcCauseExplicit: return "Explicit";
     case kGcCauseForNativeAlloc: return "NativeAlloc";
-    case kGcCauseCollectorTransition: return" CollectorTransition";
+    case kGcCauseCollectorTransition: return "CollectorTransition";
+    case kGcCauseDisableMovingGc: return "DisableMovingGc";
+    case kGcCauseTrim: return "HeapTrim";
     default:
       LOG(FATAL) << "Unreachable";
   }
diff --git a/runtime/gc/gc_cause.h b/runtime/gc/gc_cause.h
index 7499b9e..10e6667 100644
--- a/runtime/gc/gc_cause.h
+++ b/runtime/gc/gc_cause.h
@@ -35,6 +35,10 @@
   kGcCauseForNativeAlloc,
   // GC triggered for a collector transition.
   kGcCauseCollectorTransition,
+  // Not a real GC cause, used when we disable moving GC (currently for GetPrimitiveArrayCritical).
+  kGcCauseDisableMovingGc,
+  // Not a real GC cause, used when we trim the heap.
+  kGcCauseTrim,
 };
 
 const char* PrettyCause(GcCause cause);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 5d517bb..33026d1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -498,7 +498,7 @@
   MutexLock mu(self, *gc_complete_lock_);
   ++disable_moving_gc_count_;
   if (IsMovingGc(collector_type_running_)) {
-    WaitForGcToCompleteLocked(self);
+    WaitForGcToCompleteLocked(kGcCauseDisableMovingGc, self);
   }
 }
 
@@ -962,7 +962,7 @@
     // trimming.
     MutexLock mu(self, *gc_complete_lock_);
     // Ensure there is only one GC at a time.
-    WaitForGcToCompleteLocked(self);
+    WaitForGcToCompleteLocked(kGcCauseTrim, self);
     collector_type_running_ = kCollectorTypeHeapTrim;
   }
   uint64_t start_ns = NanoTime();
@@ -1171,7 +1171,7 @@
   SirtRef<mirror::Class> sirt_klass(self, *klass);
   // The allocation failed. If the GC is running, block until it completes, and then retry the
   // allocation.
-  collector::GcType last_gc = WaitForGcToComplete(self);
+  collector::GcType last_gc = WaitForGcToComplete(kGcCauseForAlloc, self);
   if (last_gc != collector::kGcTypeNone) {
     // If we were the default allocator but the allocator changed while we were suspended,
     // abort the allocation.
@@ -1418,7 +1418,7 @@
       ScopedThreadStateChange tsc(self, kWaitingForGcToComplete);
       MutexLock mu(self, *gc_complete_lock_);
       // Ensure there is only one GC at a time.
-      WaitForGcToCompleteLocked(self);
+      WaitForGcToCompleteLocked(kGcCauseCollectorTransition, self);
       // If someone else beat us to it and changed the collector before we could, exit.
       // This is safe to do before the suspend all since we set the collector_type_running_ before
       // we exit the loop. If another thread attempts to do the heap transition before we exit,
@@ -1819,7 +1819,7 @@
     ScopedThreadStateChange tsc(self, kWaitingForGcToComplete);
     MutexLock mu(self, *gc_complete_lock_);
     // Ensure there is only one GC at a time.
-    WaitForGcToCompleteLocked(self);
+    WaitForGcToCompleteLocked(gc_cause, self);
     compacting_gc = IsMovingGc(collector_type_);
     // GC can be disabled if someone is using GetPrimitiveArrayCritical.
     if (compacting_gc && disable_moving_gc_count_ != 0) {
@@ -2448,13 +2448,13 @@
   }
 }
 
-collector::GcType Heap::WaitForGcToComplete(Thread* self) {
+collector::GcType Heap::WaitForGcToComplete(GcCause cause, Thread* self) {
   ScopedThreadStateChange tsc(self, kWaitingForGcToComplete);
   MutexLock mu(self, *gc_complete_lock_);
-  return WaitForGcToCompleteLocked(self);
+  return WaitForGcToCompleteLocked(cause, self);
 }
 
-collector::GcType Heap::WaitForGcToCompleteLocked(Thread* self) {
+collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) {
   collector::GcType last_gc_type = collector::kGcTypeNone;
   uint64_t wait_start = NanoTime();
   while (collector_type_running_ != kCollectorTypeNone) {
@@ -2467,7 +2467,8 @@
   uint64_t wait_time = NanoTime() - wait_start;
   total_wait_time_ += wait_time;
   if (wait_time > long_pause_log_threshold_) {
-    LOG(INFO) << "WaitForGcToComplete blocked for " << PrettyDuration(wait_time);
+    LOG(INFO) << "WaitForGcToComplete blocked for " << PrettyDuration(wait_time)
+        << " for cause " << cause;
   }
   return last_gc_type;
 }
@@ -2659,7 +2660,7 @@
     return;
   }
   // Wait for any GCs currently running to finish.
-  if (WaitForGcToComplete(self) == collector::kGcTypeNone) {
+  if (WaitForGcToComplete(kGcCauseBackground, self) == collector::kGcTypeNone) {
     // If we can't run the GC type we wanted to run, find the next appropriate one and try that
     // instead. E.g. can't do partial, so do full instead.
     if (CollectGarbageInternal(next_gc_type_, kGcCauseBackground, false) ==
@@ -2792,7 +2793,7 @@
     // The second watermark is higher than the gc watermark. If you hit this it means you are
     // allocating native objects faster than the GC can keep up with.
     if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_limit_) {
-      if (WaitForGcToComplete(self) != collector::kGcTypeNone) {
+      if (WaitForGcToComplete(kGcCauseForNativeAlloc, self) != collector::kGcTypeNone) {
         // Just finished a GC, attempt to run finalizers.
         RunFinalization(env);
         CHECK(!env->ExceptionCheck());
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index d3b5cdc..d770024 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -300,7 +300,8 @@
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
   // waited for.
-  collector::GcType WaitForGcToComplete(Thread* self) LOCKS_EXCLUDED(gc_complete_lock_);
+  collector::GcType WaitForGcToComplete(GcCause cause, Thread* self)
+      LOCKS_EXCLUDED(gc_complete_lock_);
 
   // Update the heap's process state to a new value, may cause compaction to occur.
   void UpdateProcessState(ProcessState process_state);
@@ -641,7 +642,7 @@
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
   // waited for.
-  collector::GcType WaitForGcToCompleteLocked(Thread* self)
+  collector::GcType WaitForGcToCompleteLocked(GcCause cause, Thread* self)
       EXCLUSIVE_LOCKS_REQUIRED(gc_complete_lock_);
 
   void RequestCollectorTransition(CollectorType desired_collector_type, uint64_t delta_time)
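Note: threading a GcCause through WaitForGcToComplete/WaitForGcToCompleteLocked turns the anonymous "blocked for Xms" log line into an attributable one; the two new pseudo-causes (kGcCauseDisableMovingGc, kGcCauseTrim) exist only to label such waits, not to drive a collection. Example call and the resulting log shape (log text is a sketch):

    // The caller now names why it is waiting:
    collector::GcType last_gc = WaitForGcToComplete(kGcCauseForAlloc, self);
    // Long waits log, e.g.:
    //   WaitForGcToComplete blocked for 52.3ms for cause Alloc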
diff --git a/runtime/instruction_set.cc b/runtime/instruction_set.cc
index cbcd2e0..c1931a9 100644
--- a/runtime/instruction_set.cc
+++ b/runtime/instruction_set.cc
@@ -86,6 +86,52 @@
   }
 }
 
+size_t GetBytesPerGprSpillLocation(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      return 4;
+    case kArm64:
+      return 8;
+    case kX86:
+      return 4;
+    case kX86_64:
+      return 8;
+    case kMips:
+      return 4;
+    case kNone:
+      LOG(FATAL) << "ISA kNone does not have spills.";
+      return 0;
+    default:
+      LOG(FATAL) << "Unknown ISA " << isa;
+      return 0;
+  }
+}
+
+size_t GetBytesPerFprSpillLocation(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+      // Fall-through.
+    case kThumb2:
+      return 4;
+    case kArm64:
+      return 8;
+    case kX86:
+      return 8;
+    case kX86_64:
+      return 8;
+    case kMips:
+      return 4;
+    case kNone:
+      LOG(FATAL) << "ISA kNone does not have spills.";
+      return 0;
+    default:
+      LOG(FATAL) << "Unknown ISA " << isa;
+      return 0;
+  }
+}
+
 size_t GetInstructionSetAlignment(InstructionSet isa) {
   switch (isa) {
     case kArm:
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index 4bc35a7..bfbbbd6 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -41,6 +41,8 @@
 size_t GetInstructionSetPointerSize(InstructionSet isa);
 size_t GetInstructionSetAlignment(InstructionSet isa);
 bool Is64BitInstructionSet(InstructionSet isa);
+size_t GetBytesPerGprSpillLocation(InstructionSet isa);
+size_t GetBytesPerFprSpillLocation(InstructionSet isa);
 
 #if defined(__arm__)
 static constexpr InstructionSet kRuntimeISA = kArm;
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 64a849b..662303e 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -257,7 +257,7 @@
     for (Class* c = this; c != NULL; c = c->GetSuperClass()) {
       count += c->NumReferenceInstanceFieldsDuringLinking();
     }
-    CHECK_EQ((size_t)__builtin_popcount(new_reference_offsets), count);
+    CHECK_EQ((size_t)POPCOUNT(new_reference_offsets), count);
   }
   // Not called within a transaction.
   SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, reference_instance_offsets_),
@@ -268,7 +268,7 @@
   if (new_reference_offsets != CLASS_WALK_SUPER) {
     // Sanity check that the number of bits set in the reference offset bitmap
     // agrees with the number of references
-    CHECK_EQ((size_t)__builtin_popcount(new_reference_offsets),
+    CHECK_EQ((size_t)POPCOUNT(new_reference_offsets),
              NumReferenceStaticFieldsDuringLinking());
   }
   // Not called within a transaction.
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 20df78e..027feee 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -160,7 +160,7 @@
   Trace::Shutdown();
 
   // Make sure to let the GC complete if it is running.
-  heap_->WaitForGcToComplete(self);
+  heap_->WaitForGcToComplete(gc::kGcCauseBackground, self);
   heap_->DeleteThreadPool();
 
   // Make sure our internal threads are dead before we start tearing down things they're using.
@@ -1010,8 +1010,8 @@
                              (1 << art::arm::S27) | (1 << art::arm::S28) | (1 << art::arm::S29) |
                              (1 << art::arm::S30) | (1 << art::arm::S31);
     uint32_t fp_spills = type == kSaveAll ? fp_all_spills : 0;
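+    // The frame holds one 4-byte slot per spilled GPR, per spilled FPR, and for the Method*,
+    // rounded up to kStackAlignment.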
-    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
-                                 __builtin_popcount(fp_spills) /* fprs */ +
+    size_t frame_size = RoundUp((POPCOUNT(core_spills) /* gprs */ +
+                                 POPCOUNT(fp_spills) /* fprs */ +
                                  1 /* Method* */) * kArmPointerSize, kStackAlignment);
     method->SetFrameSizeInBytes(frame_size);
     method->SetCoreSpillMask(core_spills);
@@ -1024,7 +1024,7 @@
     uint32_t all_spills = (1 << art::mips::S0) | (1 << art::mips::S1);
     uint32_t core_spills = ref_spills | (type == kRefsAndArgs ? arg_spills : 0) |
                            (type == kSaveAll ? all_spills : 0) | (1 << art::mips::RA);
-    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
+    size_t frame_size = RoundUp((POPCOUNT(core_spills) /* gprs */ +
                                 (type == kRefsAndArgs ? 0 : 3) + 1 /* Method* */) *
                                 kMipsPointerSize, kStackAlignment);
     method->SetFrameSizeInBytes(frame_size);
@@ -1035,7 +1035,7 @@
     uint32_t arg_spills = (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
     uint32_t core_spills = ref_spills | (type == kRefsAndArgs ? arg_spills : 0) |
                          (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
-    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
+    size_t frame_size = RoundUp((POPCOUNT(core_spills) /* gprs */ +
                                  1 /* Method* */) * kX86PointerSize, kStackAlignment);
     method->SetFrameSizeInBytes(frame_size);
     method->SetCoreSpillMask(core_spills);
@@ -1054,8 +1054,8 @@
         (1 << art::x86_64::XMM3) | (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) |
         (1 << art::x86_64::XMM6) | (1 << art::x86_64::XMM7);
     uint32_t fp_spills = (type == kRefsAndArgs ? fp_arg_spills : 0);
-    size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
-                                 __builtin_popcount(fp_spills) /* fprs */ +
+    size_t frame_size = RoundUp((POPCOUNT(core_spills) /* gprs */ +
+                                 POPCOUNT(fp_spills) /* fprs */ +
                                  1 /* Method* */) * kX86_64PointerSize, kStackAlignment);
     method->SetFrameSizeInBytes(frame_size);
     method->SetCoreSpillMask(core_spills);
@@ -1094,8 +1094,8 @@
                           (1 << art::arm64::D31);
       uint32_t fp_spills = fp_ref_spills | (type == kRefsAndArgs ? fp_arg_spills : 0)
                           | (type == kSaveAll ? fp_all_spills : 0);
-      size_t frame_size = RoundUp((__builtin_popcount(core_spills) /* gprs */ +
-                                   __builtin_popcount(fp_spills) /* fprs */ +
+      size_t frame_size = RoundUp((POPCOUNT(core_spills) /* gprs */ +
+                                   POPCOUNT(fp_spills) /* fprs */ +
                                    1 /* Method* */) * kArm64PointerSize, kStackAlignment);
       method->SetFrameSizeInBytes(frame_size);
       method->SetCoreSpillMask(core_spills);
diff --git a/runtime/stack.cc b/runtime/stack.cc
index ab3bd85..9c709ae 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -187,7 +187,7 @@
       uint32_t core_spills = m->GetCoreSpillMask();
       uint32_t fp_spills = m->GetFpSpillMask();
       size_t frame_size = m->GetFrameSizeInBytes();
-      int offset = GetVRegOffset(code_item, core_spills, fp_spills, frame_size, vreg);
+      int offset = GetVRegOffset(code_item, core_spills, fp_spills, frame_size, vreg, kRuntimeISA);
       byte* vreg_addr = reinterpret_cast<byte*>(GetCurrentQuickFrame()) + offset;
       *reinterpret_cast<uint32_t*>(vreg_addr) = new_value;
     }
diff --git a/runtime/stack.h b/runtime/stack.h
index ab903d6..73a823a 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -19,11 +19,13 @@
 
 #include "dex_file.h"
 #include "instrumentation.h"
+#include "arch/context.h"
 #include "base/casts.h"
 #include "base/macros.h"
-#include "arch/context.h"
+#include "instruction_set.h"
 #include "mirror/object.h"
 #include "mirror/object_reference.h"
+#include "utils.h"
 #include "verify_object.h"
 
 #include <stdint.h>
@@ -577,7 +579,7 @@
   uint32_t* GetVRegAddr(mirror::ArtMethod** cur_quick_frame, const DexFile::CodeItem* code_item,
                         uint32_t core_spills, uint32_t fp_spills, size_t frame_size,
                         uint16_t vreg) const {
-    int offset = GetVRegOffset(code_item, core_spills, fp_spills, frame_size, vreg);
+    int offset = GetVRegOffset(code_item, core_spills, fp_spills, frame_size, vreg, kRuntimeISA);
     DCHECK_EQ(cur_quick_frame, GetCurrentQuickFrame());
     byte* vreg_addr = reinterpret_cast<byte*>(cur_quick_frame) + offset;
     return reinterpret_cast<uint32_t*>(vreg_addr);
@@ -634,14 +636,15 @@
    */
   static int GetVRegOffset(const DexFile::CodeItem* code_item,
                            uint32_t core_spills, uint32_t fp_spills,
-                           size_t frame_size, int reg) {
+                           size_t frame_size, int reg, InstructionSet isa) {
     DCHECK_EQ(frame_size & (kStackAlignment - 1), 0U);
     DCHECK_NE(reg, static_cast<int>(kVRegInvalid));
-
-    int num_spills = __builtin_popcount(core_spills) + __builtin_popcount(fp_spills) + 1;  // Filler.
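+    // Spill slot sizes are ISA-dependent; GPR and FPR slots may differ, e.g. 4 vs. 8 bytes
+    // on x86.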
+    int spill_size = POPCOUNT(core_spills) * GetBytesPerGprSpillLocation(isa)
+        + POPCOUNT(fp_spills) * GetBytesPerFprSpillLocation(isa)
+        + sizeof(uint32_t);  // Filler.
     int num_ins = code_item->ins_size_;
     int num_regs = code_item->registers_size_ - num_ins;
-    int locals_start = frame_size - ((num_spills + num_regs) * sizeof(uint32_t));
+    int locals_start = frame_size - spill_size - num_regs * sizeof(uint32_t);
     if (reg == static_cast<int>(kVRegMethodPtrBaseReg)) {
       // The current method pointer corresponds to a special location on the stack.
       return 0;
@@ -654,19 +657,20 @@
        * temp is at offset -4 bytes from locals, the second is at -8 bytes from locals,
        * and so on.
        */
-      int relative_offset = (reg + std::abs(static_cast<int>(kVRegNonSpecialTempBaseReg)) - 1) * sizeof(uint32_t);
+      int relative_offset =
+          (reg + std::abs(static_cast<int>(kVRegNonSpecialTempBaseReg)) - 1) * sizeof(uint32_t);
       return locals_start + relative_offset;
     } else if (reg < num_regs) {
       return locals_start + (reg * sizeof(uint32_t));
     } else {
       // Handle ins.
-      return frame_size + ((reg - num_regs) * sizeof(uint32_t)) + sizeof(StackReference<mirror::ArtMethod>);
+      return frame_size + ((reg - num_regs) * sizeof(uint32_t)) + GetBytesPerGprSpillLocation(isa);
     }
   }
 
-  static int GetOutVROffset(uint16_t out_num) {
+  static int GetOutVROffset(uint16_t out_num, InstructionSet isa) {
     // According to the stack model, the first out is just above the Method*.
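+    // The Method* occupies one GPR-sized slot, so out #0 sits just past it.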
-    return sizeof(StackReference<mirror::ArtMethod>) + (out_num * sizeof(uint32_t));
+    return GetBytesPerGprSpillLocation(isa) + (out_num * sizeof(uint32_t));
   }
 
   uintptr_t GetCurrentQuickFramePc() const {
diff --git a/runtime/utils.h b/runtime/utils.h
index 4b2f230..14a532e 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -47,7 +47,7 @@
 };
 
 template<typename T>
-static inline bool IsPowerOfTwo(T x) {
+static constexpr bool IsPowerOfTwo(T x) {
   return (x & (x - 1)) == 0;
 }
 
@@ -115,39 +115,46 @@
 }
 
 // A static if that selects type A or type B based on the boolean condition.
-template <const bool condition, typename A, typename B>
+template <bool condition, typename A, typename B>
 struct TypeStaticIf {
-  typedef A value;
+  typedef A type;
 };
 
 // Specialization to handle the false case.
 template <typename A, typename B>
 struct TypeStaticIf<false, A, B> {
-  typedef B value;
+  typedef B type;
+};
+
+// Type identity: a non-deduced context, so T is deduced from the other argument alone.
+template <typename T>
+struct TypeIdentity {
+  typedef T type;
 };
 
 // For rounding integers.
 template<typename T>
-static inline T RoundDown(T x, int n) {
-  DCHECK(IsPowerOfTwo(n));
-  return (x & -n);
+static constexpr T RoundDown(T x, typename TypeIdentity<T>::type n) {
+  return
+      // DCHECK(IsPowerOfTwo(n)) in a form acceptable in a constexpr function:
+      (kIsDebugBuild && !IsPowerOfTwo(n)) ? (LOG(FATAL) << n << " isn't a power of 2", T(0))
+      : (x & -n);
 }
 
 template<typename T>
-static inline T RoundUp(T x, int n) {
+static constexpr T RoundUp(T x, typename TypeIdentity<T>::type n) {
   return RoundDown(x + n - 1, n);
 }
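+// For example, RoundDown(13, 8) == 8 and RoundUp(13, 8) == 16.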
 
 // For aligning pointers.
 template<typename T>
-static inline T* AlignDown(T* x, int n) {
-  CHECK(IsPowerOfTwo(n));
-  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) & -static_cast<uintptr_t>(n));
+static inline T* AlignDown(T* x, uintptr_t n) {
+  return reinterpret_cast<T*>(RoundDown(reinterpret_cast<uintptr_t>(x), n));
 }
 
 template<typename T>
-static inline T* AlignUp(T* x, int n) {
-  return AlignDown(reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) + static_cast<uintptr_t>(n - 1)), n);
+static inline T* AlignUp(T* x, uintptr_t n) {
+  return reinterpret_cast<T*>(RoundUp(reinterpret_cast<uintptr_t>(x), n));
 }
 
 // Implementation is from "Hacker's Delight" by Henry S. Warren, Jr.,
@@ -162,33 +169,25 @@
   return x + 1;
 }
 
-// Implementation is from "Hacker's Delight" by Henry S. Warren, Jr.,
-// figure 5-2, page 66, where the function is called pop.
-static inline int CountOneBits(uint32_t x) {
-  x = x - ((x >> 1) & 0x55555555);
-  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-  x = (x + (x >> 4)) & 0x0F0F0F0F;
-  x = x + (x >> 8);
-  x = x + (x >> 16);
-  return static_cast<int>(x & 0x0000003F);
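+// Note: as with the underlying GCC builtins, CLZ(0) and CTZ(0) are undefined.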
+template<typename T>
+static constexpr int CLZ(T x) {
+  return (sizeof(T) == sizeof(uint32_t))
+      ? __builtin_clz(x)
+      : __builtin_clzll(x);
 }
 
 template<typename T>
-static inline int CLZ(T x) {
-  if (sizeof(T) == sizeof(uint32_t)) {
-    return __builtin_clz(x);
-  } else {
-    return __builtin_clzll(x);
-  }
+static constexpr int CTZ(T x) {
+  return (sizeof(T) == sizeof(uint32_t))
+      ? __builtin_ctz(x)
+      : __builtin_ctzll(x);
 }
 
 template<typename T>
-static inline int CTZ(T x) {
-  if (sizeof(T) == sizeof(uint32_t)) {
-    return __builtin_ctz(x);
-  } else {
-    return __builtin_ctzll(x);
-  }
+static constexpr int POPCOUNT(T x) {
+  return (sizeof(T) == sizeof(uint32_t))
+      ? __builtin_popcount(x)
+      : __builtin_popcountll(x);
 }
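+// For example, POPCOUNT(0xF0u) == 4. Types whose size differs from uint32_t dispatch to the
+// 64-bit builtins.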
 
 static inline uint32_t PointerToLowMemUInt32(const void* p) {
diff --git a/runtime/vmap_table.h b/runtime/vmap_table.h
index 2fbaebe..9821753 100644
--- a/runtime/vmap_table.h
+++ b/runtime/vmap_table.h
@@ -99,7 +99,7 @@
       }
       matches++;
     }
-    CHECK_LT(vmap_offset - matches, static_cast<uint32_t>(__builtin_popcount(spill_mask)));
+    CHECK_LT(vmap_offset - matches, static_cast<uint32_t>(POPCOUNT(spill_mask)));
     uint32_t spill_shifts = 0;
     while (matches != (vmap_offset + 1)) {
       DCHECK_NE(spill_mask, 0u);
diff --git a/test/JniTest/JniTest.java b/test/JniTest/JniTest.java
index d53cf5e..3c4ed35 100644
--- a/test/JniTest/JniTest.java
+++ b/test/JniTest/JniTest.java
@@ -24,6 +24,10 @@
         testCallStaticVoidMethodOnSubClass();
         testGetMirandaMethod();
         testZeroLengthByteBuffers();
+        testByteMethod();
+        testShortMethod();
+        testBooleanMethod();
+        testCharMethod();
     }
 
     private static native void testFindClassOnAttachedNativeThread();
@@ -79,4 +83,67 @@
     private static interface testGetMirandaMethod_MirandaInterface {
         public boolean inInterface();
     }
+
+    // Test sign-extension for values narrower than 32 bits.
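+    // Ten parameters are used so that some arguments are passed on the stack on register-based
+    // calling conventions, exercising extension of stack-passed values as well.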
+
+    static native byte byteMethod(byte b1, byte b2, byte b3, byte b4, byte b5, byte b6, byte b7,
+        byte b8, byte b9, byte b10);
+
+    private static void testByteMethod() {
+      byte[] returns = { 0, 1, 2, 127, -1, -2, -128 };
+      for (int i = 0; i < returns.length; i++) {
+        byte result = byteMethod((byte)i, (byte)2, (byte)(-3), (byte)4, (byte)(-5), (byte)6,
+            (byte)(-7), (byte)8, (byte)(-9), (byte)10);
+        if (returns[i] != result) {
+          System.out.println("Run " + i + " with " + returns[i] + " vs " + result);
+          throw new AssertionError();
+        }
+      }
+    }
+
+    static native short shortMethod(short s1, short s2, short s3, short s4, short s5,
+        short s6, short s7, short s8, short s9, short s10);
+
+    private static void testShortMethod() {
+      short[] returns = { 0, 1, 2, 127, 32767, -1, -2, -128, -32768 };
+      for (int i = 0; i < returns.length; i++) {
+        short result = shortMethod((short)i, (short)2, (short)(-3), (short)4, (short)(-5), (short)6,
+            (short)(-7), (short)8, (short)(-9), (short)10);
+        if (returns[i] != result) {
+          System.out.println("Run " + i + " with " + returns[i] + " vs " + result);
+          throw new AssertionError();
+        }
+      }
+    }
+
+    // Test zero-extension for values narrower than 32 bits.
+
+    static native boolean booleanMethod(boolean b1, boolean b2, boolean b3, boolean b4,
+        boolean b5, boolean b6, boolean b7, boolean b8, boolean b9, boolean b10);
+
+    private static void testBooleanMethod() {
+      if (booleanMethod(false, true, false, true, false, true, false, true, false, true)) {
+        throw new AssertionError();
+      }
+
+      if (!booleanMethod(true, true, false, true, false, true, false, true, false, true)) {
+        throw new AssertionError();
+      }
+    }
+
+    static native char charMethod(char c1, char c2, char c3, char c4, char c5, char c6, char c7,
+        char c8, char c9, char c10);
+
+    private static void testCharMethod() {
+      char[] returns = { (char)0, (char)1, (char)2, (char)127, (char)255, (char)256, (char)15000,
+          (char)34000 };
+      for (int i = 0; i < returns.length; i++) {
+        char result = charMethod((char)i, 'a', 'b', 'c', '0', '1', '2', (char)1234, (char)2345,
+            (char)3456);
+        if (returns[i] != result) {
+          System.out.println("Run " + i + " with " + (int)returns[i] + " vs " + (int)result);
+          throw new AssertionError();
+        }
+      }
+    }
 }
diff --git a/test/JniTest/jni_test.cc b/test/JniTest/jni_test.cc
index 33af94b..024ba53 100644
--- a/test/JniTest/jni_test.cc
+++ b/test/JniTest/jni_test.cc
@@ -137,3 +137,92 @@
   assert(env->GetDirectBufferAddress(byte_buffer) == &buffer[0]);
   assert(env->GetDirectBufferCapacity(byte_buffer) == 0);
 }
+
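+// The return tables below mirror the expected arrays in JniTest.java; the first argument of
+// each native method selects which entry is returned.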
+constexpr size_t kByteReturnSize = 7;
+jbyte byte_returns[kByteReturnSize] = { 0, 1, 2, 127, -1, -2, -128 };
+
+extern "C" jbyte JNICALL Java_JniTest_byteMethod(JNIEnv* env, jclass klass, jbyte b1, jbyte b2,
+                                                    jbyte b3, jbyte b4, jbyte b5, jbyte b6,
+                                                    jbyte b7, jbyte b8, jbyte b9, jbyte b10) {
+  // We use b1 to drive the output.
+  assert(b2 == 2);
+  assert(b3 == -3);
+  assert(b4 == 4);
+  assert(b5 == -5);
+  assert(b6 == 6);
+  assert(b7 == -7);
+  assert(b8 == 8);
+  assert(b9 == -9);
+  assert(b10 == 10);
+
+  assert(0 <= b1);
+  assert(b1 < static_cast<jbyte>(kByteReturnSize));
+
+  return byte_returns[b1];
+}
+
+constexpr size_t kShortReturnSize = 9;
+// The static_cast is needed because C++ only guarantees that short reaches down to -32767,
+// while Java's short minimum is -32768.
+jshort short_returns[kShortReturnSize] = { 0, 1, 2, 127, 32767, -1, -2, -128,
+    static_cast<jshort>(0x8000) };
+
+extern "C" jshort JNICALL Java_JniTest_shortMethod(JNIEnv* env, jclass klass, jshort s1, jshort s2,
+                                                    jshort s3, jshort s4, jshort s5, jshort s6,
+                                                    jshort s7, jshort s8, jshort s9, jshort s10) {
+  // We use s1 to drive the output.
+  assert(s2 == 2);
+  assert(s3 == -3);
+  assert(s4 == 4);
+  assert(s5 == -5);
+  assert(s6 == 6);
+  assert(s7 == -7);
+  assert(s8 == 8);
+  assert(s9 == -9);
+  assert(s10 == 10);
+
+  assert(0 <= s1);
+  assert(s1 < static_cast<jshort>(kShortReturnSize));
+
+  return short_returns[s1];
+}
+
+extern "C" jboolean JNICALL Java_JniTest_booleanMethod(JNIEnv* env, jclass klass, jboolean b1,
+                                                       jboolean b2, jboolean b3, jboolean b4,
+                                                       jboolean b5, jboolean b6, jboolean b7,
+                                                       jboolean b8, jboolean b9, jboolean b10) {
+  // We use b1 to drive the output.
+  assert(b2 == JNI_TRUE);
+  assert(b3 == JNI_FALSE);
+  assert(b4 == JNI_TRUE);
+  assert(b5 == JNI_FALSE);
+  assert(b6 == JNI_TRUE);
+  assert(b7 == JNI_FALSE);
+  assert(b8 == JNI_TRUE);
+  assert(b9 == JNI_FALSE);
+  assert(b10 == JNI_TRUE);
+
+  assert(b1 == JNI_TRUE || b1 == JNI_FALSE);
+  return b1;
+}
+
+constexpr size_t kCharReturnSize = 8;
+jchar char_returns[kCharReturnSize] = { 0, 1, 2, 127, 255, 256, 15000, 34000 };
+
+extern "C" jchar JNICALL Java_JniTest_charMethod(JNIEnv* env, jclass klacc, jchar c1, jchar c2,
+                                                    jchar c3, jchar c4, jchar c5, jchar c6,
+                                                    jchar c7, jchar c8, jchar c9, jchar c10) {
+  // We use c1 to drive the output.
+  assert(c2 == 'a');
+  assert(c3 == 'b');
+  assert(c4 == 'c');
+  assert(c5 == '0');
+  assert(c6 == '1');
+  assert(c7 == '2');
+  assert(c8 == 1234);
+  assert(c9 == 2345);
+  assert(c10 == 3456);
+
+  assert(c1 < static_cast<jchar>(kCharReturnSize));
+
+  return char_returns[c1];
+}
diff --git a/tools/art b/tools/art
index c9c0d4f..e3f409c 100755
--- a/tools/art
+++ b/tools/art
@@ -48,7 +48,7 @@
 ANDROID_HOST_OUT=$PROG_DIR/..
 ANDROID_DATA=$PWD/android-data$$
 
-mkdir -p $ANDROID_DATA/dalvik-cache
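+# Create per-ISA dalvik-cache subdirectories (brace expansion assumes a bash-compatible shell).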
+mkdir -p $ANDROID_DATA/dalvik-cache/{x86,x86_64}
 ANDROID_DATA=$ANDROID_DATA \
   ANDROID_ROOT=$ANDROID_HOST_OUT \
   LD_LIBRARY_PATH=$ANDROID_HOST_OUT/lib \