144 files changed, 2256 insertions, 1130 deletions
diff --git a/build/Android.common.mk b/build/Android.common.mk index 05224569b3..f58aabc0b5 100644 --- a/build/Android.common.mk +++ b/build/Android.common.mk @@ -173,22 +173,24 @@ $(error Required DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES is not set) endif ART_TARGET_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=$(DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES) -# Enable thread-safety for GCC 4.6 on the target but not for GCC 4.7 where this feature was removed. +# Enable thread-safety for GCC 4.6, and clang, but not for GCC 4.7 or later where this feature was +# removed. Warn when -Wthread-safety is not used. ifneq ($(filter 4.6 4.6.%, $(TARGET_GCC_VERSION)),) ART_TARGET_CFLAGS += -Wthread-safety else - # Warn if not using GCC 4.6 for target builds when not doing a top-level or 'mma' build. - ifneq ($(ONE_SHOT_MAKEFILE),) - # Enable target GCC 4.6 with: export TARGET_GCC_VERSION_EXP=4.6 - $(info Using target GCC $(TARGET_GCC_VERSION) disables thread-safety checks.) + ifeq ($(ART_TARGET_CLANG),true) + ART_TARGET_CFLAGS += -Wthread-safety + else + # Warn if -Wthread-safety is not suport and not doing a top-level or 'mma' build. + ifneq ($(ONE_SHOT_MAKEFILE),) + # Enable target GCC 4.6 with: export TARGET_GCC_VERSION_EXP=4.6 + $(info Using target GCC $(TARGET_GCC_VERSION) disables thread-safety checks.) + endif endif endif -# We build with GCC 4.6 on the host. +# We compile with GCC 4.6 or clang on the host, both of which support -Wthread-safety. ART_HOST_CFLAGS += -Wthread-safety -# Make host builds easier to debug and profile by not omitting the frame pointer. -ART_HOST_CFLAGS += -fno-omit-frame-pointer - # To use oprofile_android --callgraph, uncomment this and recompile with "mmm art -B -j16" # ART_TARGET_CFLAGS += -fno-omit-frame-pointer -marm -mapcs diff --git a/compiler/Android.mk b/compiler/Android.mk index 499f23f6a5..fdc854016f 100644 --- a/compiler/Android.mk +++ b/compiler/Android.mk @@ -86,6 +86,7 @@ LIBART_COMPILER_SRC_FILES := \ utils/mips/managed_register_mips.cc \ utils/x86/assembler_x86.cc \ utils/x86/managed_register_x86.cc \ + utils/scoped_arena_allocator.cc \ buffered_output_stream.cc \ compiler_backend.cc \ elf_fixup.cc \ @@ -260,12 +261,6 @@ $$(ENUM_OPERATOR_OUT_GEN): $$(GENERATED_SRC_DIR)/%_operator_out.cc : $(LOCAL_PAT endef -ifeq ($(ART_BUILD_TARGET_NDEBUG),true) - $(eval $(call build-libart-compiler,target,ndebug)) -endif -ifeq ($(ART_BUILD_TARGET_DEBUG),true) - $(eval $(call build-libart-compiler,target,debug)) -endif ifeq ($(WITH_HOST_DALVIK),true) # We always build dex2oat and dependencies, even if the host build is otherwise disabled, since they are used to cross compile for the target. ifeq ($(ART_BUILD_NDEBUG),true) @@ -275,6 +270,12 @@ ifeq ($(WITH_HOST_DALVIK),true) $(eval $(call build-libart-compiler,host,debug)) endif endif +ifeq ($(ART_BUILD_TARGET_NDEBUG),true) + $(eval $(call build-libart-compiler,target,ndebug)) +endif +ifeq ($(ART_BUILD_TARGET_DEBUG),true) + $(eval $(call build-libart-compiler,target,debug)) +endif # Rule to build /system/lib/libcompiler_rt.a # Usually static libraries are not installed on the device. diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h index bca72b872b..def7b681dc 100644 --- a/compiler/common_compiler_test.h +++ b/compiler/common_compiler_test.h @@ -35,9 +35,9 @@ namespace art { // A signal handler called when have an illegal instruction. We record the fact in // a global boolean and then increment the PC in the signal context to return to // the next instruction. 
We know the instruction is an sdiv (4 bytes long). -static void baddivideinst(int signo, siginfo *si, void *data) { - (void)signo; - (void)si; +static inline void baddivideinst(int signo, siginfo *si, void *data) { + UNUSED(signo); + UNUSED(si); struct ucontext *uc = (struct ucontext *)data; struct sigcontext *sc = &uc->uc_mcontext; sc->arm_r0 = 0; // set R0 to #0 to signal error @@ -56,7 +56,7 @@ static void baddivideinst(int signo, siginfo *si, void *data) { extern "C" bool CheckForARMSDIVInstruction(); -static InstructionSetFeatures GuessInstructionFeatures() { +static inline InstructionSetFeatures GuessInstructionFeatures() { InstructionSetFeatures f; // Uncomment this for processing of /proc/cpuinfo. @@ -107,7 +107,7 @@ static InstructionSetFeatures GuessInstructionFeatures() { // Given a set of instruction features from the build, parse it. The // input 'str' is a comma separated list of feature names. Parse it and // return the InstructionSetFeatures object. -static InstructionSetFeatures ParseFeatureList(std::string str) { +static inline InstructionSetFeatures ParseFeatureList(std::string str) { InstructionSetFeatures result; typedef std::vector<std::string> FeatureList; FeatureList features; diff --git a/compiler/dex/bit_vector_block_iterator.h b/compiler/dex/bit_vector_block_iterator.h index 0821e9e238..0f1c2b6756 100644 --- a/compiler/dex/bit_vector_block_iterator.h +++ b/compiler/dex/bit_vector_block_iterator.h @@ -44,7 +44,7 @@ class BitVectorBlockIterator { BasicBlock* Next(); void* operator new(size_t size, ArenaAllocator* arena) { - return arena->Alloc(size, ArenaAllocator::kAllocGrowableArray); + return arena->Alloc(size, kArenaAllocGrowableArray); }; void operator delete(void* p) {} // Nop. diff --git a/compiler/dex/compiler_ir.h b/compiler/dex/compiler_ir.h index ee880417ac..c71f0473f1 100644 --- a/compiler/dex/compiler_ir.h +++ b/compiler/dex/compiler_ir.h @@ -25,6 +25,7 @@ #include "driver/compiler_driver.h" #include "driver/dex_compilation_unit.h" #include "safe_map.h" +#include "utils/scoped_arena_allocator.h" #include "base/timing_logger.h" #include "utils/arena_allocator.h" @@ -82,6 +83,7 @@ struct CompilationUnit { // TODO: move memory management to mir_graph, or just switch to using standard containers. ArenaAllocator arena; + ArenaStack arena_stack; // Arenas for ScopedArenaAllocator. UniquePtr<MIRGraph> mir_graph; // MIR container. UniquePtr<Backend> cg; // Target-specific codegen. 
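The new arena_stack member added to CompilationUnit above backs the ScopedArenaAllocator that later hunks in this patch start using. A minimal sketch of the intended usage pattern, mirroring the call sites added in mir_analysis.cc and local_value_numbering.h further down; the function name and include paths here are assumptions for illustration, while Alloc() with an ArenaAllocKind tag and Adapter() for STL containers are taken from the patch itself:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <set>

#include "dex/compiler_ir.h"               // Assumed path for CompilationUnit.
#include "utils/scoped_arena_allocator.h"  // Header added by this patch.

namespace art {

// Illustrative only: shows the intended pattern, not code from the patch.
void ExampleScopedArenaUse(CompilationUnit* cu, size_t num_refs) {
  if (num_refs == 0u) {
    return;
  }
  // Stack-lifetime allocator drawing on the per-CompilationUnit ArenaStack;
  // everything it hands out is reclaimed when it goes out of scope.
  ScopedArenaAllocator allocator(&cu->arena_stack);

  // Raw allocations are tagged with an ArenaAllocKind for the memory stats.
  uint16_t* field_idxs = reinterpret_cast<uint16_t*>(
      allocator.Alloc(num_refs * sizeof(uint16_t), kArenaAllocMisc));
  field_idxs[0] = 0u;

  // STL containers can share the same scoped arena via Adapter(), as the new
  // SafeMap/std::set typedefs in LocalValueNumbering do.
  std::set<uint16_t, std::less<uint16_t>,
           ScopedArenaAllocatorAdapter<uint16_t> > value_names(
      std::less<uint16_t>(), allocator.Adapter());
  value_names.insert(field_idxs[0]);
}

}  // namespace art

The same pattern appears in DoCacheFieldLoweringInfo() below, which replaces the old stack-buffer-plus-heap-fallback logic with a single scoped-arena allocation sized from insns_size_in_code_units_.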
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc index b55b4715eb..1c2d16f6ca 100644 --- a/compiler/dex/frontend.cc +++ b/compiler/dex/frontend.cc @@ -98,6 +98,7 @@ CompilationUnit::CompilationUnit(ArenaPool* pool) num_regs(0), compiler_flip_match(false), arena(pool), + arena_stack(pool), mir_graph(NULL), cg(NULL), timings("QuickCompiler", true, false) { @@ -247,9 +248,12 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, } if (cu.enable_debug & (1 << kDebugShowMemoryUsage)) { - if (cu.arena.BytesAllocated() > (5 * 1024 *1024)) { - MemStats mem_stats(cu.arena); - LOG(INFO) << PrettyMethod(method_idx, dex_file) << " " << Dumpable<MemStats>(mem_stats); + if (cu.arena.BytesAllocated() > (1 * 1024 *1024) || + cu.arena_stack.PeakBytesAllocated() > 256 * 1024) { + MemStats mem_stats(cu.arena.GetMemStats()); + MemStats peak_stats(cu.arena_stack.GetPeakStats()); + LOG(INFO) << PrettyMethod(method_idx, dex_file) << " " << Dumpable<MemStats>(mem_stats) + << Dumpable<MemStats>(peak_stats); } } diff --git a/compiler/dex/local_value_numbering.h b/compiler/dex/local_value_numbering.h index 348bedcc75..535b613ba1 100644 --- a/compiler/dex/local_value_numbering.h +++ b/compiler/dex/local_value_numbering.h @@ -18,6 +18,8 @@ #define ART_COMPILER_DEX_LOCAL_VALUE_NUMBERING_H_ #include "compiler_internals.h" +#include "UniquePtr.h" +#include "utils/scoped_arena_allocator.h" #define NO_VALUE 0xffff #define ARRAY_REF 0xfffe @@ -73,28 +75,26 @@ class LocalValueNumbering { }; // Key is s_reg, value is value name. - typedef SafeMap<uint16_t, uint16_t> SregValueMap; + typedef SafeMap<uint16_t, uint16_t, std::less<uint16_t>, + ScopedArenaAllocatorAdapter<std::pair<uint16_t, uint16_t> > > SregValueMap; // Key is concatenation of opcode, operand1, operand2 and modifier, value is value name. - typedef SafeMap<uint64_t, uint16_t> ValueMap; + typedef SafeMap<uint64_t, uint16_t, std::less<uint64_t>, + ScopedArenaAllocatorAdapter<std::pair<uint64_t, uint16_t> > > ValueMap; // Key represents a memory address, value is generation. - typedef SafeMap<MemoryVersionKey, uint16_t, MemoryVersionKeyComparator> MemoryVersionMap; + typedef SafeMap<MemoryVersionKey, uint16_t, MemoryVersionKeyComparator, + ScopedArenaAllocatorAdapter<std::pair<MemoryVersionKey, uint16_t> > > MemoryVersionMap; // Maps field key to field id for resolved fields. - typedef SafeMap<FieldReference, uint32_t, FieldReferenceComparator> FieldIndexMap; + typedef SafeMap<FieldReference, uint32_t, FieldReferenceComparator, + ScopedArenaAllocatorAdapter<std::pair<FieldReference, uint16_t> > > FieldIndexMap; + // A set of value names. 
+ typedef std::set<uint16_t, std::less<uint16_t>, + ScopedArenaAllocatorAdapter<uint16_t> > ValueNameSet; public: - explicit LocalValueNumbering(CompilationUnit* cu) - : cu_(cu), - sreg_value_map_(), - sreg_wide_value_map_(), - value_map_(), - next_memory_version_(1u), - global_memory_version_(0u), - memory_version_map_(), - field_index_map_(), - non_aliasing_refs_(), - null_checked_() { - std::fill_n(unresolved_sfield_version_, kFieldTypeCount, 0u); - std::fill_n(unresolved_ifield_version_, kFieldTypeCount, 0u); + static LocalValueNumbering* Create(CompilationUnit* cu) { + UniquePtr<ScopedArenaAllocator> allocator(ScopedArenaAllocator::Create(&cu->arena_stack)); + void* addr = allocator->Alloc(sizeof(LocalValueNumbering), kArenaAllocMisc); + return new(addr) LocalValueNumbering(cu, allocator.release()); } static uint64_t BuildKey(uint16_t op, uint16_t operand1, uint16_t operand2, uint16_t modifier) { @@ -167,7 +167,26 @@ class LocalValueNumbering { uint16_t GetValueNumber(MIR* mir); + // Allow delete-expression to destroy a LocalValueNumbering object without deallocation. + static void operator delete(void* ptr) { UNUSED(ptr); } + private: + LocalValueNumbering(CompilationUnit* cu, ScopedArenaAllocator* allocator) + : cu_(cu), + allocator_(allocator), + sreg_value_map_(std::less<uint16_t>(), allocator->Adapter()), + sreg_wide_value_map_(std::less<uint16_t>(), allocator->Adapter()), + value_map_(std::less<uint64_t>(), allocator->Adapter()), + next_memory_version_(1u), + global_memory_version_(0u), + memory_version_map_(MemoryVersionKeyComparator(), allocator->Adapter()), + field_index_map_(FieldReferenceComparator(), allocator->Adapter()), + non_aliasing_refs_(std::less<uint16_t>(), allocator->Adapter()), + null_checked_(std::less<uint16_t>(), allocator->Adapter()) { + std::fill_n(unresolved_sfield_version_, kFieldTypeCount, 0u); + std::fill_n(unresolved_ifield_version_, kFieldTypeCount, 0u); + } + uint16_t GetFieldId(const DexFile* dex_file, uint16_t field_idx); void AdvanceGlobalMemory(); uint16_t GetMemoryVersion(uint16_t base, uint16_t field, uint16_t type); @@ -179,6 +198,7 @@ class LocalValueNumbering { void HandlePutObject(MIR* mir); CompilationUnit* const cu_; + UniquePtr<ScopedArenaAllocator> allocator_; SregValueMap sreg_value_map_; SregValueMap sreg_wide_value_map_; ValueMap value_map_; @@ -189,8 +209,10 @@ class LocalValueNumbering { MemoryVersionMap memory_version_map_; FieldIndexMap field_index_map_; // Value names of references to objects that cannot be reached through a different value name. 
- std::set<uint16_t> non_aliasing_refs_; - std::set<uint16_t> null_checked_; + ValueNameSet non_aliasing_refs_; + ValueNameSet null_checked_; + + DISALLOW_COPY_AND_ASSIGN(LocalValueNumbering); }; } // namespace art diff --git a/compiler/dex/local_value_numbering_test.cc b/compiler/dex/local_value_numbering_test.cc index 4599612db6..ebac871b2d 100644 --- a/compiler/dex/local_value_numbering_test.cc +++ b/compiler/dex/local_value_numbering_test.cc @@ -120,7 +120,7 @@ class LocalValueNumberingTest : public testing::Test { void DoPrepareMIRs(const MIRDef* defs, size_t count) { mir_count_ = count; - mirs_ = reinterpret_cast<MIR*>(cu_.arena.Alloc(sizeof(MIR) * count, ArenaAllocator::kAllocMIR)); + mirs_ = reinterpret_cast<MIR*>(cu_.arena.Alloc(sizeof(MIR) * count, kArenaAllocMIR)); ssa_reps_.resize(count); for (size_t i = 0u; i != count; ++i) { const MIRDef* def = &defs[i]; @@ -162,11 +162,16 @@ class LocalValueNumberingTest : public testing::Test { void PerformLVN() { value_names_.resize(mir_count_); for (size_t i = 0; i != mir_count_; ++i) { - value_names_[i] = lvn_.GetValueNumber(&mirs_[i]); + value_names_[i] = lvn_->GetValueNumber(&mirs_[i]); } } - LocalValueNumberingTest() : pool_(), cu_(&pool_), mir_count_(0u), mirs_(nullptr), lvn_(&cu_) { + LocalValueNumberingTest() + : pool_(), + cu_(&pool_), + mir_count_(0u), + mirs_(nullptr), + lvn_(LocalValueNumbering::Create(&cu_)) { cu_.mir_graph.reset(new MIRGraph(&cu_, &cu_.arena)); } @@ -176,7 +181,7 @@ class LocalValueNumberingTest : public testing::Test { MIR* mirs_; std::vector<SSARepresentation> ssa_reps_; std::vector<uint16_t> value_names_; - LocalValueNumbering lvn_; + UniquePtr<LocalValueNumbering> lvn_; }; TEST_F(LocalValueNumberingTest, TestIGetIGetInvokeIGet) { diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc index d159f49b3e..667ee267ea 100644 --- a/compiler/dex/mir_analysis.cc +++ b/compiler/dex/mir_analysis.cc @@ -1095,16 +1095,15 @@ bool MIRGraph::SkipCompilation() { } void MIRGraph::DoCacheFieldLoweringInfo() { - // Try to use stack-allocated array, resort to heap if we exceed the initial size. - static constexpr size_t kInitialSize = 32; - uint16_t stack_idxs[kInitialSize]; - UniquePtr<uint16_t[]> allocated_idxs; - uint16_t* field_idxs = stack_idxs; - size_t size = kInitialSize; + // All IGET/IPUT/SGET/SPUT instructions take 2 code units and there must also be a RETURN. + const uint32_t max_refs = (current_code_item_->insns_size_in_code_units_ - 1u) / 2u; + ScopedArenaAllocator allocator(&cu_->arena_stack); + uint16_t* field_idxs = + reinterpret_cast<uint16_t*>(allocator.Alloc(max_refs * sizeof(uint16_t), kArenaAllocMisc)); // Find IGET/IPUT/SGET/SPUT insns, store IGET/IPUT fields at the beginning, SGET/SPUT at the end. size_t ifield_pos = 0u; - size_t sfield_pos = size; + size_t sfield_pos = max_refs; AllNodesIterator iter(this); for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) { if (bb->block_type != kDalvikByteCode) { @@ -1113,14 +1112,12 @@ void MIRGraph::DoCacheFieldLoweringInfo() { for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) { if (mir->dalvikInsn.opcode >= Instruction::IGET && mir->dalvikInsn.opcode <= Instruction::SPUT_SHORT) { - bool need_alloc = false; const Instruction* insn = Instruction::At(current_code_item_->insns_ + mir->offset); - uint16_t field_idx; // Get field index and try to find it among existing indexes. If found, it's usually among // the last few added, so we'll start the search from ifield_pos/sfield_pos. 
Though this // is a linear search, it actually performs much better than map based approach. if (mir->dalvikInsn.opcode <= Instruction::IPUT_SHORT) { - field_idx = insn->VRegC_22c(); + uint16_t field_idx = insn->VRegC_22c(); size_t i = ifield_pos; while (i != 0u && field_idxs[i - 1] != field_idx) { --i; @@ -1129,44 +1126,18 @@ void MIRGraph::DoCacheFieldLoweringInfo() { mir->meta.ifield_lowering_info = i - 1; } else { mir->meta.ifield_lowering_info = ifield_pos; - if (UNLIKELY(ifield_pos == sfield_pos)) { - need_alloc = true; - } else { - field_idxs[ifield_pos++] = field_idx; - } + field_idxs[ifield_pos++] = field_idx; } } else { - field_idx = insn->VRegB_21c(); + uint16_t field_idx = insn->VRegB_21c(); size_t i = sfield_pos; - while (i != size && field_idxs[i] != field_idx) { + while (i != max_refs && field_idxs[i] != field_idx) { ++i; } - if (i != size) { - mir->meta.sfield_lowering_info = size - i - 1u; - } else { - mir->meta.sfield_lowering_info = size - sfield_pos; - if (UNLIKELY(ifield_pos == sfield_pos)) { - need_alloc = true; - } else { - field_idxs[--sfield_pos] = field_idx; - } - } - } - if (UNLIKELY(need_alloc)) { - DCHECK(field_idxs == stack_idxs); - // All IGET/IPUT/SGET/SPUT instructions take 2 code units and there must also be a RETURN. - uint32_t max_refs = (current_code_item_->insns_size_in_code_units_ - 1u) / 2u; - allocated_idxs.reset(new uint16_t[max_refs]); - field_idxs = allocated_idxs.get(); - size_t sfield_count = size - sfield_pos; - sfield_pos = max_refs - sfield_count; - size = max_refs; - memcpy(field_idxs, stack_idxs, ifield_pos * sizeof(field_idxs[0])); - memcpy(field_idxs + sfield_pos, stack_idxs + ifield_pos, - sfield_count * sizeof(field_idxs[0])); - if (mir->dalvikInsn.opcode <= Instruction::IPUT_SHORT) { - field_idxs[ifield_pos++] = field_idx; + if (i != max_refs) { + mir->meta.sfield_lowering_info = max_refs - i - 1u; } else { + mir->meta.sfield_lowering_info = max_refs - sfield_pos; field_idxs[--sfield_pos] = field_idx; } } @@ -1186,16 +1157,16 @@ void MIRGraph::DoCacheFieldLoweringInfo() { ifield_lowering_infos_.GetRawStorage(), ifield_pos); } - if (sfield_pos != size) { + if (sfield_pos != max_refs) { // Resolve static field infos. 
DCHECK_EQ(sfield_lowering_infos_.Size(), 0u); - sfield_lowering_infos_.Resize(size - sfield_pos); - for (size_t pos = size; pos != sfield_pos;) { + sfield_lowering_infos_.Resize(max_refs - sfield_pos); + for (size_t pos = max_refs; pos != sfield_pos;) { --pos; sfield_lowering_infos_.Insert(MirSFieldLoweringInfo(field_idxs[pos])); } MirSFieldLoweringInfo::Resolve(cu_->compiler_driver, GetCurrentDexCompilationUnit(), - sfield_lowering_infos_.GetRawStorage(), size - sfield_pos); + sfield_lowering_infos_.GetRawStorage(), max_refs - sfield_pos); } } diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc index 96804503fe..c3954fe3d7 100644 --- a/compiler/dex/mir_dataflow.cc +++ b/compiler/dex/mir_dataflow.cc @@ -955,10 +955,10 @@ void MIRGraph::DataFlowSSAFormat35C(MIR* mir) { mir->ssa_rep->num_uses = num_uses; mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); // NOTE: will be filled in during type & size inference pass mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); for (i = 0; i < num_uses; i++) { HandleSSAUse(mir->ssa_rep->uses, d_insn->arg[i], i); @@ -973,10 +973,10 @@ void MIRGraph::DataFlowSSAFormat3RC(MIR* mir) { mir->ssa_rep->num_uses = num_uses; mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); // NOTE: will be filled in during type & size inference pass mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); for (i = 0; i < num_uses; i++) { HandleSSAUse(mir->ssa_rep->uses, d_insn->vC+i, i); @@ -992,7 +992,7 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { for (mir = bb->first_mir_insn; mir != NULL; mir = mir->next) { mir->ssa_rep = static_cast<struct SSARepresentation *>(arena_->Alloc(sizeof(SSARepresentation), - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode]; @@ -1042,9 +1042,9 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { if (num_uses) { mir->ssa_rep->num_uses = num_uses; mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); } int num_defs = 0; @@ -1059,9 +1059,9 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { if (num_defs) { mir->ssa_rep->num_defs = num_defs; mir->ssa_rep->defs = static_cast<int*>(arena_->Alloc(sizeof(int) * num_defs, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); mir->ssa_rep->fp_def = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_defs, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); } DecodedInstruction *d_insn = &mir->dalvikInsn; @@ -1110,7 +1110,7 @@ bool MIRGraph::DoSSAConversion(BasicBlock* bb) { */ bb->data_flow_info->vreg_to_ssa_map = static_cast<int*>(arena_->Alloc(sizeof(int) * cu_->num_dalvik_registers, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); memcpy(bb->data_flow_info->vreg_to_ssa_map, vreg_to_ssa_map_, sizeof(int) * cu_->num_dalvik_registers); @@ -1147,11 +1147,11 @@ void MIRGraph::CompilerInitializeSSAConversion() { */ vreg_to_ssa_map_ = static_cast<int*>(arena_->Alloc(sizeof(int) * num_dalvik_reg, - ArenaAllocator::kAllocDFInfo)); + 
kArenaAllocDFInfo)); /* Keep track of the higest def for each dalvik reg */ ssa_last_defs_ = static_cast<int*>(arena_->Alloc(sizeof(int) * num_dalvik_reg, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); for (unsigned int i = 0; i < num_dalvik_reg; i++) { vreg_to_ssa_map_[i] = i; @@ -1175,7 +1175,7 @@ void MIRGraph::CompilerInitializeSSAConversion() { bb->block_type == kExitBlock) { bb->data_flow_info = static_cast<BasicBlockDataFlow*>(arena_->Alloc(sizeof(BasicBlockDataFlow), - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); } } } diff --git a/compiler/dex/mir_field_info.cc b/compiler/dex/mir_field_info.cc index 96eda01d1e..7c630e8229 100644 --- a/compiler/dex/mir_field_info.cc +++ b/compiler/dex/mir_field_info.cc @@ -24,7 +24,7 @@ #include "mirror/class_loader.h" // Only to allow casts in SirtRef<ClassLoader>. #include "mirror/dex_cache.h" // Only to allow casts in SirtRef<DexCache>. #include "scoped_thread_state_change.h" -#include "sirt_ref.h" +#include "sirt_ref-inl.h" namespace art { diff --git a/compiler/dex/mir_field_info.h b/compiler/dex/mir_field_info.h index 41cb4cee14..e64e9fcf83 100644 --- a/compiler/dex/mir_field_info.h +++ b/compiler/dex/mir_field_info.h @@ -100,7 +100,7 @@ class MirFieldInfo { class MirIFieldLoweringInfo : public MirFieldInfo { public: // For each requested instance field retrieve the field's declaring location (dex file, class - // index and field index) and volatility and compute the whether we can fast path the access + // index and field index) and volatility and compute whether we can fast path the access // with IGET/IPUT. For fast path fields, retrieve the field offset. static void Resolve(CompilerDriver* compiler_driver, const DexCompilationUnit* mUnit, MirIFieldLoweringInfo* field_infos, size_t count) @@ -143,7 +143,7 @@ class MirIFieldLoweringInfo : public MirFieldInfo { class MirSFieldLoweringInfo : public MirFieldInfo { public: // For each requested static field retrieve the field's declaring location (dex file, class - // index and field index) and volatility and compute the whether we can fast path the access with + // index and field index) and volatility and compute whether we can fast path the access with // IGET/IPUT. For fast path fields (at least for IGET), retrieve the information needed for // the field access, i.e. the field offset, whether the field is in the same class as the // method being compiled, whether the declaring class can be safely assumed to be initialized diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index 46e854fb2b..868730fc37 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -411,7 +411,7 @@ BasicBlock* MIRGraph::ProcessCanSwitch(BasicBlock* cur_block, MIR* insn, DexOffs /* create */ true, /* immed_pred_block_p */ &cur_block); SuccessorBlockInfo *successor_block_info = static_cast<SuccessorBlockInfo*>(arena_->Alloc(sizeof(SuccessorBlockInfo), - ArenaAllocator::kAllocSuccessor)); + kArenaAllocSuccessor)); successor_block_info->block = case_block->id; successor_block_info->key = (insn->dalvikInsn.opcode == Instruction::PACKED_SWITCH) ? 
@@ -459,7 +459,7 @@ BasicBlock* MIRGraph::ProcessCanThrow(BasicBlock* cur_block, MIR* insn, DexOffse catches_.insert(catch_block->start_offset); } SuccessorBlockInfo *successor_block_info = reinterpret_cast<SuccessorBlockInfo*> - (arena_->Alloc(sizeof(SuccessorBlockInfo), ArenaAllocator::kAllocSuccessor)); + (arena_->Alloc(sizeof(SuccessorBlockInfo), kArenaAllocSuccessor)); successor_block_info->block = catch_block->id; successor_block_info->key = iterator.GetHandlerTypeIndex(); cur_block->successor_blocks->Insert(successor_block_info); @@ -518,7 +518,7 @@ BasicBlock* MIRGraph::ProcessCanThrow(BasicBlock* cur_block, MIR* insn, DexOffse new_block->start_offset = insn->offset; cur_block->fall_through = new_block->id; new_block->predecessors->Insert(cur_block->id); - MIR* new_insn = static_cast<MIR*>(arena_->Alloc(sizeof(MIR), ArenaAllocator::kAllocMIR)); + MIR* new_insn = static_cast<MIR*>(arena_->Alloc(sizeof(MIR), kArenaAllocMIR)); *new_insn = *insn; insn->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpCheck); @@ -602,7 +602,7 @@ void MIRGraph::InlineMethod(const DexFile::CodeItem* code_item, uint32_t access_ /* Parse all instructions and put them into containing basic blocks */ while (code_ptr < code_end) { - MIR *insn = static_cast<MIR *>(arena_->Alloc(sizeof(MIR), ArenaAllocator::kAllocMIR)); + MIR *insn = static_cast<MIR *>(arena_->Alloc(sizeof(MIR), kArenaAllocMIR)); insn->offset = current_offset_; insn->m_unit_index = current_method_; int width = ParseInsn(code_ptr, &insn->dalvikInsn); @@ -1042,7 +1042,7 @@ char* MIRGraph::GetDalvikDisassembly(const MIR* mir) { str.append("]--optimized away"); } int length = str.length() + 1; - ret = static_cast<char*>(arena_->Alloc(length, ArenaAllocator::kAllocDFInfo)); + ret = static_cast<char*>(arena_->Alloc(length, kArenaAllocDFInfo)); strncpy(ret, str.c_str(), length); return ret; } @@ -1157,7 +1157,7 @@ void MIRGraph::DumpMIRGraph() { CallInfo* MIRGraph::NewMemCallInfo(BasicBlock* bb, MIR* mir, InvokeType type, bool is_range) { CallInfo* info = static_cast<CallInfo*>(arena_->Alloc(sizeof(CallInfo), - ArenaAllocator::kAllocMisc)); + kArenaAllocMisc)); MIR* move_result_mir = FindMoveResult(bb, mir); if (move_result_mir == NULL) { info->result.location = kLocInvalid; @@ -1167,7 +1167,7 @@ CallInfo* MIRGraph::NewMemCallInfo(BasicBlock* bb, MIR* mir, InvokeType type, } info->num_arg_words = mir->ssa_rep->num_uses; info->args = (info->num_arg_words == 0) ? NULL : static_cast<RegLocation*> - (arena_->Alloc(sizeof(RegLocation) * info->num_arg_words, ArenaAllocator::kAllocMisc)); + (arena_->Alloc(sizeof(RegLocation) * info->num_arg_words, kArenaAllocMisc)); for (int i = 0; i < info->num_arg_words; i++) { info->args[i] = GetRawSrc(mir, i); } @@ -1182,7 +1182,7 @@ CallInfo* MIRGraph::NewMemCallInfo(BasicBlock* bb, MIR* mir, InvokeType type, // Allocate a new basic block. BasicBlock* MIRGraph::NewMemBB(BBType block_type, int block_id) { BasicBlock* bb = static_cast<BasicBlock*>(arena_->Alloc(sizeof(BasicBlock), - ArenaAllocator::kAllocBB)); + kArenaAllocBB)); bb->block_type = block_type; bb->id = block_id; // TUNING: better estimate of the exit block predecessors? 
@@ -1196,7 +1196,7 @@ BasicBlock* MIRGraph::NewMemBB(BBType block_type, int block_id) { void MIRGraph::InitializeConstantPropagation() { is_constant_v_ = new (arena_) ArenaBitVector(arena_, GetNumSSARegs(), false); - constant_values_ = static_cast<int*>(arena_->Alloc(sizeof(int) * GetNumSSARegs(), ArenaAllocator::kAllocDFInfo)); + constant_values_ = static_cast<int*>(arena_->Alloc(sizeof(int) * GetNumSSARegs(), kArenaAllocDFInfo)); } void MIRGraph::InitializeMethodUses() { diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h index 1eb9ef9bef..85d6d894b0 100644 --- a/compiler/dex/mir_graph.h +++ b/compiler/dex/mir_graph.h @@ -457,7 +457,7 @@ class MIRGraph { void EnableOpcodeCounting() { opcode_count_ = static_cast<int*>(arena_->Alloc(kNumPackedOpcodes * sizeof(int), - ArenaAllocator::kAllocMisc)); + kArenaAllocMisc)); } void ShowOpcodeStats(); diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc index 243452e968..03fc091e4d 100644 --- a/compiler/dex/mir_optimization.cc +++ b/compiler/dex/mir_optimization.cc @@ -245,7 +245,7 @@ CompilerTemp* MIRGraph::GetNewCompilerTemp(CompilerTempType ct_type, bool wide) } CompilerTemp *compiler_temp = static_cast<CompilerTemp *>(arena_->Alloc(sizeof(CompilerTemp), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); // Create the type of temp requested. Special temps need special handling because // they have a specific virtual register assignment. @@ -313,7 +313,7 @@ bool MIRGraph::BasicBlockOpt(BasicBlock* bb) { bool use_lvn = bb->use_lvn; UniquePtr<LocalValueNumbering> local_valnum; if (use_lvn) { - local_valnum.reset(new LocalValueNumbering(cu_)); + local_valnum.reset(LocalValueNumbering::Create(cu_)); } while (bb != NULL) { for (MIR* mir = bb->first_mir_insn; mir != NULL; mir = mir->next) { @@ -479,7 +479,7 @@ bool MIRGraph::BasicBlockOpt(BasicBlock* bb) { DCHECK_EQ(SelectKind(if_true), kSelectMove); DCHECK_EQ(SelectKind(if_false), kSelectMove); int* src_ssa = - static_cast<int*>(arena_->Alloc(sizeof(int) * 3, ArenaAllocator::kAllocDFInfo)); + static_cast<int*>(arena_->Alloc(sizeof(int) * 3, kArenaAllocDFInfo)); src_ssa[0] = mir->ssa_rep->uses[0]; src_ssa[1] = if_true->ssa_rep->uses[0]; src_ssa[2] = if_false->ssa_rep->uses[0]; @@ -488,14 +488,14 @@ bool MIRGraph::BasicBlockOpt(BasicBlock* bb) { } mir->ssa_rep->num_defs = 1; mir->ssa_rep->defs = - static_cast<int*>(arena_->Alloc(sizeof(int) * 1, ArenaAllocator::kAllocDFInfo)); + static_cast<int*>(arena_->Alloc(sizeof(int) * 1, kArenaAllocDFInfo)); mir->ssa_rep->fp_def = - static_cast<bool*>(arena_->Alloc(sizeof(bool) * 1, ArenaAllocator::kAllocDFInfo)); + static_cast<bool*>(arena_->Alloc(sizeof(bool) * 1, kArenaAllocDFInfo)); mir->ssa_rep->fp_def[0] = if_true->ssa_rep->fp_def[0]; // Match type of uses to def. 
mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * mir->ssa_rep->num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); for (int i = 0; i < mir->ssa_rep->num_uses; i++) { mir->ssa_rep->fp_use[i] = mir->ssa_rep->fp_def[0]; } @@ -878,7 +878,7 @@ bool MIRGraph::EliminateNullChecksAndInferTypes(BasicBlock* bb) { void MIRGraph::DumpCheckStats() { Checkstats* stats = - static_cast<Checkstats*>(arena_->Alloc(sizeof(Checkstats), ArenaAllocator::kAllocDFInfo)); + static_cast<Checkstats*>(arena_->Alloc(sizeof(Checkstats), kArenaAllocDFInfo)); checkstats_ = stats; AllNodesIterator iter(this); for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) { diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc index f426055068..0fce5bbb3d 100644 --- a/compiler/dex/quick/arm/call_arm.cc +++ b/compiler/dex/quick/arm/call_arm.cc @@ -50,12 +50,12 @@ void ArmMir2Lir::GenSparseSwitch(MIR* mir, uint32_t table_offset, } // Add the table to the list - we'll process it later SwitchTable *tab_rec = - static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), ArenaAllocator::kAllocData)); + static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint32_t size = table[1]; tab_rec->targets = static_cast<LIR**>(arena_->Alloc(size * sizeof(LIR*), - ArenaAllocator::kAllocLIR)); + kArenaAllocLIR)); switch_tables_.Insert(tab_rec); // Get the switch value @@ -99,12 +99,12 @@ void ArmMir2Lir::GenPackedSwitch(MIR* mir, uint32_t table_offset, } // Add the table to the list - we'll process it later SwitchTable *tab_rec = - static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), ArenaAllocator::kAllocData)); + static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint32_t size = table[1]; tab_rec->targets = - static_cast<LIR**>(arena_->Alloc(size * sizeof(LIR*), ArenaAllocator::kAllocLIR)); + static_cast<LIR**>(arena_->Alloc(size * sizeof(LIR*), kArenaAllocLIR)); switch_tables_.Insert(tab_rec); // Get the switch value @@ -152,7 +152,7 @@ void ArmMir2Lir::GenFillArrayData(uint32_t table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; // Add the table to the list - we'll process it later FillArrayData *tab_rec = - static_cast<FillArrayData*>(arena_->Alloc(sizeof(FillArrayData), ArenaAllocator::kAllocData)); + static_cast<FillArrayData*>(arena_->Alloc(sizeof(FillArrayData), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint16_t width = tab_rec->table[1]; diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc index ab1a053489..01d669b90c 100644 --- a/compiler/dex/quick/arm/target_arm.cc +++ b/compiler/dex/quick/arm/target_arm.cc @@ -554,13 +554,13 @@ void ArmMir2Lir::CompilerInitializeRegAlloc() { int num_fp_regs = sizeof(FpRegs)/sizeof(*FpRegs); int num_fp_temps = sizeof(fp_temps)/sizeof(*fp_temps); reg_pool_ = static_cast<RegisterPool*>(arena_->Alloc(sizeof(*reg_pool_), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); reg_pool_->num_core_regs = num_regs; reg_pool_->core_regs = reinterpret_cast<RegisterInfo*> - (arena_->Alloc(num_regs * sizeof(*reg_pool_->core_regs), ArenaAllocator::kAllocRegAlloc)); + (arena_->Alloc(num_regs * sizeof(*reg_pool_->core_regs), kArenaAllocRegAlloc)); reg_pool_->num_fp_regs = num_fp_regs; reg_pool_->FPRegs = 
static_cast<RegisterInfo*> - (arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), ArenaAllocator::kAllocRegAlloc)); + (arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), kArenaAllocRegAlloc)); CompilerInitPool(reg_pool_->core_regs, core_regs, reg_pool_->num_core_regs); CompilerInitPool(reg_pool_->FPRegs, FpRegs, reg_pool_->num_fp_regs); // Keep special registers from being allocated diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc index 14469b61c3..34d3834682 100644 --- a/compiler/dex/quick/codegen_util.cc +++ b/compiler/dex/quick/codegen_util.cc @@ -358,7 +358,7 @@ LIR* Mir2Lir::ScanLiteralPoolWide(LIR* data_target, int val_lo, int val_hi) { LIR* Mir2Lir::AddWordData(LIR* *constant_list_p, int value) { /* Add the constant to the literal pool */ if (constant_list_p) { - LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), ArenaAllocator::kAllocData)); + LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData)); new_value->operands[0] = value; new_value->next = *constant_list_p; *constant_list_p = new_value; @@ -829,7 +829,7 @@ LIR* Mir2Lir::InsertCaseLabel(DexOffset vaddr, int keyVal) { LIR* res = boundary_lir; if (cu_->verbose) { // Only pay the expense if we're pretty-printing. - LIR* new_label = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), ArenaAllocator::kAllocLIR)); + LIR* new_label = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocLIR)); new_label->dalvik_offset = vaddr; new_label->opcode = kPseudoCaseLabel; new_label->operands[0] = keyVal; diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc index cb424d9169..7423393e13 100644 --- a/compiler/dex/quick/dex_file_method_inliner.cc +++ b/compiler/dex/quick/dex_file_method_inliner.cc @@ -18,7 +18,6 @@ #include "base/macros.h" #include "base/mutex.h" #include "base/mutex-inl.h" -#include "locks.h" #include "thread.h" #include "thread-inl.h" #include "dex/mir_graph.h" diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h index b4d8dd6009..4aff01c066 100644 --- a/compiler/dex/quick/dex_file_method_inliner.h +++ b/compiler/dex/quick/dex_file_method_inliner.h @@ -23,7 +23,6 @@ #include "safe_map.h" #include "dex/compiler_enums.h" #include "dex_file.h" -#include "locks.h" namespace art { diff --git a/compiler/dex/quick/local_optimizations.cc b/compiler/dex/quick/local_optimizations.cc index 7a2dce13dc..6df91e674a 100644 --- a/compiler/dex/quick/local_optimizations.cc +++ b/compiler/dex/quick/local_optimizations.cc @@ -248,7 +248,7 @@ void Mir2Lir::ApplyLoadStoreElimination(LIR* head_lir, LIR* tail_lir) { /* Only sink store instructions */ if (sink_distance && !is_this_lir_load) { LIR* new_store_lir = - static_cast<LIR*>(arena_->Alloc(sizeof(LIR), ArenaAllocator::kAllocLIR)); + static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocLIR)); *new_store_lir = *this_lir; /* * Stop point found - insert *before* the check_lir @@ -445,7 +445,7 @@ void Mir2Lir::ApplyLoadHoisting(LIR* head_lir, LIR* tail_lir) { if (slot >= 0) { LIR* cur_lir = prev_inst_list[slot]; LIR* new_load_lir = - static_cast<LIR*>(arena_->Alloc(sizeof(LIR), ArenaAllocator::kAllocLIR)); + static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocLIR)); *new_load_lir = *this_lir; /* * Insertion is guaranteed to succeed since check_lir diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc index 88f46fd59a..234299e472 100644 --- a/compiler/dex/quick/mips/call_mips.cc 
+++ b/compiler/dex/quick/mips/call_mips.cc @@ -68,12 +68,12 @@ void MipsMir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, } // Add the table to the list - we'll process it later SwitchTable* tab_rec = - static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), ArenaAllocator::kAllocData)); + static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; int elements = table[1]; tab_rec->targets = - static_cast<LIR**>(arena_->Alloc(elements * sizeof(LIR*), ArenaAllocator::kAllocLIR)); + static_cast<LIR**>(arena_->Alloc(elements * sizeof(LIR*), kArenaAllocLIR)); switch_tables_.Insert(tab_rec); // The table is composed of 8-byte key/disp pairs @@ -146,12 +146,12 @@ void MipsMir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, } // Add the table to the list - we'll process it later SwitchTable* tab_rec = - static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), ArenaAllocator::kAllocData)); + static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; int size = table[1]; tab_rec->targets = static_cast<LIR**>(arena_->Alloc(size * sizeof(LIR*), - ArenaAllocator::kAllocLIR)); + kArenaAllocLIR)); switch_tables_.Insert(tab_rec); // Get the switch value @@ -226,7 +226,7 @@ void MipsMir2Lir::GenFillArrayData(DexOffset table_offset, RegLocation rl_src) { // Add the table to the list - we'll process it later FillArrayData* tab_rec = reinterpret_cast<FillArrayData*>(arena_->Alloc(sizeof(FillArrayData), - ArenaAllocator::kAllocData)); + kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint16_t width = tab_rec->table[1]; diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc index 85c250da0f..4f495ee0fc 100644 --- a/compiler/dex/quick/mips/target_mips.cc +++ b/compiler/dex/quick/mips/target_mips.cc @@ -467,13 +467,13 @@ void MipsMir2Lir::CompilerInitializeRegAlloc() { int num_fp_regs = sizeof(FpRegs)/sizeof(*FpRegs); int num_fp_temps = sizeof(fp_temps)/sizeof(*fp_temps); reg_pool_ = static_cast<RegisterPool*>(arena_->Alloc(sizeof(*reg_pool_), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); reg_pool_->num_core_regs = num_regs; reg_pool_->core_regs = static_cast<RegisterInfo*> - (arena_->Alloc(num_regs * sizeof(*reg_pool_->core_regs), ArenaAllocator::kAllocRegAlloc)); + (arena_->Alloc(num_regs * sizeof(*reg_pool_->core_regs), kArenaAllocRegAlloc)); reg_pool_->num_fp_regs = num_fp_regs; reg_pool_->FPRegs = static_cast<RegisterInfo*> - (arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), ArenaAllocator::kAllocRegAlloc)); + (arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), kArenaAllocRegAlloc)); CompilerInitPool(reg_pool_->core_regs, core_regs, reg_pool_->num_core_regs); CompilerInitPool(reg_pool_->FPRegs, FpRegs, reg_pool_->num_fp_regs); // Keep special registers from being allocated diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h index c2d12f6481..8b1f81d47f 100644 --- a/compiler/dex/quick/mir_to_lir-inl.h +++ b/compiler/dex/quick/mir_to_lir-inl.h @@ -45,7 +45,7 @@ inline void Mir2Lir::ClobberBody(RegisterInfo* p) { inline LIR* Mir2Lir::RawLIR(DexOffset dalvik_offset, int opcode, int op0, int op1, int op2, int op3, int op4, LIR* target) { - LIR* insn = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), ArenaAllocator::kAllocLIR)); + LIR* insn = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), 
kArenaAllocLIR)); insn->dalvik_offset = dalvik_offset; insn->opcode = opcode; insn->operands[0] = op0; diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc index d9b241e864..40ed5ef535 100644 --- a/compiler/dex/quick/mir_to_lir.cc +++ b/compiler/dex/quick/mir_to_lir.cc @@ -1066,7 +1066,7 @@ void Mir2Lir::MethodMIR2LIR() { // Hold the labels of each block. block_label_list_ = static_cast<LIR*>(arena_->Alloc(sizeof(LIR) * mir_graph_->GetNumBlocks(), - ArenaAllocator::kAllocLIR)); + kArenaAllocLIR)); PreOrderDfsIterator iter(mir_graph_); BasicBlock* curr_bb = iter.Next(); diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index 9e0e29995e..6955577670 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -311,7 +311,7 @@ class Mir2Lir : public Backend { virtual void Compile() = 0; static void* operator new(size_t size, ArenaAllocator* arena) { - return arena->Alloc(size, ArenaAllocator::kAllocData); + return arena->Alloc(size, kArenaAllocData); } protected: @@ -363,7 +363,7 @@ class Mir2Lir : public Backend { // strdup(), but allocates from the arena. char* ArenaStrdup(const char* str) { size_t len = strlen(str) + 1; - char* res = reinterpret_cast<char*>(arena_->Alloc(len, ArenaAllocator::kAllocMisc)); + char* res = reinterpret_cast<char*>(arena_->Alloc(len, kArenaAllocMisc)); if (res != NULL) { strncpy(res, str, len); } diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc index 3a8942e46e..3cb6fd01c1 100644 --- a/compiler/dex/quick/ralloc_util.cc +++ b/compiler/dex/quick/ralloc_util.cc @@ -907,7 +907,7 @@ void Mir2Lir::DoPromotion() { const int promotion_threshold = 1; // Allocate the promotion map - one entry for each Dalvik vReg or compiler temp promotion_map_ = static_cast<PromotionMap*> - (arena_->Alloc(num_regs * sizeof(promotion_map_[0]), ArenaAllocator::kAllocRegAlloc)); + (arena_->Alloc(num_regs * sizeof(promotion_map_[0]), kArenaAllocRegAlloc)); // Allow target code to add any special registers AdjustSpillMask(); @@ -925,10 +925,10 @@ void Mir2Lir::DoPromotion() { */ RefCounts *core_regs = static_cast<RefCounts*>(arena_->Alloc(sizeof(RefCounts) * num_regs, - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); RefCounts *FpRegs = static_cast<RefCounts *>(arena_->Alloc(sizeof(RefCounts) * num_regs * 2, - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); // Set ssa names for original Dalvik registers for (int i = 0; i < dalvik_regs; i++) { core_regs[i].s_reg = FpRegs[i].s_reg = i; diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc index c92d2bb730..577f216f5e 100644 --- a/compiler/dex/quick/x86/call_x86.cc +++ b/compiler/dex/quick/x86/call_x86.cc @@ -69,12 +69,12 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, } // Add the table to the list - we'll process it later SwitchTable* tab_rec = - static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), ArenaAllocator::kAllocData)); + static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; int size = table[1]; tab_rec->targets = static_cast<LIR**>(arena_->Alloc(size * sizeof(LIR*), - ArenaAllocator::kAllocLIR)); + kArenaAllocLIR)); switch_tables_.Insert(tab_rec); // Get the switch value @@ -134,7 +134,7 @@ void X86Mir2Lir::GenFillArrayData(DexOffset table_offset, RegLocation rl_src) { const uint16_t* table = cu_->insns + current_dalvik_offset_ + table_offset; // Add 
the table to the list - we'll process it later FillArrayData* tab_rec = - static_cast<FillArrayData*>(arena_->Alloc(sizeof(FillArrayData), ArenaAllocator::kAllocData)); + static_cast<FillArrayData*>(arena_->Alloc(sizeof(FillArrayData), kArenaAllocData)); tab_rec->table = table; tab_rec->vaddr = current_dalvik_offset_; uint16_t width = tab_rec->table[1]; diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 78a216923f..083fccb2b4 100644 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -457,15 +457,15 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() { int num_fp_regs = sizeof(FpRegs)/sizeof(*FpRegs); int num_fp_temps = sizeof(fp_temps)/sizeof(*fp_temps); reg_pool_ = static_cast<RegisterPool*>(arena_->Alloc(sizeof(*reg_pool_), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); reg_pool_->num_core_regs = num_regs; reg_pool_->core_regs = static_cast<RegisterInfo*>(arena_->Alloc(num_regs * sizeof(*reg_pool_->core_regs), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); reg_pool_->num_fp_regs = num_fp_regs; reg_pool_->FPRegs = static_cast<RegisterInfo *>(arena_->Alloc(num_fp_regs * sizeof(*reg_pool_->FPRegs), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); CompilerInitPool(reg_pool_->core_regs, core_regs, reg_pool_->num_core_regs); CompilerInitPool(reg_pool_->FPRegs, FpRegs, reg_pool_->num_fp_regs); // Keep special registers from being allocated diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc index 4e258ef7c7..8091528809 100644 --- a/compiler/dex/ssa_transformation.cc +++ b/compiler/dex/ssa_transformation.cc @@ -144,7 +144,7 @@ void MIRGraph::ComputeDefBlockMatrix() { /* Allocate num_dalvik_registers bit vector pointers */ def_block_matrix_ = static_cast<ArenaBitVector**> (arena_->Alloc(sizeof(ArenaBitVector *) * num_registers, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); int i; /* Initialize num_register vectors with num_blocks bits each */ @@ -384,7 +384,7 @@ void MIRGraph::ComputeDominators() { /* Initalize & Clear i_dom_list */ if (i_dom_list_ == NULL) { i_dom_list_ = static_cast<int*>(arena_->Alloc(sizeof(int) * num_reachable_blocks, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); } for (int i = 0; i < num_reachable_blocks; i++) { i_dom_list_[i] = NOTVISITED; @@ -565,7 +565,7 @@ void MIRGraph::InsertPhiNodes() { continue; } MIR *phi = - static_cast<MIR*>(arena_->Alloc(sizeof(MIR), ArenaAllocator::kAllocDFInfo)); + static_cast<MIR*>(arena_->Alloc(sizeof(MIR), kArenaAllocDFInfo)); phi->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpPhi); phi->dalvikInsn.vA = dalvik_reg; phi->offset = phi_bb->start_offset; @@ -593,13 +593,13 @@ bool MIRGraph::InsertPhiNodeOperands(BasicBlock* bb) { size_t num_uses = bb->predecessors->Size(); mir->ssa_rep->num_uses = num_uses; int* uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); mir->ssa_rep->uses = uses; mir->ssa_rep->fp_use = - static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, ArenaAllocator::kAllocDFInfo)); + static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, kArenaAllocDFInfo)); BasicBlockId* incoming = static_cast<BasicBlockId*>(arena_->Alloc(sizeof(BasicBlockId) * num_uses, - ArenaAllocator::kAllocDFInfo)); + kArenaAllocDFInfo)); mir->meta.phi_incoming = incoming; int idx = 0; while (true) { @@ -629,7 +629,7 @@ void MIRGraph::DoDFSPreOrderSSARename(BasicBlock* block) { /* Save SSA map 
snapshot */ int* saved_ssa_map = - static_cast<int*>(arena_->Alloc(map_size, ArenaAllocator::kAllocDalvikToSSAMap)); + static_cast<int*>(arena_->Alloc(map_size, kArenaAllocDalvikToSSAMap)); memcpy(saved_ssa_map, vreg_to_ssa_map_, map_size); if (block->fall_through != NullBasicBlockId) { diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h index aa0e72a5ca..257e70ce93 100644 --- a/compiler/dex/verified_method.h +++ b/compiler/dex/verified_method.h @@ -19,6 +19,7 @@ #include <vector> +#include "base/mutex.h" #include "method_reference.h" #include "safe_map.h" diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc index 4d2c05166b..876973625d 100644 --- a/compiler/dex/vreg_analysis.cc +++ b/compiler/dex/vreg_analysis.cc @@ -410,7 +410,7 @@ void MIRGraph::InitRegLocations() { /* Allocate the location map */ int max_regs = GetNumSSARegs() + GetMaxPossibleCompilerTemps(); RegLocation* loc = static_cast<RegLocation*>(arena_->Alloc(max_regs * sizeof(*loc), - ArenaAllocator::kAllocRegAlloc)); + kArenaAllocRegAlloc)); for (int i = 0; i < GetNumSSARegs(); i++) { loc[i] = fresh_loc; loc[i].s_reg_low = i; diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h index d401398ca4..1499ae4872 100644 --- a/compiler/driver/compiler_driver-inl.h +++ b/compiler/driver/compiler_driver-inl.h @@ -25,6 +25,7 @@ #include "mirror/dex_cache.h" #include "mirror/art_field-inl.h" #include "scoped_thread_state_change.h" +#include "sirt_ref-inl.h" namespace art { diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index fc22addbf1..d3d58c919f 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -49,6 +49,7 @@ #include "mirror/throwable.h" #include "scoped_thread_state_change.h" #include "ScopedLocalRef.h" +#include "sirt_ref-inl.h" #include "thread.h" #include "thread_pool.h" #include "trampolines/trampoline_compiler.h" diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index 80a6796a4e..ac70e5aee0 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -52,6 +52,7 @@ struct InlineIGetIPutData; class OatWriter; class ParallelCompilationManager; class ScopedObjectAccess; +template<class T> class SirtRef; class TimingLogger; class VerificationResults; class VerifiedMethod; diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc index 2b3af6281f..949fade906 100644 --- a/compiler/driver/compiler_driver_test.cc +++ b/compiler/driver/compiler_driver_test.cc @@ -30,6 +30,7 @@ #include "mirror/dex_cache-inl.h" #include "mirror/object_array-inl.h" #include "mirror/object-inl.h" +#include "sirt_ref-inl.h" namespace art { diff --git a/compiler/elf_writer.h b/compiler/elf_writer.h index 3610d1a8b2..03b965acf7 100644 --- a/compiler/elf_writer.h +++ b/compiler/elf_writer.h @@ -23,6 +23,7 @@ #include <vector> #include "base/macros.h" +#include "base/mutex.h" #include "os.h" namespace art { diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc index 74be604c31..f4b507a64a 100644 --- a/compiler/image_writer.cc +++ b/compiler/image_writer.cc @@ -50,7 +50,7 @@ #include "object_utils.h" #include "runtime.h" #include "scoped_thread_state_change.h" -#include "sirt_ref.h" +#include "sirt_ref-inl.h" #include "UniquePtr.h" #include "utils.h" diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc index 5394bbc42e..ffd7b417e3 100644 --- a/compiler/oat_writer.cc +++ 
b/compiler/oat_writer.cc @@ -33,6 +33,7 @@ #include "output_stream.h" #include "safe_map.h" #include "scoped_thread_state_change.h" +#include "sirt_ref-inl.h" #include "verifier/method_verifier.h" namespace art { diff --git a/compiler/trampolines/trampoline_compiler.h b/compiler/trampolines/trampoline_compiler.h index 21245db70d..cb5aa273bd 100644 --- a/compiler/trampolines/trampoline_compiler.h +++ b/compiler/trampolines/trampoline_compiler.h @@ -20,7 +20,6 @@ #include <stdint.h> #include <vector> -#include "locks.h" #include "driver/compiler_driver.h" namespace art { diff --git a/compiler/utils/allocation.h b/compiler/utils/allocation.h index 07cd39788e..b0947cac68 100644 --- a/compiler/utils/allocation.h +++ b/compiler/utils/allocation.h @@ -26,7 +26,7 @@ class ArenaObject { public: // Allocate a new ArenaObject of 'size' bytes in the Arena. void* operator new(size_t size, ArenaAllocator* allocator) { - return allocator->Alloc(size, ArenaAllocator::kAllocMisc); + return allocator->Alloc(size, kArenaAllocMisc); } void operator delete(void*, size_t) { diff --git a/compiler/utils/arena_allocator.cc b/compiler/utils/arena_allocator.cc index 00c3c578df..ca4635d352 100644 --- a/compiler/utils/arena_allocator.cc +++ b/compiler/utils/arena_allocator.cc @@ -14,6 +14,9 @@ * limitations under the License. */ +#include <algorithm> +#include <numeric> + #include "arena_allocator.h" #include "base/logging.h" #include "base/mutex.h" @@ -28,7 +31,8 @@ static constexpr bool kUseMemSet = true && kUseMemMap; static constexpr size_t kValgrindRedZoneBytes = 8; constexpr size_t Arena::kDefaultSize; -static const char* alloc_names[ArenaAllocator::kNumAllocKinds] = { +template <bool kCount> +const char* ArenaAllocatorStatsImpl<kCount>::kAllocNames[kNumArenaAllocKinds] = { "Misc ", "BasicBlock ", "LIR ", @@ -42,8 +46,69 @@ static const char* alloc_names[ArenaAllocator::kNumAllocKinds] = { "RegAlloc ", "Data ", "Preds ", + "STL ", }; +template <bool kCount> +ArenaAllocatorStatsImpl<kCount>::ArenaAllocatorStatsImpl() + : num_allocations_(0u) { + std::fill_n(alloc_stats_, arraysize(alloc_stats_), 0u); +} + +template <bool kCount> +void ArenaAllocatorStatsImpl<kCount>::Copy(const ArenaAllocatorStatsImpl& other) { + num_allocations_ = other.num_allocations_; + std::copy(other.alloc_stats_, other.alloc_stats_ + arraysize(alloc_stats_), alloc_stats_); +} + +template <bool kCount> +void ArenaAllocatorStatsImpl<kCount>::RecordAlloc(size_t bytes, ArenaAllocKind kind) { + alloc_stats_[kind] += bytes; + ++num_allocations_; +} + +template <bool kCount> +size_t ArenaAllocatorStatsImpl<kCount>::NumAllocations() const { + return num_allocations_; +} + +template <bool kCount> +size_t ArenaAllocatorStatsImpl<kCount>::BytesAllocated() const { + const size_t init = 0u; // Initial value of the correct type. + return std::accumulate(alloc_stats_, alloc_stats_ + arraysize(alloc_stats_), init); +} + +template <bool kCount> +void ArenaAllocatorStatsImpl<kCount>::Dump(std::ostream& os, const Arena* first, + ssize_t lost_bytes_adjustment) const { + size_t malloc_bytes = 0u; + size_t lost_bytes = 0u; + size_t num_arenas = 0u; + for (const Arena* arena = first; arena != nullptr; arena = arena->next_) { + malloc_bytes += arena->Size(); + lost_bytes += arena->RemainingSpace(); + ++num_arenas; + } + // The lost_bytes_adjustment is used to make up for the fact that the current arena + // may not have the bytes_allocated_ updated correctly. 
+ lost_bytes += lost_bytes_adjustment; + const size_t bytes_allocated = BytesAllocated(); + os << " MEM: used: " << bytes_allocated << ", allocated: " << malloc_bytes + << ", lost: " << lost_bytes << "\n"; + size_t num_allocations = NumAllocations(); + if (num_allocations != 0) { + os << "Number of arenas allocated: " << num_arenas << ", Number of allocations: " + << num_allocations << ", avg size: " << bytes_allocated / num_allocations << "\n"; + } + os << "===== Allocation by kind\n"; + for (int i = 0; i < kNumArenaAllocKinds; i++) { + os << kAllocNames[i] << std::setw(10) << alloc_stats_[i] << "\n"; + } +} + +// Explicitly instantiate the used implementation. +template class ArenaAllocatorStatsImpl<kArenaAllocatorCountAllocations>; + Arena::Arena(size_t size) : bytes_allocated_(0), map_(nullptr), @@ -110,24 +175,26 @@ Arena* ArenaPool::AllocArena(size_t size) { return ret; } -void ArenaPool::FreeArena(Arena* arena) { - Thread* self = Thread::Current(); +void ArenaPool::FreeArenaChain(Arena* first) { if (UNLIKELY(RUNNING_ON_VALGRIND > 0)) { - VALGRIND_MAKE_MEM_UNDEFINED(arena->memory_, arena->bytes_allocated_); + for (Arena* arena = first; arena != nullptr; arena = arena->next_) { + VALGRIND_MAKE_MEM_UNDEFINED(arena->memory_, arena->bytes_allocated_); + } } - { + if (first != nullptr) { + Arena* last = first; + while (last->next_ != nullptr) { + last = last->next_; + } + Thread* self = Thread::Current(); MutexLock lock(self, lock_); - arena->next_ = free_arenas_; - free_arenas_ = arena; + last->next_ = free_arenas_; + free_arenas_ = first; } } size_t ArenaAllocator::BytesAllocated() const { - size_t total = 0; - for (int i = 0; i < kNumAllocKinds; i++) { - total += alloc_stats_[i]; - } - return total; + return ArenaAllocatorStats::BytesAllocated(); } ArenaAllocator::ArenaAllocator(ArenaPool* pool) @@ -136,9 +203,7 @@ ArenaAllocator::ArenaAllocator(ArenaPool* pool) end_(nullptr), ptr_(nullptr), arena_head_(nullptr), - num_allocations_(0), running_on_valgrind_(RUNNING_ON_VALGRIND > 0) { - memset(&alloc_stats_[0], 0, sizeof(alloc_stats_)); } void ArenaAllocator::UpdateBytesAllocated() { @@ -158,10 +223,7 @@ void* ArenaAllocator::AllocValgrind(size_t bytes, ArenaAllocKind kind) { return nullptr; } } - if (kCountAllocations) { - alloc_stats_[kind] += rounded_bytes; - ++num_allocations_; - } + ArenaAllocatorStats::RecordAlloc(rounded_bytes, kind); uint8_t* ret = ptr_; ptr_ += rounded_bytes; // Check that the memory is already zeroed out. @@ -175,11 +237,7 @@ void* ArenaAllocator::AllocValgrind(size_t bytes, ArenaAllocKind kind) { ArenaAllocator::~ArenaAllocator() { // Reclaim all the arenas by giving them back to the thread pool. UpdateBytesAllocated(); - while (arena_head_ != nullptr) { - Arena* arena = arena_head_; - arena_head_ = arena_head_->next_; - pool_->FreeArena(arena); - } + pool_->FreeArenaChain(arena_head_); } void ArenaAllocator::ObtainNewArenaForAllocation(size_t allocation_size) { @@ -192,30 +250,24 @@ void ArenaAllocator::ObtainNewArenaForAllocation(size_t allocation_size) { end_ = new_arena->End(); } +MemStats::MemStats(const char* name, const ArenaAllocatorStats* stats, const Arena* first_arena, + ssize_t lost_bytes_adjustment) + : name_(name), + stats_(stats), + first_arena_(first_arena), + lost_bytes_adjustment_(lost_bytes_adjustment) { +} + +void MemStats::Dump(std::ostream& os) const { + os << name_ << " stats:\n"; + stats_->Dump(os, first_arena_, lost_bytes_adjustment_); +} + // Dump memory usage stats. 
-void ArenaAllocator::DumpMemStats(std::ostream& os) const { - size_t malloc_bytes = 0; - // Start out with how many lost bytes we have in the arena we are currently allocating into. - size_t lost_bytes(end_ - ptr_); - size_t num_arenas = 0; - for (Arena* arena = arena_head_; arena != nullptr; arena = arena->next_) { - malloc_bytes += arena->Size(); - if (arena != arena_head_) { - lost_bytes += arena->RemainingSpace(); - } - ++num_arenas; - } - const size_t bytes_allocated = BytesAllocated(); - os << " MEM: used: " << bytes_allocated << ", allocated: " << malloc_bytes - << ", lost: " << lost_bytes << "\n"; - if (num_allocations_ != 0) { - os << "Number of arenas allocated: " << num_arenas << ", Number of allocations: " - << num_allocations_ << ", avg size: " << bytes_allocated / num_allocations_ << "\n"; - } - os << "===== Allocation by kind\n"; - for (int i = 0; i < kNumAllocKinds; i++) { - os << alloc_names[i] << std::setw(10) << alloc_stats_[i] << "\n"; - } +MemStats ArenaAllocator::GetMemStats() const { + ssize_t lost_bytes_adjustment = + (arena_head_ == nullptr) ? 0 : (end_ - ptr_) - arena_head_->RemainingSpace(); + return MemStats("ArenaAllocator", this, arena_head_, lost_bytes_adjustment); } } // namespace art diff --git a/compiler/utils/arena_allocator.h b/compiler/utils/arena_allocator.h index 56cedfefd5..18a5bce77d 100644 --- a/compiler/utils/arena_allocator.h +++ b/compiler/utils/arena_allocator.h @@ -20,6 +20,7 @@ #include <stdint.h> #include <stddef.h> +#include "base/macros.h" #include "base/mutex.h" #include "mem_map.h" @@ -28,6 +29,72 @@ namespace art { class Arena; class ArenaPool; class ArenaAllocator; +class ArenaStack; +class ScopedArenaAllocator; +class MemStats; + +static constexpr bool kArenaAllocatorCountAllocations = false; + +// Type of allocation for memory tuning. 
+enum ArenaAllocKind { + kArenaAllocMisc, + kArenaAllocBB, + kArenaAllocLIR, + kArenaAllocMIR, + kArenaAllocDFInfo, + kArenaAllocGrowableArray, + kArenaAllocGrowableBitMap, + kArenaAllocDalvikToSSAMap, + kArenaAllocDebugInfo, + kArenaAllocSuccessor, + kArenaAllocRegAlloc, + kArenaAllocData, + kArenaAllocPredecessors, + kArenaAllocSTL, + kNumArenaAllocKinds +}; + +template <bool kCount> +class ArenaAllocatorStatsImpl; + +template <> +class ArenaAllocatorStatsImpl<false> { + public: + ArenaAllocatorStatsImpl() = default; + ArenaAllocatorStatsImpl(const ArenaAllocatorStatsImpl& other) = default; + ArenaAllocatorStatsImpl& operator = (const ArenaAllocatorStatsImpl& other) = delete; + + void Copy(const ArenaAllocatorStatsImpl& other) { UNUSED(other); } + void RecordAlloc(size_t bytes, ArenaAllocKind kind) { UNUSED(bytes); UNUSED(kind); } + size_t NumAllocations() const { return 0u; } + size_t BytesAllocated() const { return 0u; } + void Dump(std::ostream& os, const Arena* first, ssize_t lost_bytes_adjustment) const { + UNUSED(os); UNUSED(first); UNUSED(lost_bytes_adjustment); + } +}; + +template <bool kCount> +class ArenaAllocatorStatsImpl { + public: + ArenaAllocatorStatsImpl(); + ArenaAllocatorStatsImpl(const ArenaAllocatorStatsImpl& other) = default; + ArenaAllocatorStatsImpl& operator = (const ArenaAllocatorStatsImpl& other) = delete; + + void Copy(const ArenaAllocatorStatsImpl& other); + void RecordAlloc(size_t bytes, ArenaAllocKind kind); + size_t NumAllocations() const; + size_t BytesAllocated() const; + void Dump(std::ostream& os, const Arena* first, ssize_t lost_bytes_adjustment) const; + + private: + size_t num_allocations_; + // TODO: Use std::array<size_t, kNumArenaAllocKinds> from C++11 when we upgrade the STL. + size_t alloc_stats_[kNumArenaAllocKinds]; // Bytes used by various allocation kinds. + + static const char* kAllocNames[kNumArenaAllocKinds]; +}; + +typedef ArenaAllocatorStatsImpl<kArenaAllocatorCountAllocations> ArenaAllocatorStats; class Arena { public: @@ -59,6 +126,9 @@ class Arena { Arena* next_; friend class ArenaPool; friend class ArenaAllocator; + friend class ArenaStack; + friend class ScopedArenaAllocator; + template <bool kCount> friend class ArenaAllocatorStatsImpl; DISALLOW_COPY_AND_ASSIGN(Arena); }; @@ -67,7 +137,7 @@ class ArenaPool { ArenaPool(); ~ArenaPool(); Arena* AllocArena(size_t size); - void FreeArena(Arena* arena); + void FreeArenaChain(Arena* first); private: Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; @@ -75,28 +145,8 @@ class ArenaPool { DISALLOW_COPY_AND_ASSIGN(ArenaPool); }; -class ArenaAllocator { +class ArenaAllocator : private ArenaAllocatorStats { public: - // Type of allocation for memory tuning. 
- enum ArenaAllocKind { - kAllocMisc, - kAllocBB, - kAllocLIR, - kAllocMIR, - kAllocDFInfo, - kAllocGrowableArray, - kAllocGrowableBitMap, - kAllocDalvikToSSAMap, - kAllocDebugInfo, - kAllocSuccessor, - kAllocRegAlloc, - kAllocData, - kAllocPredecessors, - kNumAllocKinds - }; - - static constexpr bool kCountAllocations = false; - explicit ArenaAllocator(ArenaPool* pool); ~ArenaAllocator(); @@ -113,10 +163,7 @@ class ArenaAllocator { return nullptr; } } - if (kCountAllocations) { - alloc_stats_[kind] += bytes; - ++num_allocations_; - } + ArenaAllocatorStats::RecordAlloc(bytes, kind); uint8_t* ret = ptr_; ptr_ += bytes; return ret; @@ -125,7 +172,7 @@ class ArenaAllocator { void* AllocValgrind(size_t bytes, ArenaAllocKind kind); void ObtainNewArenaForAllocation(size_t allocation_size); size_t BytesAllocated() const; - void DumpMemStats(std::ostream& os) const; + MemStats GetMemStats() const; private: void UpdateBytesAllocated(); @@ -135,21 +182,22 @@ class ArenaAllocator { uint8_t* end_; uint8_t* ptr_; Arena* arena_head_; - size_t num_allocations_; - size_t alloc_stats_[kNumAllocKinds]; // Bytes used by various allocation kinds. bool running_on_valgrind_; DISALLOW_COPY_AND_ASSIGN(ArenaAllocator); }; // ArenaAllocator -struct MemStats { - public: - void Dump(std::ostream& os) const { - arena_.DumpMemStats(os); - } - explicit MemStats(const ArenaAllocator &arena) : arena_(arena) {} - private: - const ArenaAllocator &arena_; +class MemStats { + public: + MemStats(const char* name, const ArenaAllocatorStats* stats, const Arena* first_arena, + ssize_t lost_bytes_adjustment = 0); + void Dump(std::ostream& os) const; + + private: + const char* const name_; + const ArenaAllocatorStats* const stats_; + const Arena* const first_arena_; + const ssize_t lost_bytes_adjustment_; }; // MemStats } // namespace art diff --git a/compiler/utils/arena_bit_vector.cc b/compiler/utils/arena_bit_vector.cc index 220ff14baa..eff9778612 100644 --- a/compiler/utils/arena_bit_vector.cc +++ b/compiler/utils/arena_bit_vector.cc @@ -25,13 +25,13 @@ class ArenaBitVectorAllocator : public Allocator { ~ArenaBitVectorAllocator() {} virtual void* Alloc(size_t size) { - return arena_->Alloc(size, ArenaAllocator::kAllocGrowableBitMap); + return arena_->Alloc(size, kArenaAllocGrowableBitMap); } virtual void Free(void*) {} // Nop. static void* operator new(size_t size, ArenaAllocator* arena) { - return arena->Alloc(sizeof(ArenaBitVectorAllocator), ArenaAllocator::kAllocGrowableBitMap); + return arena->Alloc(sizeof(ArenaBitVectorAllocator), kArenaAllocGrowableBitMap); } static void operator delete(void* p) {} // Nop. diff --git a/compiler/utils/arena_bit_vector.h b/compiler/utils/arena_bit_vector.h index 6c1461727a..1a3d6a3e34 100644 --- a/compiler/utils/arena_bit_vector.h +++ b/compiler/utils/arena_bit_vector.h @@ -55,7 +55,7 @@ class ArenaBitVector : public BitVector { ~ArenaBitVector() {} static void* operator new(size_t size, ArenaAllocator* arena) { - return arena->Alloc(sizeof(ArenaBitVector), ArenaAllocator::kAllocGrowableBitMap); + return arena->Alloc(sizeof(ArenaBitVector), kArenaAllocGrowableBitMap); } static void operator delete(void* p) {} // Nop. 
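The arena changes above are mostly mechanical at call sites: the allocation-kind tags move from the nested ArenaAllocator::kAlloc* constants to the file-level kArenaAlloc* enum, and per-kind byte counting moves into ArenaAllocatorStatsImpl<kCount>, which compiles to a no-op unless kArenaAllocatorCountAllocations is set to true. A minimal call-site sketch, illustrative only and not part of the change (arena is assumed to be an ArenaAllocator* and os a std::ostream& already in scope):

  // Before: the kind was scoped inside the allocator class.
  //   void* ptr = arena->Alloc(16, ArenaAllocator::kAllocMisc);
  // After: the kind is the free-standing ArenaAllocKind.
  void* ptr = arena->Alloc(16, kArenaAllocMisc);

  // With kArenaAllocatorCountAllocations == true, per-kind totals can be inspected:
  MemStats stats = arena->GetMemStats();
  stats.Dump(os);  // Prints used/allocated/lost bytes and the "Allocation by kind" table.
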
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc index 828dffafa1..dbd078a81c 100644 --- a/compiler/utils/arm/assembler_arm.cc +++ b/compiler/utils/arm/assembler_arm.cc @@ -1550,6 +1550,9 @@ void ArmAssembler::LoadRef(ManagedRegister mdest, ManagedRegister base, CHECK(dst.IsCoreRegister() && dst.IsCoreRegister()) << dst; LoadFromOffset(kLoadWord, dst.AsCoreRegister(), base.AsArm().AsCoreRegister(), offs.Int32Value()); + if (kPoisonHeapReferences) { + rsb(dst.AsCoreRegister(), dst.AsCoreRegister(), ShifterOperand(0)); + } } void ArmAssembler::LoadRef(ManagedRegister mdest, FrameOffset src) { diff --git a/compiler/utils/arm64/managed_register_arm64.cc b/compiler/utils/arm64/managed_register_arm64.cc index cc0b509033..de5cb8cd8d 100644 --- a/compiler/utils/arm64/managed_register_arm64.cc +++ b/compiler/utils/arm64/managed_register_arm64.cc @@ -27,10 +27,10 @@ namespace arm64 { // * [W0, W15] // * [D0, D31] // * [S0, S31] -static const int kNumberOfAvailableCoreRegisters = (X15 - X0) + 1; -static const int kNumberOfAvailableWRegisters = (W15 - W0) + 1; -static const int kNumberOfAvailableDRegisters = kNumberOfDRegisters; -static const int kNumberOfAvailableSRegisters = kNumberOfSRegisters; +// static const int kNumberOfAvailableCoreRegisters = (X15 - X0) + 1; +// static const int kNumberOfAvailableWRegisters = (W15 - W0) + 1; +// static const int kNumberOfAvailableDRegisters = kNumberOfDRegisters; +// static const int kNumberOfAvailableSRegisters = kNumberOfSRegisters; // Returns true if this managed-register overlaps the other managed-register. // GP Register Bank: diff --git a/compiler/utils/debug_stack.h b/compiler/utils/debug_stack.h new file mode 100644 index 0000000000..2e02b438b9 --- /dev/null +++ b/compiler/utils/debug_stack.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_UTILS_DEBUG_STACK_H_ +#define ART_COMPILER_UTILS_DEBUG_STACK_H_ + +#include "base/logging.h" +#include "base/macros.h" +#include "globals.h" + +namespace art { + +// Helper classes for reference counting to enforce construction/destruction order and +// usage of the top element of a stack in debug mode with no overhead in release mode. + +// Reference counter. No references allowed in destructor or in explicitly called CheckNoRefs(). +template <bool kIsDebug> +class DebugStackRefCounterImpl; +// Reference. Allows an explicit check that it's the top reference. +template <bool kIsDebug> +class DebugStackReferenceImpl; +// Indirect top reference. Checks that the reference is the top reference when used. +template <bool kIsDebug> +class DebugStackIndirectTopRefImpl; + +typedef DebugStackRefCounterImpl<kIsDebugBuild> DebugStackRefCounter; +typedef DebugStackReferenceImpl<kIsDebugBuild> DebugStackReference; +typedef DebugStackIndirectTopRefImpl<kIsDebugBuild> DebugStackIndirectTopRef; + +// Non-debug mode specializations. This should be optimized away. 
+ +template <> +class DebugStackRefCounterImpl<false> { + public: + size_t IncrementRefCount() { return 0u; } + void DecrementRefCount() { } + size_t GetRefCount() const { return 0u; } + void CheckNoRefs() const { } +}; + +template <> +class DebugStackReferenceImpl<false> { + public: + explicit DebugStackReferenceImpl(DebugStackRefCounterImpl<false>* counter) { UNUSED(counter); } + DebugStackReferenceImpl(const DebugStackReferenceImpl& other) = default; + DebugStackReferenceImpl& operator=(const DebugStackReferenceImpl& other) = default; + void CheckTop() { } +}; + +template <> +class DebugStackIndirectTopRefImpl<false> { + public: + explicit DebugStackIndirectTopRefImpl(DebugStackReferenceImpl<false>* ref) { UNUSED(ref); } + DebugStackIndirectTopRefImpl(const DebugStackIndirectTopRefImpl& other) = default; + DebugStackIndirectTopRefImpl& operator=(const DebugStackIndirectTopRefImpl& other) = default; + void CheckTop() { } +}; + +// Debug mode versions. + +template <bool kIsDebug> +class DebugStackRefCounterImpl { + public: + DebugStackRefCounterImpl() : ref_count_(0u) { } + ~DebugStackRefCounterImpl() { CheckNoRefs(); } + size_t IncrementRefCount() { return ++ref_count_; } + void DecrementRefCount() { --ref_count_; } + size_t GetRefCount() const { return ref_count_; } + void CheckNoRefs() const { CHECK_EQ(ref_count_, 0u); } + + private: + size_t ref_count_; +}; + +template <bool kIsDebug> +class DebugStackReferenceImpl { + public: + explicit DebugStackReferenceImpl(DebugStackRefCounterImpl<kIsDebug>* counter) + : counter_(counter), ref_count_(counter->IncrementRefCount()) { + } + DebugStackReferenceImpl(const DebugStackReferenceImpl& other) + : counter_(other.counter_), ref_count_(counter_->IncrementRefCount()) { + } + DebugStackReferenceImpl& operator=(const DebugStackReferenceImpl& other) { + CHECK(counter_ == other.counter_); + return *this; + } + ~DebugStackReferenceImpl() { counter_->DecrementRefCount(); } + void CheckTop() { CHECK_EQ(counter_->GetRefCount(), ref_count_); } + + private: + DebugStackRefCounterImpl<true>* counter_; + size_t ref_count_; +}; + +template <bool kIsDebug> +class DebugStackIndirectTopRefImpl { + public: + explicit DebugStackIndirectTopRefImpl(DebugStackReferenceImpl<kIsDebug>* ref) + : ref_(ref) { + CheckTop(); + } + DebugStackIndirectTopRefImpl(const DebugStackIndirectTopRefImpl& other) + : ref_(other.ref_) { + CheckTop(); + } + DebugStackIndirectTopRefImpl& operator=(const DebugStackIndirectTopRefImpl& other) { + CHECK(ref_ == other->ref_); + CheckTop(); + return *this; + } + ~DebugStackIndirectTopRefImpl() { + CheckTop(); + } + void CheckTop() { + ref_->CheckTop(); + } + + private: + DebugStackReferenceImpl<kIsDebug>* ref_; +}; + +} // namespace art + +#endif // ART_COMPILER_UTILS_DEBUG_STACK_H_ diff --git a/compiler/utils/growable_array.h b/compiler/utils/growable_array.h index 82b6a607e7..a7d1f0e5a5 100644 --- a/compiler/utils/growable_array.h +++ b/compiler/utils/growable_array.h @@ -75,7 +75,7 @@ class GrowableArray { num_used_(0), kind_(kind) { elem_list_ = static_cast<T*>(arena_->Alloc(sizeof(T) * init_length, - ArenaAllocator::kAllocGrowableArray)); + kArenaAllocGrowableArray)); }; @@ -89,7 +89,7 @@ class GrowableArray { target_length = new_length; } T* new_array = static_cast<T*>(arena_->Alloc(sizeof(T) * target_length, - ArenaAllocator::kAllocGrowableArray)); + kArenaAllocGrowableArray)); memcpy(new_array, elem_list_, sizeof(T) * num_allocated_); num_allocated_ = target_length; elem_list_ = new_array; @@ -181,7 +181,7 @@ class GrowableArray { T* 
GetRawStorage() const { return elem_list_; } static void* operator new(size_t size, ArenaAllocator* arena) { - return arena->Alloc(sizeof(GrowableArray<T>), ArenaAllocator::kAllocGrowableArray); + return arena->Alloc(sizeof(GrowableArray<T>), kArenaAllocGrowableArray); }; static void operator delete(void* p) {} // Nop. diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc index fdd2bab4da..ce21b84867 100644 --- a/compiler/utils/mips/assembler_mips.cc +++ b/compiler/utils/mips/assembler_mips.cc @@ -684,6 +684,9 @@ void MipsAssembler::LoadRef(ManagedRegister mdest, ManagedRegister base, CHECK(dest.IsCoreRegister() && dest.IsCoreRegister()); LoadFromOffset(kLoadWord, dest.AsCoreRegister(), base.AsMips().AsCoreRegister(), offs.Int32Value()); + if (kPoisonHeapReferences) { + Subu(dest.AsCoreRegister(), ZERO, dest.AsCoreRegister()); + } } void MipsAssembler::LoadRawPtr(ManagedRegister mdest, ManagedRegister base, diff --git a/compiler/utils/scoped_arena_allocator.cc b/compiler/utils/scoped_arena_allocator.cc new file mode 100644 index 0000000000..ee3b07ebe9 --- /dev/null +++ b/compiler/utils/scoped_arena_allocator.cc @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scoped_arena_allocator.h" + +#include "utils/arena_allocator.h" +#include <memcheck/memcheck.h> + +namespace art { + +static constexpr size_t kValgrindRedZoneBytes = 8; + +ArenaStack::ArenaStack(ArenaPool* arena_pool) + : DebugStackRefCounter(), + stats_and_pool_(arena_pool), + bottom_arena_(nullptr), + top_arena_(nullptr), + top_ptr_(nullptr), + top_end_(nullptr), + running_on_valgrind_(RUNNING_ON_VALGRIND > 0) { +} + +ArenaStack::~ArenaStack() { + stats_and_pool_.pool->FreeArenaChain(bottom_arena_); +} + +MemStats ArenaStack::GetPeakStats() const { + DebugStackRefCounter::CheckNoRefs(); + return MemStats("ArenaStack peak", static_cast<const TaggedStats<Peak>*>(&stats_and_pool_), + bottom_arena_); +} + +uint8_t* ArenaStack::AllocateFromNextArena(size_t rounded_bytes) { + UpdateBytesAllocated(); + size_t allocation_size = std::max(Arena::kDefaultSize, rounded_bytes); + if (UNLIKELY(top_arena_ == nullptr)) { + top_arena_ = bottom_arena_ = stats_and_pool_.pool->AllocArena(allocation_size); + top_arena_->next_ = nullptr; + } else if (top_arena_->next_ != nullptr && top_arena_->next_->Size() >= allocation_size) { + top_arena_ = top_arena_->next_; + } else { + Arena* tail = top_arena_->next_; + top_arena_->next_ = stats_and_pool_.pool->AllocArena(allocation_size); + top_arena_ = top_arena_->next_; + top_arena_->next_ = tail; + } + top_end_ = top_arena_->End(); + // top_ptr_ shall be updated by ScopedArenaAllocator. 
+ return top_arena_->Begin(); +} + +void ArenaStack::UpdatePeakStatsAndRestore(const ArenaAllocatorStats& restore_stats) { + if (PeakStats()->BytesAllocated() < CurrentStats()->BytesAllocated()) { + PeakStats()->Copy(*CurrentStats()); + } + CurrentStats()->Copy(restore_stats); +} + +void ArenaStack::UpdateBytesAllocated() { + if (top_arena_ != nullptr) { + // Update how many bytes we have allocated into the arena so that the arena pool knows how + // much memory to zero out. Though ScopedArenaAllocator doesn't guarantee the memory is + // zero-initialized, the Arena may be reused by ArenaAllocator which does guarantee this. + size_t allocated = static_cast<size_t>(top_ptr_ - top_arena_->Begin()); + if (top_arena_->bytes_allocated_ < allocated) { + top_arena_->bytes_allocated_ = allocated; + } + } +} + +void* ArenaStack::AllocValgrind(size_t bytes, ArenaAllocKind kind) { + size_t rounded_bytes = (bytes + kValgrindRedZoneBytes + 3) & ~3; + uint8_t* ptr = top_ptr_; + if (UNLIKELY(static_cast<size_t>(top_end_ - ptr) < rounded_bytes)) { + ptr = AllocateFromNextArena(rounded_bytes); + } + CurrentStats()->RecordAlloc(bytes, kind); + top_ptr_ = ptr + rounded_bytes; + VALGRIND_MAKE_MEM_NOACCESS(ptr + bytes, rounded_bytes - bytes); + return ptr; +} + +ScopedArenaAllocator::ScopedArenaAllocator(ArenaStack* arena_stack) + : DebugStackReference(arena_stack), + DebugStackRefCounter(), + ArenaAllocatorStats(*arena_stack->CurrentStats()), + arena_stack_(arena_stack), + mark_arena_(arena_stack->top_arena_), + mark_ptr_(arena_stack->top_ptr_), + mark_end_(arena_stack->top_end_) { +} + +ScopedArenaAllocator::~ScopedArenaAllocator() { + Reset(); +} + +void ScopedArenaAllocator::Reset() { + DebugStackReference::CheckTop(); + DebugStackRefCounter::CheckNoRefs(); + arena_stack_->UpdatePeakStatsAndRestore(*this); + arena_stack_->UpdateBytesAllocated(); + if (LIKELY(mark_arena_ != nullptr)) { + arena_stack_->top_arena_ = mark_arena_; + arena_stack_->top_ptr_ = mark_ptr_; + arena_stack_->top_end_ = mark_end_; + } else if (arena_stack_->bottom_arena_ != nullptr) { + mark_arena_ = arena_stack_->top_arena_ = arena_stack_->bottom_arena_; + mark_ptr_ = arena_stack_->top_ptr_ = mark_arena_->Begin(); + mark_end_ = arena_stack_->top_end_ = mark_arena_->End(); + } +} + +} // namespace art diff --git a/compiler/utils/scoped_arena_allocator.h b/compiler/utils/scoped_arena_allocator.h new file mode 100644 index 0000000000..24a8afea6e --- /dev/null +++ b/compiler/utils/scoped_arena_allocator.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ART_COMPILER_UTILS_SCOPED_ARENA_ALLOCATOR_H_ +#define ART_COMPILER_UTILS_SCOPED_ARENA_ALLOCATOR_H_ + +#include "base/logging.h" +#include "base/macros.h" +#include "utils/arena_allocator.h" +#include "utils/debug_stack.h" +#include "globals.h" + +namespace art { + +class ArenaStack; +class ScopedArenaAllocator; + +template <typename T> +class ScopedArenaAllocatorAdapter; + +// Holds a list of Arenas for use by ScopedArenaAllocator stack. +class ArenaStack : private DebugStackRefCounter { + public: + explicit ArenaStack(ArenaPool* arena_pool); + ~ArenaStack(); + + size_t PeakBytesAllocated() { + return PeakStats()->BytesAllocated(); + } + + MemStats GetPeakStats() const; + + private: + struct Peak; + struct Current; + template <typename Tag> struct TaggedStats : ArenaAllocatorStats { }; + struct StatsAndPool : TaggedStats<Peak>, TaggedStats<Current> { + explicit StatsAndPool(ArenaPool* arena_pool) : pool(arena_pool) { } + ArenaPool* const pool; + }; + + ArenaAllocatorStats* PeakStats() { + return static_cast<TaggedStats<Peak>*>(&stats_and_pool_); + } + + ArenaAllocatorStats* CurrentStats() { + return static_cast<TaggedStats<Current>*>(&stats_and_pool_); + } + + // Private - access via ScopedArenaAllocator or ScopedArenaAllocatorAdapter. + void* Alloc(size_t bytes, ArenaAllocKind kind) ALWAYS_INLINE { + if (UNLIKELY(running_on_valgrind_)) { + return AllocValgrind(bytes, kind); + } + size_t rounded_bytes = (bytes + 3) & ~3; + uint8_t* ptr = top_ptr_; + if (UNLIKELY(static_cast<size_t>(top_end_ - ptr) < rounded_bytes)) { + ptr = AllocateFromNextArena(rounded_bytes); + } + CurrentStats()->RecordAlloc(bytes, kind); + top_ptr_ = ptr + rounded_bytes; + return ptr; + } + + uint8_t* AllocateFromNextArena(size_t rounded_bytes); + void UpdatePeakStatsAndRestore(const ArenaAllocatorStats& restore_stats); + void UpdateBytesAllocated(); + void* AllocValgrind(size_t bytes, ArenaAllocKind kind); + + StatsAndPool stats_and_pool_; + Arena* bottom_arena_; + Arena* top_arena_; + uint8_t* top_ptr_; + uint8_t* top_end_; + + const bool running_on_valgrind_; + + friend class ScopedArenaAllocator; + template <typename T> + friend class ScopedArenaAllocatorAdapter; + + DISALLOW_COPY_AND_ASSIGN(ArenaStack); +}; + +class ScopedArenaAllocator + : private DebugStackReference, private DebugStackRefCounter, private ArenaAllocatorStats { + public: + // Create a ScopedArenaAllocator directly on the ArenaStack when the scope of + // the allocator is not exactly a C++ block scope. For example, an optimization + // pass can create the scoped allocator in Start() and destroy it in End(). + static ScopedArenaAllocator* Create(ArenaStack* arena_stack) { + void* addr = arena_stack->Alloc(sizeof(ScopedArenaAllocator), kArenaAllocMisc); + ScopedArenaAllocator* allocator = new(addr) ScopedArenaAllocator(arena_stack); + allocator->mark_ptr_ = reinterpret_cast<uint8_t*>(addr); + return allocator; + } + + explicit ScopedArenaAllocator(ArenaStack* arena_stack); + ~ScopedArenaAllocator(); + + void Reset(); + + void* Alloc(size_t bytes, ArenaAllocKind kind) ALWAYS_INLINE { + DebugStackReference::CheckTop(); + return arena_stack_->Alloc(bytes, kind); + } + + // ScopedArenaAllocatorAdapter is incomplete here, we need to define this later. + ScopedArenaAllocatorAdapter<void> Adapter(); + + // Allow a delete-expression to destroy but not deallocate allocators created by Create(). 
+ static void operator delete(void* ptr) { UNUSED(ptr); } + + private: + ArenaStack* const arena_stack_; + Arena* mark_arena_; + uint8_t* mark_ptr_; + uint8_t* mark_end_; + + template <typename T> + friend class ScopedArenaAllocatorAdapter; + + DISALLOW_COPY_AND_ASSIGN(ScopedArenaAllocator); +}; + +template <> +class ScopedArenaAllocatorAdapter<void> + : private DebugStackReference, private DebugStackIndirectTopRef { + public: + typedef void value_type; + typedef void* pointer; + typedef const void* const_pointer; + + template <typename U> + struct rebind { + typedef ScopedArenaAllocatorAdapter<U> other; + }; + + explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator) + : DebugStackReference(arena_allocator), + DebugStackIndirectTopRef(arena_allocator), + arena_stack_(arena_allocator->arena_stack_) { + } + template <typename U> + ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other) + : DebugStackReference(other), + DebugStackIndirectTopRef(other), + arena_stack_(other.arena_stack_) { + } + ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default; + ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default; + ~ScopedArenaAllocatorAdapter() = default; + + private: + ArenaStack* arena_stack_; + + template <typename U> + friend class ScopedArenaAllocatorAdapter; +}; + +// Adapter for use of ScopedArenaAllocator in STL containers. +template <typename T> +class ScopedArenaAllocatorAdapter : private DebugStackReference, private DebugStackIndirectTopRef { + public: + typedef T value_type; + typedef T* pointer; + typedef T& reference; + typedef const T* const_pointer; + typedef const T& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + + template <typename U> + struct rebind { + typedef ScopedArenaAllocatorAdapter<U> other; + }; + + explicit ScopedArenaAllocatorAdapter(ScopedArenaAllocator* arena_allocator) + : DebugStackReference(arena_allocator), + DebugStackIndirectTopRef(arena_allocator), + arena_stack_(arena_allocator->arena_stack_) { + } + template <typename U> + ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter<U>& other) + : DebugStackReference(other), + DebugStackIndirectTopRef(other), + arena_stack_(other.arena_stack_) { + } + ScopedArenaAllocatorAdapter(const ScopedArenaAllocatorAdapter& other) = default; + ScopedArenaAllocatorAdapter& operator=(const ScopedArenaAllocatorAdapter& other) = default; + ~ScopedArenaAllocatorAdapter() = default; + + size_type max_size() const { + return static_cast<size_type>(-1) / sizeof(T); + } + + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + pointer allocate(size_type n, ScopedArenaAllocatorAdapter<void>::pointer hint = nullptr) { + DCHECK_LE(n, max_size()); + DebugStackIndirectTopRef::CheckTop(); + return reinterpret_cast<T*>(arena_stack_->Alloc(n * sizeof(T), kArenaAllocSTL)); + } + void deallocate(pointer p, size_type n) { + DebugStackIndirectTopRef::CheckTop(); + } + + void construct(pointer p, const_reference val) { + DebugStackIndirectTopRef::CheckTop(); + new (static_cast<void*>(p)) value_type(val); + } + void destroy(pointer p) { + DebugStackIndirectTopRef::CheckTop(); + p->~value_type(); + } + + private: + ArenaStack* arena_stack_; + + template <typename U> + friend class ScopedArenaAllocatorAdapter; +}; + +inline ScopedArenaAllocatorAdapter<void> ScopedArenaAllocator::Adapter() { + return ScopedArenaAllocatorAdapter<void>(this); +} + 
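// Usage sketch, illustrative only and not part of this change: ties together ArenaStack,
// ScopedArenaAllocator and the adapter declared above. Assumes an ArenaPool* named pool and
// that <vector> is available; in debug builds the DebugStack* bases CHECK that scoped
// allocators are destroyed in LIFO order and that only the top one is used.
static void ScopedArenaUsageSketch(ArenaPool* pool) {
  ArenaStack arena_stack(pool);
  {
    ScopedArenaAllocator allocator(&arena_stack);
    void* raw = allocator.Alloc(128, kArenaAllocMisc);  // Raw bytes from the top of the arena stack.
    UNUSED(raw);
    // STL container backed by the scoped arena; its allocations are tagged kArenaAllocSTL.
    std::vector<int, ScopedArenaAllocatorAdapter<int> > ints(allocator.Adapter());
    ints.push_back(42);
  }  // ~ScopedArenaAllocator() rewinds the arena stack; no per-object deallocation is needed.
  MemStats peak = arena_stack.GetPeakStats();  // Peak usage across all scopes on this stack.
  UNUSED(peak);
}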
+} // namespace art + +#endif // ART_COMPILER_UTILS_SCOPED_ARENA_ALLOCATOR_H_ diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 26300e02e0..db8956d43b 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -1560,6 +1560,9 @@ void X86Assembler::LoadRef(ManagedRegister mdest, ManagedRegister base, X86ManagedRegister dest = mdest.AsX86(); CHECK(dest.IsCpuRegister() && dest.IsCpuRegister()); movl(dest.AsCpuRegister(), Address(base.AsX86().AsCpuRegister(), offs)); + if (kPoisonHeapReferences) { + negl(dest.AsCpuRegister()); + } } void X86Assembler::LoadRawPtr(ManagedRegister mdest, ManagedRegister base, diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc index 67a9e060d9..7c81ffb16e 100644 --- a/dex2oat/dex2oat.cc +++ b/dex2oat/dex2oat.cc @@ -54,7 +54,6 @@ #include "runtime.h" #include "ScopedLocalRef.h" #include "scoped_thread_state_change.h" -#include "sirt_ref.h" #include "vector_output_stream.h" #include "well_known_classes.h" #include "zip_archive.h" diff --git a/runtime/Android.mk b/runtime/Android.mk index bb1bc990ee..18e2d3e630 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -84,7 +84,6 @@ LIBART_COMMON_SRC_FILES := \ jdwp/object_registry.cc \ jni_internal.cc \ jobject_comparator.cc \ - locks.cc \ mem_map.cc \ memory_region.cc \ mirror/art_field.cc \ @@ -289,7 +288,6 @@ LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \ invoke_type.h \ jdwp/jdwp.h \ jdwp/jdwp_constants.h \ - locks.h \ lock_word.h \ mirror/class.h \ oat.h \ @@ -425,14 +423,8 @@ $$(ENUM_OPERATOR_OUT_GEN): $$(GENERATED_SRC_DIR)/%_operator_out.cc : $(LOCAL_PAT endif endef -ifeq ($(ART_BUILD_TARGET_NDEBUG),true) - $(eval $(call build-libart,target,ndebug,$(ART_TARGET_CLANG))) -endif -ifeq ($(ART_BUILD_TARGET_DEBUG),true) - $(eval $(call build-libart,target,debug,$(ART_TARGET_CLANG))) -endif - -# We always build dex2oat and dependencies, even if the host build is otherwise disabled, since they are used to cross compile for the target. +# We always build dex2oat and dependencies, even if the host build is otherwise disabled, since +# they are used to cross compile for the target. 
ifeq ($(WITH_HOST_DALVIK),true) ifeq ($(ART_BUILD_NDEBUG),true) $(eval $(call build-libart,host,ndebug,$(ART_HOST_CLANG))) @@ -441,3 +433,10 @@ ifeq ($(WITH_HOST_DALVIK),true) $(eval $(call build-libart,host,debug,$(ART_HOST_CLANG))) endif endif + +ifeq ($(ART_BUILD_TARGET_NDEBUG),true) + $(eval $(call build-libart,target,ndebug,$(ART_TARGET_CLANG))) +endif +ifeq ($(ART_BUILD_TARGET_DEBUG),true) + $(eval $(call build-libart,target,debug,$(ART_TARGET_CLANG))) +endif diff --git a/runtime/arch/arm/context_arm.h b/runtime/arch/arm/context_arm.h index 4a0d08292c..2ccce8dcaf 100644 --- a/runtime/arch/arm/context_arm.h +++ b/runtime/arch/arm/context_arm.h @@ -17,7 +17,6 @@ #ifndef ART_RUNTIME_ARCH_ARM_CONTEXT_ARM_H_ #define ART_RUNTIME_ARCH_ARM_CONTEXT_ARM_H_ -#include "locks.h" #include "arch/context.h" #include "base/logging.h" #include "registers_arm.h" diff --git a/runtime/arch/context.h b/runtime/arch/context.h index 83bbb11fd5..f7b7835466 100644 --- a/runtime/arch/context.h +++ b/runtime/arch/context.h @@ -20,7 +20,7 @@ #include <stddef.h> #include <stdint.h> -#include "locks.h" +#include "base/mutex.h" namespace art { diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S index b59c0cbe50..14975dadd4 100644 --- a/runtime/arch/x86_64/asm_support_x86_64.S +++ b/runtime/arch/x86_64/asm_support_x86_64.S @@ -19,38 +19,25 @@ #include "asm_support_x86_64.h" -#if defined(__APPLE__) - // Mac OS' as(1) doesn't let you name macro parameters. +#if defined(__clang__) + // Clang's as(1) doesn't let you name macro parameters. #define MACRO0(macro_name) .macro macro_name #define MACRO1(macro_name, macro_arg1) .macro macro_name #define MACRO2(macro_name, macro_arg1, macro_args2) .macro macro_name #define MACRO3(macro_name, macro_arg1, macro_args2, macro_args3) .macro macro_name #define END_MACRO .endmacro - // Mac OS' as(1) uses $0, $1, and so on for macro arguments, and function names - // are mangled with an extra underscore prefix. The use of $x for arguments - // mean that literals need to be represented with $$x in macros. - #define SYMBOL(name) _ ## name - #define PLT_SYMBOL(name) _ ## name + // Clang's as(1) uses $0, $1, and so on for macro arguments. #define VAR(name,index) SYMBOL($index) #define PLT_VAR(name, index) SYMBOL($index) #define REG_VAR(name,index) %$index #define CALL_MACRO(name,index) $index + #define FUNCTION_TYPE(name,index) .type $index, @function + #define SIZE(name,index) .size $index, .-$index + + // The use of $x for arguments mean that literals need to be represented with $$x in macros. #define LITERAL(value) $value #define MACRO_LITERAL(value) $$value - - // Mac OS' doesn't like cfi_* directives - #define CFI_STARTPROC - #define CFI_ENDPROC - #define CFI_ADJUST_CFA_OFFSET(size) - #define CFI_DEF_CFA(reg,size) - #define CFI_DEF_CFA_REGISTER(reg) - #define CFI_RESTORE(reg) - #define CFI_REL_OFFSET(reg,size) - - // Mac OS' doesn't support certain directives - #define FUNCTION_TYPE(name) - #define SIZE(name) #else // Regular gas(1) lets you name macro parameters. #define MACRO0(macro_name) .macro macro_name @@ -65,16 +52,19 @@ // no special meaning to $, so literals are still just $x. The use of altmacro means % is a // special character meaning care needs to be taken when passing registers as macro arguments. 
.altmacro - #define SYMBOL(name) name - #define PLT_SYMBOL(name) name@PLT #define VAR(name,index) name& #define PLT_VAR(name, index) name&@PLT #define REG_VAR(name,index) %name #define CALL_MACRO(name,index) name& + #define FUNCTION_TYPE(name,index) .type name&, @function + #define SIZE(name,index) .size name, .-name + #define LITERAL(value) $value #define MACRO_LITERAL(value) $value +#endif - // CFI support + // CFI support. +#if !defined(__APPLE__) #define CFI_STARTPROC .cfi_startproc #define CFI_ENDPROC .cfi_endproc #define CFI_ADJUST_CFA_OFFSET(size) .cfi_adjust_cfa_offset size @@ -82,9 +72,25 @@ #define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg #define CFI_RESTORE(reg) .cfi_restore reg #define CFI_REL_OFFSET(reg,size) .cfi_rel_offset reg,size +#else + // Mac OS' doesn't like cfi_* directives. + #define CFI_STARTPROC + #define CFI_ENDPROC + #define CFI_ADJUST_CFA_OFFSET(size) + #define CFI_DEF_CFA(reg,size) + #define CFI_DEF_CFA_REGISTER(reg) + #define CFI_RESTORE(reg) + #define CFI_REL_OFFSET(reg,size) +#endif - #define FUNCTION_TYPE(name) .type name&, @function - #define SIZE(name) .size name, .-name + // Symbols. +#if !defined(__APPLE__) + #define SYMBOL(name) name + #define PLT_SYMBOL(name) name ## @PLT +#else + // Mac OS' symbols have an _ prefix. + #define SYMBOL(name) _ ## name + #define PLT_SYMBOL(name) _ ## name #endif /* Cache alignment for function entry */ @@ -93,7 +99,7 @@ MACRO0(ALIGN_FUNCTION_ENTRY) END_MACRO MACRO1(DEFINE_FUNCTION, c_name) - FUNCTION_TYPE(\c_name) + FUNCTION_TYPE(\c_name, 0) .globl VAR(c_name, 0) ALIGN_FUNCTION_ENTRY VAR(c_name, 0): @@ -102,7 +108,7 @@ END_MACRO MACRO1(END_FUNCTION, c_name) CFI_ENDPROC - SIZE(\c_name) + SIZE(\c_name, 0) END_MACRO MACRO1(PUSH, reg) @@ -118,7 +124,7 @@ MACRO1(POP, reg) END_MACRO MACRO1(UNIMPLEMENTED,name) - FUNCTION_TYPE(\name) + FUNCTION_TYPE(\name, 0) .globl VAR(name, 0) ALIGN_FUNCTION_ENTRY VAR(name, 0): @@ -126,21 +132,7 @@ VAR(name, 0): int3 int3 CFI_ENDPROC - SIZE(\name) -END_MACRO - -MACRO0(SETUP_GOT_NOSAVE) - call __x86.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx -END_MACRO - -MACRO0(SETUP_GOT) - PUSH ebx - SETUP_GOT_NOSAVE -END_MACRO - -MACRO0(UNDO_SETUP_GOT) - POP ebx + SIZE(\name, 0) END_MACRO #endif // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_S_ diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 863fa318cd..a78a1e5676 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -33,7 +33,7 @@ MACRO0(SETUP_SAVE_ALL_CALLEE_SAVE_FRAME) PUSH r12 // Callee save. PUSH rbp // Callee save. PUSH rbx // Callee save. - subq LITERAL(8), %rsp // Space for Method* (also aligns the frame). + subq MACRO_LITERAL(8), %rsp // Space for Method* (also aligns the frame). CFI_ADJUST_CFA_OFFSET(8) // R10 := ArtMethod* for ref and args callee save frame method. movq RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10 @@ -76,7 +76,7 @@ MACRO0(SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME) PUSH rdx // Quick arg 2. PUSH rcx // Quick arg 3. // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*. - subq LITERAL(80), %rsp + subq MACRO_LITERAL(80), %rsp CFI_ADJUST_CFA_OFFSET(80) // R10 := ArtMethod* for ref and args callee save frame method. 
movq RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10 @@ -103,7 +103,7 @@ MACRO0(RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME) movq 56(%rsp), %xmm5 movq 64(%rsp), %xmm6 movq 72(%rsp), %xmm7 - addq LITERAL(80), %rsp + addq MACRO_LITERAL(80), %rsp CFI_ADJUST_CFA_OFFSET(-80) // Restore callee and GPR args, mixed together to agree with core spills bitmap. POP rcx @@ -226,26 +226,26 @@ INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvo MACRO2(LOOP_OVER_SHORTY_LOADING_XMMS, xmm_reg, finished) 1: // LOOP movb (%r10), %al // al := *shorty - addq LITERAL(1), %r10 // shorty++ - cmpb LITERAL(0), %al // if (al == '\0') goto xmm_setup_finished + addq MACRO_LITERAL(1), %r10 // shorty++ + cmpb MACRO_LITERAL(0), %al // if (al == '\0') goto xmm_setup_finished je VAR(finished, 1) - cmpb LITERAL(68), %al // if (al == 'D') goto FOUND_DOUBLE + cmpb MACRO_LITERAL(68), %al // if (al == 'D') goto FOUND_DOUBLE je 2f - cmpb LITERAL(70), %al // if (al == 'F') goto FOUND_FLOAT + cmpb MACRO_LITERAL(70), %al // if (al == 'F') goto FOUND_FLOAT je 3f - addq LITERAL(4), %r11 // arg_array++ + addq MACRO_LITERAL(4), %r11 // arg_array++ // Handle extra space in arg array taken by a long. - cmpb LITERAL(74), %al // if (al != 'J') goto LOOP + cmpb MACRO_LITERAL(74), %al // if (al != 'J') goto LOOP jne 1b - addq LITERAL(4), %r11 // arg_array++ + addq MACRO_LITERAL(4), %r11 // arg_array++ jmp 1b // goto LOOP 2: // FOUND_DOUBLE movsd (%r11), REG_VAR(xmm_reg, 0) - addq LITERAL(8), %r11 // arg_array+=2 + addq MACRO_LITERAL(8), %r11 // arg_array+=2 jmp 4f 3: // FOUND_FLOAT movss (%r11), REG_VAR(xmm_reg, 0) - addq LITERAL(4), %r11 // arg_array++ + addq MACRO_LITERAL(4), %r11 // arg_array++ 4: END_MACRO @@ -257,27 +257,27 @@ END_MACRO MACRO3(LOOP_OVER_SHORTY_LOADING_GPRS, gpr_reg64, gpr_reg32, finished) 1: // LOOP movb (%r10), %al // al := *shorty - addq LITERAL(1), %r10 // shorty++ - cmpb LITERAL(0), %al // if (al == '\0') goto gpr_setup_finished + addq MACRO_LITERAL(1), %r10 // shorty++ + cmpb MACRO_LITERAL(0), %al // if (al == '\0') goto gpr_setup_finished je VAR(finished, 2) - cmpb LITERAL(74), %al // if (al == 'J') goto FOUND_LONG + cmpb MACRO_LITERAL(74), %al // if (al == 'J') goto FOUND_LONG je 2f - cmpb LITERAL(70), %al // if (al == 'F') goto SKIP_FLOAT + cmpb MACRO_LITERAL(70), %al // if (al == 'F') goto SKIP_FLOAT je 3f - cmpb LITERAL(68), %al // if (al == 'D') goto SKIP_DOUBLE + cmpb MACRO_LITERAL(68), %al // if (al == 'D') goto SKIP_DOUBLE je 4f movl (%r11), REG_VAR(gpr_reg32, 1) - addq LITERAL(4), %r11 // arg_array++ + addq MACRO_LITERAL(4), %r11 // arg_array++ jmp 5f 2: // FOUND_LONG movq (%r11), REG_VAR(gpr_reg64, 0) - addq LITERAL(8), %r11 // arg_array+=2 + addq MACRO_LITERAL(8), %r11 // arg_array+=2 jmp 5f 3: // SKIP_FLOAT - addq LITERAL(4), %r11 // arg_array++ + addq MACRO_LITERAL(4), %r11 // arg_array++ jmp 1b 4: // SKIP_DOUBLE - addq LITERAL(8), %r11 // arg_array+=2 + addq MACRO_LITERAL(8), %r11 // arg_array+=2 jmp 1b 5: END_MACRO @@ -766,7 +766,11 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline // 16-byte aligned: 4336 // Note: 14x8 = 7*16, so the stack stays aligned for the native call... 
// Also means: the padding is somewhere in the middle - subq LITERAL(4336), %rsp + // + // + // New test: use 5K and release + // 5k = 5120 + subq LITERAL(5120), %rsp // prepare for artQuickGenericJniTrampoline call // (Thread*, SP) // rdi rsi <= C calling convention @@ -774,9 +778,13 @@ DEFINE_FUNCTION art_quick_generic_jni_trampoline movq %gs:THREAD_SELF_OFFSET, %rdi movq %rbp, %rsi call PLT_SYMBOL(artQuickGenericJniTrampoline) // (Thread*, sp) - test %rax, %rax // check whether code pointer is NULL, also indicates exception - jz 1f - // pop from the register-passing alloca + test %rax, %rax // check whether error (negative value) + js 1f + // release part of the alloca + addq %rax, %rsp + // get the code pointer + popq %rax + // pop from the register-passing alloca region // what's the right layout? popq %rdi popq %rsi diff --git a/runtime/barrier.h b/runtime/barrier.h index e335c327be..0c7fd87a79 100644 --- a/runtime/barrier.h +++ b/runtime/barrier.h @@ -18,7 +18,6 @@ #define ART_RUNTIME_BARRIER_H_ #include "base/mutex.h" -#include "locks.h" #include "UniquePtr.h" namespace art { diff --git a/runtime/base/logging.h b/runtime/base/logging.h index 075d571197..0fcec1f277 100644 --- a/runtime/base/logging.h +++ b/runtime/base/logging.h @@ -192,7 +192,7 @@ class LogMessage { : data_(new LogMessageData(file, line, severity, error)) { } - ~LogMessage() LOCKS_EXCLUDED(Locks::logging_lock_); + ~LogMessage(); // TODO: enable LOCKS_EXCLUDED(Locks::logging_lock_). std::ostream& stream() { return data_->buffer; @@ -235,32 +235,6 @@ std::ostream& operator<<(std::ostream& os, const Dumpable<T>& rhs) { return os; } -template<typename T> -class MutatorLockedDumpable { - public: - explicit MutatorLockedDumpable(T& value) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) : value_(value) { - } - - void Dump(std::ostream& os) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - value_.Dump(os); - } - - private: - T& value_; - - DISALLOW_COPY_AND_ASSIGN(MutatorLockedDumpable); -}; - -template<typename T> -std::ostream& operator<<(std::ostream& os, const MutatorLockedDumpable<T>& rhs) -// TODO: should be SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) however annotalysis -// currently fails for this. - NO_THREAD_SAFETY_ANALYSIS { - rhs.Dump(os); - return os; -} - // Helps you use operator<< in a const char*-like context such as our various 'F' methods with // format strings. template<typename T> diff --git a/runtime/base/macros.h b/runtime/base/macros.h index 6cc9396bf2..b193ff18d4 100644 --- a/runtime/base/macros.h +++ b/runtime/base/macros.h @@ -178,48 +178,40 @@ char (&ArraySizeHelper(T (&array)[N]))[N]; template<typename T> void UNUSED(const T&) {} -#if defined(__SUPPORT_TS_ANNOTATION__) - -#define ACQUIRED_AFTER(...) __attribute__ ((acquired_after(__VA_ARGS__))) -#define ACQUIRED_BEFORE(...) __attribute__ ((acquired_before(__VA_ARGS__))) -#define EXCLUSIVE_LOCK_FUNCTION(...) __attribute__ ((exclusive_lock(__VA_ARGS__))) -#define EXCLUSIVE_LOCKS_REQUIRED(...) __attribute__ ((exclusive_locks_required(__VA_ARGS__))) -#define EXCLUSIVE_TRYLOCK_FUNCTION(...) __attribute__ ((exclusive_trylock(__VA_ARGS__))) -#define GUARDED_BY(x) __attribute__ ((guarded_by(x))) -#define GUARDED_VAR __attribute__ ((guarded)) -#define LOCKABLE __attribute__ ((lockable)) -#define LOCK_RETURNED(x) __attribute__ ((lock_returned(x))) -#define LOCKS_EXCLUDED(...) 
__attribute__ ((locks_excluded(__VA_ARGS__))) -#define NO_THREAD_SAFETY_ANALYSIS __attribute__ ((no_thread_safety_analysis)) -#define PT_GUARDED_BY(x) __attribute__ ((point_to_guarded_by(x))) -#define PT_GUARDED_VAR __attribute__ ((point_to_guarded)) -#define SCOPED_LOCKABLE __attribute__ ((scoped_lockable)) -#define SHARED_LOCK_FUNCTION(...) __attribute__ ((shared_lock(__VA_ARGS__))) -#define SHARED_LOCKS_REQUIRED(...) __attribute__ ((shared_locks_required(__VA_ARGS__))) -#define SHARED_TRYLOCK_FUNCTION(...) __attribute__ ((shared_trylock(__VA_ARGS__))) -#define UNLOCK_FUNCTION(...) __attribute__ ((unlock(__VA_ARGS__))) - +// Annotalysis thread-safety analysis support. +#if defined(__SUPPORT_TS_ANNOTATION__) || defined(__clang__) +#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) #else +#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif -#define ACQUIRED_AFTER(...) -#define ACQUIRED_BEFORE(...) -#define EXCLUSIVE_LOCK_FUNCTION(...) -#define EXCLUSIVE_LOCKS_REQUIRED(...) -#define EXCLUSIVE_TRYLOCK_FUNCTION(...) -#define GUARDED_BY(x) -#define GUARDED_VAR -#define LOCKABLE -#define LOCK_RETURNED(x) -#define LOCKS_EXCLUDED(...) -#define NO_THREAD_SAFETY_ANALYSIS +#define ACQUIRED_AFTER(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) +#define ACQUIRED_BEFORE(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) +#define EXCLUSIVE_LOCKS_REQUIRED(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(__VA_ARGS__)) +#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) +#define GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(guarded) +#define LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(lockable) +#define LOCK_RETURNED(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) +#define LOCKS_EXCLUDED(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) +#define NO_THREAD_SAFETY_ANALYSIS THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) #define PT_GUARDED_BY(x) -#define PT_GUARDED_VAR -#define SCOPED_LOCKABLE -#define SHARED_LOCK_FUNCTION(...) -#define SHARED_LOCKS_REQUIRED(...) -#define SHARED_TRYLOCK_FUNCTION(...) -#define UNLOCK_FUNCTION(...) - -#endif // defined(__SUPPORT_TS_ANNOTATION__) +// THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded_by(x)) +#define PT_GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded) +#define SCOPED_LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) +#define SHARED_LOCKS_REQUIRED(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(__VA_ARGS__)) + +#if defined(__clang__) +#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__)) +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__)) +#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__)) +#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__)) +#define UNLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__)) +#else +#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock(__VA_ARGS__)) +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock(__VA_ARGS__)) +#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock(__VA_ARGS__)) +#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock(__VA_ARGS__)) +#define UNLOCK_FUNCTION(...) 
THREAD_ANNOTATION_ATTRIBUTE__(unlock(__VA_ARGS__)) +#endif #endif // ART_RUNTIME_BASE_MACROS_H_ diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc index ff72d16908..fdf5763c3f 100644 --- a/runtime/base/mutex.cc +++ b/runtime/base/mutex.cc @@ -29,6 +29,30 @@ namespace art { +Mutex* Locks::abort_lock_ = nullptr; +Mutex* Locks::breakpoint_lock_ = nullptr; +Mutex* Locks::deoptimization_lock_ = nullptr; +ReaderWriterMutex* Locks::classlinker_classes_lock_ = nullptr; +ReaderWriterMutex* Locks::heap_bitmap_lock_ = nullptr; +Mutex* Locks::logging_lock_ = nullptr; +ReaderWriterMutex* Locks::mutator_lock_ = nullptr; +Mutex* Locks::runtime_shutdown_lock_ = nullptr; +Mutex* Locks::thread_list_lock_ = nullptr; +Mutex* Locks::thread_suspend_count_lock_ = nullptr; +Mutex* Locks::trace_lock_ = nullptr; +Mutex* Locks::profiler_lock_ = nullptr; +Mutex* Locks::unexpected_signal_lock_ = nullptr; +Mutex* Locks::intern_table_lock_ = nullptr; + +struct AllMutexData { + // A guard for all_mutexes_ that's not a mutex (Mutexes must CAS to acquire and busy wait). + Atomic<const BaseMutex*> all_mutexes_guard; + // All created mutexes guarded by all_mutexes_guard_. + std::set<BaseMutex*>* all_mutexes; + AllMutexData() : all_mutexes(NULL) {} +}; +static struct AllMutexData gAllMutexData[kAllMutexDataSize]; + #if ART_USE_FUTEXES static bool ComputeRelativeTimeSpec(timespec* result_ts, const timespec& lhs, const timespec& rhs) { const int32_t one_sec = 1000 * 1000 * 1000; // one second in nanoseconds. @@ -45,15 +69,6 @@ static bool ComputeRelativeTimeSpec(timespec* result_ts, const timespec& lhs, co } #endif -struct AllMutexData { - // A guard for all_mutexes_ that's not a mutex (Mutexes must CAS to acquire and busy wait). - Atomic<const BaseMutex*> all_mutexes_guard; - // All created mutexes guarded by all_mutexes_guard_. - std::set<BaseMutex*>* all_mutexes; - AllMutexData() : all_mutexes(NULL) {} -}; -static struct AllMutexData gAllMutexData[kAllMutexDataSize]; - class ScopedAllMutexesLock { public: explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) { @@ -792,4 +807,53 @@ void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) { guard_.recursion_count_ = old_recursion_count; } +void Locks::Init() { + if (logging_lock_ != nullptr) { + // Already initialized. 
+ DCHECK(abort_lock_ != nullptr); + DCHECK(breakpoint_lock_ != nullptr); + DCHECK(deoptimization_lock_ != nullptr); + DCHECK(classlinker_classes_lock_ != nullptr); + DCHECK(heap_bitmap_lock_ != nullptr); + DCHECK(logging_lock_ != nullptr); + DCHECK(mutator_lock_ != nullptr); + DCHECK(thread_list_lock_ != nullptr); + DCHECK(thread_suspend_count_lock_ != nullptr); + DCHECK(trace_lock_ != nullptr); + DCHECK(profiler_lock_ != nullptr); + DCHECK(unexpected_signal_lock_ != nullptr); + DCHECK(intern_table_lock_ != nullptr); + } else { + logging_lock_ = new Mutex("logging lock", kLoggingLock, true); + abort_lock_ = new Mutex("abort lock", kAbortLock, true); + + DCHECK(breakpoint_lock_ == nullptr); + breakpoint_lock_ = new Mutex("breakpoint lock", kBreakpointLock); + DCHECK(deoptimization_lock_ == nullptr); + deoptimization_lock_ = new Mutex("deoptimization lock", kDeoptimizationLock); + DCHECK(classlinker_classes_lock_ == nullptr); + classlinker_classes_lock_ = new ReaderWriterMutex("ClassLinker classes lock", + kClassLinkerClassesLock); + DCHECK(heap_bitmap_lock_ == nullptr); + heap_bitmap_lock_ = new ReaderWriterMutex("heap bitmap lock", kHeapBitmapLock); + DCHECK(mutator_lock_ == nullptr); + mutator_lock_ = new ReaderWriterMutex("mutator lock", kMutatorLock); + DCHECK(runtime_shutdown_lock_ == nullptr); + runtime_shutdown_lock_ = new Mutex("runtime shutdown lock", kRuntimeShutdownLock); + DCHECK(thread_list_lock_ == nullptr); + thread_list_lock_ = new Mutex("thread list lock", kThreadListLock); + DCHECK(thread_suspend_count_lock_ == nullptr); + thread_suspend_count_lock_ = new Mutex("thread suspend count lock", kThreadSuspendCountLock); + DCHECK(trace_lock_ == nullptr); + trace_lock_ = new Mutex("trace lock", kTraceLock); + DCHECK(profiler_lock_ == nullptr); + profiler_lock_ = new Mutex("profiler lock", kProfilerLock); + DCHECK(unexpected_signal_lock_ == nullptr); + unexpected_signal_lock_ = new Mutex("unexpected signal lock", kUnexpectedSignalLock, true); + DCHECK(intern_table_lock_ == nullptr); + intern_table_lock_ = new Mutex("InternTable lock", kInternTableLock); + } +} + + } // namespace art diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h index 63ed6cbe2f..55ec1c3839 100644 --- a/runtime/base/mutex.h +++ b/runtime/base/mutex.h @@ -27,7 +27,6 @@ #include "base/logging.h" #include "base/macros.h" #include "globals.h" -#include "locks.h" #if defined(__APPLE__) #define ART_USE_FUTEXES 0 @@ -44,9 +43,56 @@ namespace art { +class LOCKABLE ReaderWriterMutex; class ScopedContentionRecorder; class Thread; +// LockLevel is used to impose a lock hierarchy [1] where acquisition of a Mutex at a higher or +// equal level to a lock a thread holds is invalid. The lock hierarchy achieves a cycle free +// partial ordering and thereby cause deadlock situations to fail checks. 
+// +// [1] http://www.drdobbs.com/parallel/use-lock-hierarchies-to-avoid-deadlock/204801163 +enum LockLevel { + kLoggingLock = 0, + kUnexpectedSignalLock, + kThreadSuspendCountLock, + kAbortLock, + kJdwpSocketLock, + kRosAllocGlobalLock, + kRosAllocBracketLock, + kRosAllocBulkFreeLock, + kAllocSpaceLock, + kDexFileMethodInlinerLock, + kDexFileToMethodInlinerMapLock, + kMarkSweepMarkStackLock, + kTransactionLogLock, + kInternTableLock, + kMonitorPoolLock, + kDefaultMutexLevel, + kMarkSweepLargeObjectLock, + kPinTableLock, + kLoadLibraryLock, + kJdwpObjectRegistryLock, + kClassLinkerClassesLock, + kBreakpointLock, + kMonitorLock, + kThreadListLock, + kBreakpointInvokeLock, + kDeoptimizationLock, + kTraceLock, + kProfilerLock, + kJdwpEventListLock, + kJdwpAttachLock, + kJdwpStartLock, + kRuntimeShutdownLock, + kHeapBitmapLock, + kMutatorLock, + kZygoteCreationLock, + + kLockLevelCount // Must come last. +}; +std::ostream& operator<<(std::ostream& os, const LockLevel& rhs); + const bool kDebugLocking = kIsDebugBuild; // Record Log contention information, dumpable via SIGQUIT. @@ -413,6 +459,117 @@ class SCOPED_LOCKABLE WriterMutexLock { // "WriterMutexLock mu(lock)". #define WriterMutexLock(x) COMPILE_ASSERT(0, writer_mutex_lock_declaration_missing_variable_name) +// Global mutexes corresponding to the levels above. +class Locks { + public: + static void Init(); + + // The mutator_lock_ is used to allow mutators to execute in a shared (reader) mode or to block + // mutators by having an exclusive (writer) owner. In normal execution each mutator thread holds + // a share on the mutator_lock_. The garbage collector may also execute with shared access but + // at times requires exclusive access to the heap (not to be confused with the heap meta-data + // guarded by the heap_lock_ below). When the garbage collector requires exclusive access it asks + // the mutators to suspend themselves which also involves usage of the thread_suspend_count_lock_ + // to cover weaknesses in using ReaderWriterMutexes with ConditionVariables. We use a condition + // variable to wait upon in the suspension logic as releasing and then re-acquiring a share on + // the mutator lock doesn't necessarily allow the exclusive user (e.g the garbage collector) + // chance to acquire the lock. + // + // Thread suspension: + // Shared users | Exclusive user + // (holding mutator lock and in kRunnable state) | .. running .. + // .. running .. | Request thread suspension by: + // .. running .. | - acquiring thread_suspend_count_lock_ + // .. running .. | - incrementing Thread::suspend_count_ on + // .. running .. | all mutator threads + // .. running .. | - releasing thread_suspend_count_lock_ + // .. running .. | Block trying to acquire exclusive mutator lock + // Poll Thread::suspend_count_ and enter full | .. blocked .. + // suspend code. | .. blocked .. + // Change state to kSuspended | .. blocked .. + // x: Release share on mutator_lock_ | Carry out exclusive access + // Acquire thread_suspend_count_lock_ | .. exclusive .. + // while Thread::suspend_count_ > 0 | .. exclusive .. + // - wait on Thread::resume_cond_ | .. exclusive .. + // (releases thread_suspend_count_lock_) | .. exclusive .. + // .. waiting .. | Release mutator_lock_ + // .. waiting .. | Request thread resumption by: + // .. waiting .. | - acquiring thread_suspend_count_lock_ + // .. waiting .. | - decrementing Thread::suspend_count_ on + // .. waiting .. | all mutator threads + // .. waiting .. 
| - notifying on Thread::resume_cond_ + // - re-acquire thread_suspend_count_lock_ | - releasing thread_suspend_count_lock_ + // Release thread_suspend_count_lock_ | .. running .. + // Acquire share on mutator_lock_ | .. running .. + // - This could block but the thread still | .. running .. + // has a state of kSuspended and so this | .. running .. + // isn't an issue. | .. running .. + // Acquire thread_suspend_count_lock_ | .. running .. + // - we poll here as we're transitioning into | .. running .. + // kRunnable and an individual thread suspend | .. running .. + // request (e.g for debugging) won't try | .. running .. + // to acquire the mutator lock (which would | .. running .. + // block as we hold the mutator lock). This | .. running .. + // poll ensures that if the suspender thought | .. running .. + // we were suspended by incrementing our | .. running .. + // Thread::suspend_count_ and then reading | .. running .. + // our state we go back to waiting on | .. running .. + // Thread::resume_cond_. | .. running .. + // can_go_runnable = Thread::suspend_count_ == 0 | .. running .. + // Release thread_suspend_count_lock_ | .. running .. + // if can_go_runnable | .. running .. + // Change state to kRunnable | .. running .. + // else | .. running .. + // Goto x | .. running .. + // .. running .. | .. running .. + static ReaderWriterMutex* mutator_lock_; + + // Allow reader-writer mutual exclusion on the mark and live bitmaps of the heap. + static ReaderWriterMutex* heap_bitmap_lock_ ACQUIRED_AFTER(mutator_lock_); + + // Guards shutdown of the runtime. + static Mutex* runtime_shutdown_lock_ ACQUIRED_AFTER(heap_bitmap_lock_); + + // The thread_list_lock_ guards ThreadList::list_. It is also commonly held to stop threads + // attaching and detaching. + static Mutex* thread_list_lock_ ACQUIRED_AFTER(runtime_shutdown_lock_); + + // Guards breakpoints. + static Mutex* breakpoint_lock_ ACQUIRED_AFTER(thread_list_lock_); + + // Guards deoptimization requests. + static Mutex* deoptimization_lock_ ACQUIRED_AFTER(breakpoint_lock_); + + // Guards trace requests. + static Mutex* trace_lock_ ACQUIRED_AFTER(deoptimization_lock_); + + // Guards profile objects. + static Mutex* profiler_lock_ ACQUIRED_AFTER(trace_lock_); + + // Guards lists of classes within the class linker. + static ReaderWriterMutex* classlinker_classes_lock_ ACQUIRED_AFTER(profiler_lock_); + + // When declaring any Mutex add DEFAULT_MUTEX_ACQUIRED_AFTER to use annotalysis to check the code + // doesn't try to hold a higher level Mutex. + #define DEFAULT_MUTEX_ACQUIRED_AFTER ACQUIRED_AFTER(Locks::classlinker_classes_lock_) + + // Guards intern table. + static Mutex* intern_table_lock_ ACQUIRED_AFTER(classlinker_classes_lock_); + + // Have an exclusive aborting thread. + static Mutex* abort_lock_ ACQUIRED_AFTER(classlinker_classes_lock_); + + // Allow mutual exclusion when manipulating Thread::suspend_count_. + // TODO: Does the trade-off of a per-thread lock make sense? + static Mutex* thread_suspend_count_lock_ ACQUIRED_AFTER(abort_lock_); + + // One unexpected signal at a time lock. + static Mutex* unexpected_signal_lock_ ACQUIRED_AFTER(thread_suspend_count_lock_); + + // Have an exclusive logging thread. 
+ static Mutex* logging_lock_ ACQUIRED_AFTER(unexpected_signal_lock_); +}; + } // namespace art #endif // ART_RUNTIME_BASE_MUTEX_H_ diff --git a/runtime/class_linker-inl.h b/runtime/class_linker-inl.h index 754d1dd8c2..6c53563a3b 100644 --- a/runtime/class_linker-inl.h +++ b/runtime/class_linker-inl.h @@ -24,7 +24,7 @@ #include "mirror/iftable.h" #include "mirror/object_array.h" #include "object_utils.h" -#include "sirt_ref.h" +#include "sirt_ref-inl.h" namespace art { diff --git a/runtime/class_linker.h b/runtime/class_linker.h index aad7cfc875..701e62e57a 100644 --- a/runtime/class_linker.h +++ b/runtime/class_linker.h @@ -260,7 +260,7 @@ class ClassLinker { bool GenerateOatFile(const char* dex_filename, int oat_fd, const char* oat_cache_filename, - std::string* error_msg); + std::string* error_msg) LOCKS_EXCLUDED(Locks::mutator_lock_); const OatFile* FindOatFileFromOatLocation(const std::string& location, @@ -519,7 +519,7 @@ class ClassLinker { SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); const OatFile* FindOpenedOatFileFromDexLocation(const char* dex_location, const uint32_t* const dex_location_checksum) - LOCKS_EXCLUDED(dex_lock); + LOCKS_EXCLUDED(dex_lock_); const OatFile* FindOpenedOatFileFromOatLocation(const std::string& oat_location) LOCKS_EXCLUDED(dex_lock_); const DexFile* FindDexFileInOatLocation(const char* dex_location, diff --git a/runtime/compiler_callbacks.h b/runtime/compiler_callbacks.h index 7233d8ee7f..b07043f5d7 100644 --- a/runtime/compiler_callbacks.h +++ b/runtime/compiler_callbacks.h @@ -17,8 +17,8 @@ #ifndef ART_RUNTIME_COMPILER_CALLBACKS_H_ #define ART_RUNTIME_COMPILER_CALLBACKS_H_ +#include "base/mutex.h" #include "class_reference.h" -#include "locks.h" namespace art { diff --git a/runtime/debugger.cc b/runtime/debugger.cc index 3b4e9c7fd6..7e2dfd2766 100644 --- a/runtime/debugger.cc +++ b/runtime/debugger.cc @@ -184,14 +184,14 @@ static Dbg::HpsgWhat gDdmHpsgWhat; static Dbg::HpsgWhen gDdmNhsgWhen = Dbg::HPSG_WHEN_NEVER; static Dbg::HpsgWhat gDdmNhsgWhat; -static ObjectRegistry* gRegistry = NULL; +static ObjectRegistry* gRegistry = nullptr; // Recent allocation tracking. -static Mutex gAllocTrackerLock DEFAULT_MUTEX_ACQUIRED_AFTER("AllocTracker lock"); -AllocRecord* Dbg::recent_allocation_records_ PT_GUARDED_BY(gAllocTrackerLock) = NULL; // TODO: CircularBuffer<AllocRecord> -static size_t gAllocRecordMax GUARDED_BY(gAllocTrackerLock) = 0; -static size_t gAllocRecordHead GUARDED_BY(gAllocTrackerLock) = 0; -static size_t gAllocRecordCount GUARDED_BY(gAllocTrackerLock) = 0; +Mutex* Dbg::alloc_tracker_lock_ = nullptr; +AllocRecord* Dbg::recent_allocation_records_ = nullptr; // TODO: CircularBuffer<AllocRecord> +size_t Dbg::alloc_record_max_ = 0; +size_t Dbg::alloc_record_head_ = 0; +size_t Dbg::alloc_record_count_ = 0; // Deoptimization support. struct MethodInstrumentationRequest { @@ -468,9 +468,10 @@ void Dbg::StartJdwp() { return; } - CHECK(gRegistry == NULL); + CHECK(gRegistry == nullptr); gRegistry = new ObjectRegistry; + alloc_tracker_lock_ = new Mutex("AllocTracker lock"); // Init JDWP if the debugger is enabled. This may connect out to a // debugger, passively listen for a debugger, or block waiting for a // debugger. @@ -496,9 +497,11 @@ void Dbg::StopJdwp() { // Prevent the JDWP thread from processing JDWP incoming packets after we close the connection. 
Disposed(); delete gJdwpState; - gJdwpState = NULL; + gJdwpState = nullptr; delete gRegistry; - gRegistry = NULL; + gRegistry = nullptr; + delete alloc_tracker_lock_; + alloc_tracker_lock_ = nullptr; } void Dbg::GcDidFinish() { @@ -3695,15 +3698,15 @@ static size_t GetAllocTrackerMax() { } void Dbg::SetAllocTrackingEnabled(bool enabled) { - MutexLock mu(Thread::Current(), gAllocTrackerLock); + MutexLock mu(Thread::Current(), *alloc_tracker_lock_); if (enabled) { if (recent_allocation_records_ == NULL) { - gAllocRecordMax = GetAllocTrackerMax(); - LOG(INFO) << "Enabling alloc tracker (" << gAllocRecordMax << " entries of " + alloc_record_max_ = GetAllocTrackerMax(); + LOG(INFO) << "Enabling alloc tracker (" << alloc_record_max_ << " entries of " << kMaxAllocRecordStackDepth << " frames, taking " - << PrettySize(sizeof(AllocRecord) * gAllocRecordMax) << ")"; - gAllocRecordHead = gAllocRecordCount = 0; - recent_allocation_records_ = new AllocRecord[gAllocRecordMax]; + << PrettySize(sizeof(AllocRecord) * alloc_record_max_) << ")"; + alloc_record_head_ = alloc_record_count_ = 0; + recent_allocation_records_ = new AllocRecord[alloc_record_max_]; CHECK(recent_allocation_records_ != NULL); } Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints(); @@ -3750,18 +3753,18 @@ void Dbg::RecordAllocation(mirror::Class* type, size_t byte_count) { Thread* self = Thread::Current(); CHECK(self != NULL); - MutexLock mu(self, gAllocTrackerLock); + MutexLock mu(self, *alloc_tracker_lock_); if (recent_allocation_records_ == NULL) { return; } // Advance and clip. - if (++gAllocRecordHead == gAllocRecordMax) { - gAllocRecordHead = 0; + if (++alloc_record_head_ == alloc_record_max_) { + alloc_record_head_ = 0; } // Fill in the basics. - AllocRecord* record = &recent_allocation_records_[gAllocRecordHead]; + AllocRecord* record = &recent_allocation_records_[alloc_record_head_]; record->type = type; record->byte_count = byte_count; record->thin_lock_id = self->GetThreadId(); @@ -3770,8 +3773,8 @@ void Dbg::RecordAllocation(mirror::Class* type, size_t byte_count) { AllocRecordStackVisitor visitor(self, record); visitor.WalkStack(); - if (gAllocRecordCount < gAllocRecordMax) { - ++gAllocRecordCount; + if (alloc_record_count_ < alloc_record_max_) { + ++alloc_record_count_; } } @@ -3783,13 +3786,14 @@ void Dbg::RecordAllocation(mirror::Class* type, size_t byte_count) { // // We need to handle underflow in our circular buffer, so we add // gAllocRecordMax and then mask it back down. -static inline int HeadIndex() EXCLUSIVE_LOCKS_REQUIRED(gAllocTrackerLock) { - return (gAllocRecordHead+1 + gAllocRecordMax - gAllocRecordCount) & (gAllocRecordMax-1); +size_t Dbg::HeadIndex() { + return (Dbg::alloc_record_head_ + 1 + Dbg::alloc_record_max_ - Dbg::alloc_record_count_) & + (Dbg::alloc_record_max_ - 1); } void Dbg::DumpRecentAllocations() { ScopedObjectAccess soa(Thread::Current()); - MutexLock mu(soa.Self(), gAllocTrackerLock); + MutexLock mu(soa.Self(), *alloc_tracker_lock_); if (recent_allocation_records_ == NULL) { LOG(INFO) << "Not recording tracked allocations"; return; @@ -3798,9 +3802,9 @@ void Dbg::DumpRecentAllocations() { // "i" is the head of the list. We want to start at the end of the // list and move forward to the tail. 
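  // Worked example of the HeadIndex() arithmetic above (the "& (max - 1)" trick
  // is a modulo only when alloc_record_max_ is a power of two): with
  // alloc_record_max_ = 16, alloc_record_head_ = 3 and alloc_record_count_ = 6,
  // HeadIndex() = (3 + 1 + 16 - 6) & 15 = 14, so the walk below visits slots
  // 14, 15, 0, 1, 2, 3 -- oldest record first, the current head last -- stepping
  // with i = (i + 1) & (alloc_record_max_ - 1).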
size_t i = HeadIndex(); - size_t count = gAllocRecordCount; + size_t count = alloc_record_count_; - LOG(INFO) << "Tracked allocations, (head=" << gAllocRecordHead << " count=" << count << ")"; + LOG(INFO) << "Tracked allocations, (head=" << alloc_record_head_ << " count=" << count << ")"; while (count--) { AllocRecord* record = &recent_allocation_records_[i]; @@ -3820,22 +3824,20 @@ void Dbg::DumpRecentAllocations() { usleep(40000); } - i = (i + 1) & (gAllocRecordMax-1); + i = (i + 1) & (alloc_record_max_ - 1); } } void Dbg::UpdateObjectPointers(IsMarkedCallback* visitor, void* arg) { - { - MutexLock mu(Thread::Current(), gAllocTrackerLock); - if (recent_allocation_records_ != nullptr) { - size_t i = HeadIndex(); - size_t count = gAllocRecordCount; - while (count--) { - AllocRecord* record = &recent_allocation_records_[i]; - DCHECK(record != nullptr); - record->UpdateObjectPointers(visitor, arg); - i = (i + 1) & (gAllocRecordMax - 1); - } + if (recent_allocation_records_ != nullptr) { + MutexLock mu(Thread::Current(), *alloc_tracker_lock_); + size_t i = HeadIndex(); + size_t count = alloc_record_count_; + while (count--) { + AllocRecord* record = &recent_allocation_records_[i]; + DCHECK(record != nullptr); + record->UpdateObjectPointers(visitor, arg); + i = (i + 1) & (alloc_record_max_ - 1); } } if (gRegistry != nullptr) { @@ -3941,7 +3943,7 @@ jbyteArray Dbg::GetRecentAllocations() { Thread* self = Thread::Current(); std::vector<uint8_t> bytes; { - MutexLock mu(self, gAllocTrackerLock); + MutexLock mu(self, *alloc_tracker_lock_); // // Part 1: generate string tables. // @@ -3949,7 +3951,7 @@ jbyteArray Dbg::GetRecentAllocations() { StringTable method_names; StringTable filenames; - int count = gAllocRecordCount; + int count = alloc_record_count_; int idx = HeadIndex(); while (count--) { AllocRecord* record = &recent_allocation_records_[idx]; @@ -3967,10 +3969,10 @@ jbyteArray Dbg::GetRecentAllocations() { } } - idx = (idx + 1) & (gAllocRecordMax-1); + idx = (idx + 1) & (alloc_record_max_ - 1); } - LOG(INFO) << "allocation records: " << gAllocRecordCount; + LOG(INFO) << "allocation records: " << alloc_record_count_; // // Part 2: Generate the output and store it in the buffer. @@ -3991,14 +3993,14 @@ jbyteArray Dbg::GetRecentAllocations() { // (2b) number of class name strings // (2b) number of method name strings // (2b) number of source file name strings - JDWP::Append2BE(bytes, gAllocRecordCount); + JDWP::Append2BE(bytes, alloc_record_count_); size_t string_table_offset = bytes.size(); JDWP::Append4BE(bytes, 0); // We'll patch this later... 
JDWP::Append2BE(bytes, class_names.Size()); JDWP::Append2BE(bytes, method_names.Size()); JDWP::Append2BE(bytes, filenames.Size()); - count = gAllocRecordCount; + count = alloc_record_count_; idx = HeadIndex(); while (count--) { // For each entry: @@ -4032,7 +4034,7 @@ jbyteArray Dbg::GetRecentAllocations() { JDWP::Append2BE(bytes, record->stack[stack_frame].LineNumber()); } - idx = (idx + 1) & (gAllocRecordMax-1); + idx = (idx + 1) & (alloc_record_max_ - 1); } // (xb) class name strings diff --git a/runtime/debugger.h b/runtime/debugger.h index 5d269ee457..6c44bdea8f 100644 --- a/runtime/debugger.h +++ b/runtime/debugger.h @@ -391,7 +391,7 @@ class Dbg { LOCKS_EXCLUDED(Locks::deoptimization_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); static void DisableFullDeoptimization() - EXCLUSIVE_LOCKS_REQUIRED(event_list_lock_) + LOCKS_EXCLUDED(Locks::deoptimization_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); // Manage deoptimization after updating JDWP events list. This must be done while all mutator @@ -448,8 +448,11 @@ class Dbg { static void RecordAllocation(mirror::Class* type, size_t byte_count) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); static void SetAllocTrackingEnabled(bool enabled); - static inline bool IsAllocTrackingEnabled() { return recent_allocation_records_ != NULL; } + static bool IsAllocTrackingEnabled() { + return recent_allocation_records_ != nullptr; + } static jbyteArray GetRecentAllocations() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + static size_t HeadIndex() EXCLUSIVE_LOCKS_REQUIRED(alloc_tracker_lock_); static void DumpRecentAllocations(); // Updates the stored direct object pointers (called from SweepSystemWeaks). @@ -488,7 +491,14 @@ class Dbg { static void PostThreadStartOrStop(Thread*, uint32_t) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - static AllocRecord* recent_allocation_records_; + static Mutex* alloc_tracker_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + + static AllocRecord* recent_allocation_records_ PT_GUARDED_BY(alloc_tracker_lock_); + static size_t alloc_record_max_ GUARDED_BY(alloc_tracker_lock_); + static size_t alloc_record_head_ GUARDED_BY(alloc_tracker_lock_); + static size_t alloc_record_count_ GUARDED_BY(alloc_tracker_lock_); + + DISALLOW_COPY_AND_ASSIGN(Dbg); }; #define CHUNK_TYPE(_name) \ diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h index a8fb6c14a1..8b48b3647f 100644 --- a/runtime/entrypoints/entrypoint_utils.h +++ b/runtime/entrypoints/entrypoint_utils.h @@ -29,9 +29,8 @@ #include "mirror/class-inl.h" #include "mirror/object-inl.h" #include "mirror/throwable.h" -#include "locks.h" #include "object_utils.h" -#include "sirt_ref.h" +#include "sirt_ref-inl.h" #include "thread.h" namespace art { @@ -642,8 +641,7 @@ static inline mirror::String* ResolveStringFromCode(mirror::ArtMethod* referrer, } static inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) - UNLOCK_FUNCTION(monitor_lock_) { + NO_THREAD_SAFETY_ANALYSIS /* SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) */ { // Save any pending exception over monitor exit call. 
mirror::Throwable* saved_exception = NULL; ThrowLocation saved_throw_location; diff --git a/runtime/entrypoints/portable/portable_jni_entrypoints.cc b/runtime/entrypoints/portable/portable_jni_entrypoints.cc index de1e32ef17..17ad4d047c 100644 --- a/runtime/entrypoints/portable/portable_jni_entrypoints.cc +++ b/runtime/entrypoints/portable/portable_jni_entrypoints.cc @@ -23,7 +23,7 @@ namespace art { // Called on entry to JNI, transition out of Runnable and release share of mutator_lock_. extern "C" uint32_t art_portable_jni_method_start(Thread* self) - UNLOCK_FUNCTION(GlobalSynchronizatio::mutator_lock_) { + UNLOCK_FUNCTION(Locks::mutator_lock_) { JNIEnvExt* env = self->GetJniEnv(); uint32_t saved_local_ref_cookie = env->local_ref_cookie; env->local_ref_cookie = env->locals.GetSegmentState(); @@ -32,7 +32,7 @@ extern "C" uint32_t art_portable_jni_method_start(Thread* self) } extern "C" uint32_t art_portable_jni_method_start_synchronized(jobject to_lock, Thread* self) - UNLOCK_FUNCTION(Locks::mutator_lock_) { + UNLOCK_FUNCTION(Locks::mutator_lock_) NO_THREAD_SAFETY_ANALYSIS { self->DecodeJObject(to_lock)->MonitorEnter(self); return art_portable_jni_method_start(self); } diff --git a/runtime/entrypoints/portable/portable_lock_entrypoints.cc b/runtime/entrypoints/portable/portable_lock_entrypoints.cc index 44d3da9897..358ac233dc 100644 --- a/runtime/entrypoints/portable/portable_lock_entrypoints.cc +++ b/runtime/entrypoints/portable/portable_lock_entrypoints.cc @@ -20,8 +20,9 @@ namespace art { extern "C" void art_portable_lock_object_from_code(mirror::Object* obj, Thread* thread) - EXCLUSIVE_LOCK_FUNCTION(monitor_lock_) { - DCHECK(obj != NULL); // Assumed to have been checked before entry. + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + NO_THREAD_SAFETY_ANALYSIS /* EXCLUSIVE_LOCK_FUNCTION(Monitor::monitor_lock_) */ { + DCHECK(obj != nullptr); // Assumed to have been checked before entry. obj->MonitorEnter(thread); // May block. DCHECK(thread->HoldsLock(obj)); // Only possible exception is NPE and is handled before entry. @@ -29,8 +30,9 @@ extern "C" void art_portable_lock_object_from_code(mirror::Object* obj, Thread* } extern "C" void art_portable_unlock_object_from_code(mirror::Object* obj, Thread* thread) - UNLOCK_FUNCTION(monitor_lock_) { - DCHECK(obj != NULL); // Assumed to have been checked before entry. + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + NO_THREAD_SAFETY_ANALYSIS /* UNLOCK_FUNCTION(Monitor::monitor_lock_) */ { + DCHECK(obj != nullptr); // Assumed to have been checked before entry. // MonitorExit may throw exception. obj->MonitorExit(thread); } diff --git a/runtime/entrypoints/quick/callee_save_frame.h b/runtime/entrypoints/quick/callee_save_frame.h index 8f7004920d..3fd4adc0a7 100644 --- a/runtime/entrypoints/quick/callee_save_frame.h +++ b/runtime/entrypoints/quick/callee_save_frame.h @@ -26,8 +26,8 @@ class ArtMethod; } // namespace mirror // Place a special frame at the TOS that will save the callee saves for the given type. -static void FinishCalleeSaveFrameSetup(Thread* self, mirror::ArtMethod** sp, - Runtime::CalleeSaveType type) +static inline void FinishCalleeSaveFrameSetup(Thread* self, mirror::ArtMethod** sp, + Runtime::CalleeSaveType type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { // Be aware the store below may well stomp on an incoming argument. 
Locks::mutator_lock_->AssertSharedHeld(self); diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc index 737fa3e735..116957d54c 100644 --- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc @@ -24,6 +24,7 @@ #include "object_utils.h" #include "scoped_thread_state_change.h" #include "thread.h" +#include "verify_object-inl.h" namespace art { diff --git a/runtime/entrypoints/quick/quick_lock_entrypoints.cc b/runtime/entrypoints/quick/quick_lock_entrypoints.cc index 5bc7f4cdec..817d053c9b 100644 --- a/runtime/entrypoints/quick/quick_lock_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_lock_entrypoints.cc @@ -21,7 +21,8 @@ namespace art { extern "C" int artLockObjectFromCode(mirror::Object* obj, Thread* self, mirror::ArtMethod** sp) - EXCLUSIVE_LOCK_FUNCTION(monitor_lock_) { + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + NO_THREAD_SAFETY_ANALYSIS /* EXCLUSIVE_LOCK_FUNCTION(Monitor::monitor_lock_) */ { FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsOnly); if (UNLIKELY(obj == NULL)) { ThrowLocation throw_location(self->GetCurrentLocationForThrow()); @@ -42,7 +43,8 @@ extern "C" int artLockObjectFromCode(mirror::Object* obj, Thread* self, mirror:: } extern "C" int artUnlockObjectFromCode(mirror::Object* obj, Thread* self, mirror::ArtMethod** sp) - UNLOCK_FUNCTION(monitor_lock_) { + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + NO_THREAD_SAFETY_ANALYSIS /* UNLOCK_FUNCTION(Monitor::monitor_lock_) */ { FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsOnly); if (UNLIKELY(obj == NULL)) { ThrowLocation throw_location(self->GetCurrentLocationForThrow()); diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index bf8b8bab64..1bbaa6a7fd 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -820,84 +820,492 @@ extern "C" const void* artQuickResolutionTrampoline(mirror::ArtMethod* called, return code; } -// Visits arguments on the stack placing them into a region lower down the stack for the benefit -// of transitioning into native code. -class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { + + +/* + * This class uses a couple of observations to unite the different calling conventions through + * a few constants. + * + * 1) Number of registers used for passing is normally even, so counting down has no penalty for + * possible alignment. + * 2) Known 64b architectures store 8B units on the stack, both for integral and floating point + * types, so using uintptr_t is OK. Also means that we can use kRegistersNeededX to denote + * when we have to split things + * 3) The only soft-float, Arm, is 32b, so no widening needs to be taken into account for floats + * and we can use Int handling directly. + * 4) Only 64b architectures widen, and their stack is aligned 8B anyways, so no padding code + * necessary when widening. Also, widening of Ints will take place implicitly, and the + * extension should be compatible with Aarch64, which mandates copying the available bits + * into LSB and leaving the rest unspecified. + * 5) Aligning longs and doubles is necessary on arm only, and it's the same in registers and on + * the stack. + * 6) There is only little endian. + * + * + * Actual work is supposed to be done in a delegate of the template type. 
The interface is as + * follows: + * + * void PushGpr(uintptr_t): Add a value for the next GPR + * + * void PushFpr4(float): Add a value for the next FPR of size 32b. Is only called if we need + * padding, that is, think the architecture is 32b and aligns 64b. + * + * void PushFpr8(uint64_t): Push a double. We _will_ call this on 32b, it's the callee's job to + * split this if necessary. The current state will have aligned, if + * necessary. + * + * void PushStack(uintptr_t): Push a value to the stack. + * + * uintptr_t PushSirt(mirror::Object* ref): Add a reference to the Sirt. Is guaranteed != nullptr. + * Must return the jobject, that is, the reference to the + * entry in the Sirt. + * + */ +template <class T> class BuildGenericJniFrameStateMachine { + public: #if defined(__arm__) // TODO: These are all dummy values! - static constexpr bool kNativeSoftFloatAbi = false; // This is a hard float ABI. - static constexpr size_t kNumNativeGprArgs = 3; // 3 arguments passed in GPRs. + static constexpr bool kNativeSoftFloatAbi = true; + static constexpr size_t kNumNativeGprArgs = 4; // 4 arguments passed in GPRs, r0-r3 static constexpr size_t kNumNativeFprArgs = 0; // 0 arguments passed in FPRs. - static constexpr size_t kGprStackOffset = 4336; - static constexpr size_t kFprStackOffset = 4336 - 6*8; - static constexpr size_t kCallStackStackOffset = 4336 - 112; - static constexpr size_t kRegistersNeededForLong = 2; static constexpr size_t kRegistersNeededForDouble = 2; + static constexpr bool kMultiRegistersAligned = true; + static constexpr bool kMultiRegistersWidened = false; + static constexpr bool kAlignLongOnStack = true; + static constexpr bool kAlignDoubleOnStack = true; #elif defined(__mips__) // TODO: These are all dummy values! static constexpr bool kNativeSoftFloatAbi = true; // This is a hard float ABI. static constexpr size_t kNumNativeGprArgs = 0; // 6 arguments passed in GPRs. static constexpr size_t kNumNativeFprArgs = 0; // 8 arguments passed in FPRs. - // update these - static constexpr size_t kGprStackOffset = 4336; - static constexpr size_t kFprStackOffset = 4336 - 6*8; - static constexpr size_t kCallStackStackOffset = 4336 - 112; - static constexpr size_t kRegistersNeededForLong = 2; static constexpr size_t kRegistersNeededForDouble = 2; + static constexpr bool kMultiRegistersAligned = true; + static constexpr bool kMultiRegistersWidened = true; + static constexpr bool kAlignLongOnStack = false; + static constexpr bool kAlignDoubleOnStack = false; #elif defined(__i386__) // TODO: Check these! - static constexpr bool kNativeSoftFloatAbi = true; // This is a soft float ABI. + static constexpr bool kNativeSoftFloatAbi = false; // Not using int registers for fp static constexpr size_t kNumNativeGprArgs = 0; // 6 arguments passed in GPRs. static constexpr size_t kNumNativeFprArgs = 0; // 8 arguments passed in FPRs. - // update these - static constexpr size_t kGprStackOffset = 4336; - static constexpr size_t kFprStackOffset = 4336 - 6*8; - static constexpr size_t kCallStackStackOffset = 4336 - 112; - static constexpr size_t kRegistersNeededForLong = 2; static constexpr size_t kRegistersNeededForDouble = 2; + static constexpr bool kMultiRegistersAligned = false; // x86 not using regs, anyways + static constexpr bool kMultiRegistersWidened = false; + static constexpr bool kAlignLongOnStack = false; + static constexpr bool kAlignDoubleOnStack = false; #elif defined(__x86_64__) static constexpr bool kNativeSoftFloatAbi = false; // This is a hard float ABI. 
static constexpr size_t kNumNativeGprArgs = 6; // 6 arguments passed in GPRs. static constexpr size_t kNumNativeFprArgs = 8; // 8 arguments passed in FPRs. - static constexpr size_t kGprStackOffset = 4336; - static constexpr size_t kFprStackOffset = 4336 - 6*8; - static constexpr size_t kCallStackStackOffset = 4336 - 112; - static constexpr size_t kRegistersNeededForLong = 1; static constexpr size_t kRegistersNeededForDouble = 1; + static constexpr bool kMultiRegistersAligned = false; + static constexpr bool kMultiRegistersWidened = true; + static constexpr bool kAlignLongOnStack = false; + static constexpr bool kAlignDoubleOnStack = false; #else #error "Unsupported architecture" #endif + public: + explicit BuildGenericJniFrameStateMachine(T* delegate) : gpr_index_(kNumNativeGprArgs), + fpr_index_(kNumNativeFprArgs), + stack_entries_(0), + delegate_(delegate) { + // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff + // the next register is even; counting down is just to make the compiler happy... + CHECK_EQ(kNumNativeGprArgs % 2, 0U); + CHECK_EQ(kNumNativeFprArgs % 2, 0U); + } + + virtual ~BuildGenericJniFrameStateMachine() {} + + bool HavePointerGpr() { + return gpr_index_ > 0; + } + + void AdvancePointer(void* val) { + if (HavePointerGpr()) { + gpr_index_--; + PushGpr(reinterpret_cast<uintptr_t>(val)); + } else { + stack_entries_++; // TODO: have a field for pointer length as multiple of 32b + PushStack(reinterpret_cast<uintptr_t>(val)); + gpr_index_ = 0; + } + } + + + bool HaveSirtGpr() { + return gpr_index_ > 0; + } + + void AdvanceSirt(mirror::Object* ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + uintptr_t sirtRef; + if (ptr != nullptr) { + sirtRef = PushSirt(ptr); + } else { + sirtRef = reinterpret_cast<uintptr_t>(nullptr); + } + if (HaveSirtGpr()) { + gpr_index_--; + PushGpr(sirtRef); + } else { + stack_entries_++; + PushStack(sirtRef); + gpr_index_ = 0; + } + } + + + bool HaveIntGpr() { + return gpr_index_ > 0; + } + + void AdvanceInt(uint32_t val) { + if (HaveIntGpr()) { + gpr_index_--; + PushGpr(val); + } else { + stack_entries_++; + PushStack(val); + gpr_index_ = 0; + } + } + + + bool HaveLongGpr() { + return gpr_index_ >= kRegistersNeededForLong + (LongGprNeedsPadding() ? 
1 : 0); + } + + bool LongGprNeedsPadding() { + return kRegistersNeededForLong > 1 && // only pad when using multiple registers + kAlignLongOnStack && // and when it needs alignment + (gpr_index_ & 1) == 1; // counter is odd, see constructor + } + + bool LongStackNeedsPadding() { + return kRegistersNeededForLong > 1 && // only pad when using multiple registers + kAlignLongOnStack && // and when it needs 8B alignment + (stack_entries_ & 1) == 1; // counter is odd + } + + void AdvanceLong(uint64_t val) { + if (HaveLongGpr()) { + if (LongGprNeedsPadding()) { + PushGpr(0); + gpr_index_--; + } + if (kRegistersNeededForLong == 1) { + PushGpr(static_cast<uintptr_t>(val)); + } else { + PushGpr(static_cast<uintptr_t>(val & 0xFFFFFFFF)); + PushGpr(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF)); + } + gpr_index_ -= kRegistersNeededForLong; + } else { + if (LongStackNeedsPadding()) { + PushStack(0); + stack_entries_++; + } + if (kRegistersNeededForLong == 1) { + PushStack(static_cast<uintptr_t>(val)); + stack_entries_++; + } else { + PushStack(static_cast<uintptr_t>(val & 0xFFFFFFFF)); + PushStack(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF)); + stack_entries_ += 2; + } + gpr_index_ = 0; + } + } + + + bool HaveFloatFpr() { + return fpr_index_ > 0; + } + + // TODO: please review this bit representation retrieving. + template <typename U, typename V> V convert(U in) { + CHECK_LE(sizeof(U), sizeof(V)); + union { U u; V v; } tmp; + tmp.u = in; + return tmp.v; + } + + void AdvanceFloat(float val) { + if (kNativeSoftFloatAbi) { + AdvanceInt(convert<float, uint32_t>(val)); + } else { + if (HaveFloatFpr()) { + fpr_index_--; + if (kRegistersNeededForDouble == 1) { + if (kMultiRegistersWidened) { + PushFpr8(convert<double, uint64_t>(val)); + } else { + // No widening, just use the bits. + PushFpr8(convert<float, uint64_t>(val)); + } + } else { + PushFpr4(val); + } + } else { + stack_entries_++; + if (kRegistersNeededForDouble == 1 && kMultiRegistersWidened) { + // Need to widen before storing: Note the "double" in the template instantiation. + PushStack(convert<double, uintptr_t>(val)); + } else { + PushStack(convert<float, uintptr_t>(val)); + } + fpr_index_ = 0; + } + } + } + + + bool HaveDoubleFpr() { + return fpr_index_ >= kRegistersNeededForDouble + (DoubleFprNeedsPadding() ? 
1 : 0); + } + + bool DoubleFprNeedsPadding() { + return kRegistersNeededForDouble > 1 && // only pad when using multiple registers + kAlignDoubleOnStack && // and when it needs alignment + (fpr_index_ & 1) == 1; // counter is odd, see constructor + } + + bool DoubleStackNeedsPadding() { + return kRegistersNeededForDouble > 1 && // only pad when using multiple registers + kAlignDoubleOnStack && // and when it needs 8B alignment + (stack_entries_ & 1) == 1; // counter is odd + } + + void AdvanceDouble(uint64_t val) { + if (kNativeSoftFloatAbi) { + AdvanceLong(val); + } else { + if (HaveDoubleFpr()) { + if (DoubleFprNeedsPadding()) { + PushFpr4(0); + fpr_index_--; + } + PushFpr8(val); + fpr_index_ -= kRegistersNeededForDouble; + } else { + if (DoubleStackNeedsPadding()) { + PushStack(0); + stack_entries_++; + } + if (kRegistersNeededForDouble == 1) { + PushStack(static_cast<uintptr_t>(val)); + stack_entries_++; + } else { + PushStack(static_cast<uintptr_t>(val & 0xFFFFFFFF)); + PushStack(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF)); + stack_entries_ += 2; + } + fpr_index_ = 0; + } + } + } + + uint32_t getStackEntries() { + return stack_entries_; + } + + uint32_t getNumberOfUsedGprs() { + return kNumNativeGprArgs - gpr_index_; + } + + uint32_t getNumberOfUsedFprs() { + return kNumNativeFprArgs - fpr_index_; + } + + private: + void PushGpr(uintptr_t val) { + delegate_->PushGpr(val); + } + void PushFpr4(float val) { + delegate_->PushFpr4(val); + } + void PushFpr8(uint64_t val) { + delegate_->PushFpr8(val); + } + void PushStack(uintptr_t val) { + delegate_->PushStack(val); + } + uintptr_t PushSirt(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return delegate_->PushSirt(ref); + } + + uint32_t gpr_index_; // Number of free GPRs + uint32_t fpr_index_; // Number of free FPRs + uint32_t stack_entries_; // Stack entries are in multiples of 32b, as floats are usually not + // extended + T* delegate_; // What Push implementation gets called +}; +class ComputeGenericJniFrameSize FINAL { + public: + ComputeGenericJniFrameSize() : num_sirt_references_(0), num_stack_entries_(0) {} + + // (negative) offset from SP to top of Sirt. + uint32_t GetSirtOffset() { + return 8; + } + + uint32_t GetFirstSirtEntryOffset() { + return GetSirtOffset() + sizeof(StackReference<mirror::Object>); + } + + uint32_t GetNumSirtReferences() { + return num_sirt_references_; + } + + uint32_t GetStackSize() { + return num_stack_entries_ * sizeof(uintptr_t); + } + + void ComputeLayout(bool is_static, const char* shorty, uint32_t shorty_len, void* sp, + StackReference<mirror::Object>** start_sirt, StackIndirectReferenceTable** table, + uint32_t* sirt_entries, uintptr_t** start_stack, uintptr_t** start_gpr, + uint32_t** start_fpr, void** code_return, size_t* overall_size) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + ComputeAll(is_static, shorty, shorty_len); + + uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp); + *start_sirt = reinterpret_cast<StackReference<mirror::Object>*>(sp8-GetFirstSirtEntryOffset()); + + // Add padding entries if necessary for alignment. 
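    // Example with 32-bit pointers (sizeof(uintptr_t) == 4): three references give
    // size = 12 and rem = 12 % 8 = 4, so one dummy reference is added and the
    // reference area grows to 16 bytes, a multiple of 8. On 64-bit targets
    // sizeof(uintptr_t) == sizeof(uint64_t) and the guard below is never taken.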
+ if (sizeof(uintptr_t) < sizeof(uint64_t)) { + uint32_t size = sizeof(uintptr_t) * num_sirt_references_; + uint32_t rem = size % 8; + if (rem != 0) { + DCHECK_EQ(rem, 4U); + num_sirt_references_++; + } + } + *sirt_entries = num_sirt_references_; + size_t sirt_size = StackIndirectReferenceTable::SizeOf(num_sirt_references_); + sp8 -= GetSirtOffset() + sirt_size; + *table = reinterpret_cast<StackIndirectReferenceTable*>(sp8); + + sp8 -= GetStackSize(); + // Now align the call stack under the Sirt. This aligns by 16. + uintptr_t mask = ~0x0F; + sp8 = reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(sp8) & mask); + *start_stack = reinterpret_cast<uintptr_t*>(sp8); + + // put fprs and gprs below + // Assumption is OK right now, as we have soft-float arm + size_t fregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeFprArgs; + sp8 -= fregs * sizeof(uintptr_t); + *start_fpr = reinterpret_cast<uint32_t*>(sp8); + size_t iregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeGprArgs; + sp8 -= iregs * sizeof(uintptr_t); + *start_gpr = reinterpret_cast<uintptr_t*>(sp8); + + // reserve space for the code pointer + sp8 -= sizeof(void*); + *code_return = reinterpret_cast<void*>(sp8); + + *overall_size = reinterpret_cast<uint8_t*>(sp) - sp8; + } + + void ComputeSirtOffset() { } // nothing to do, static right now + + void ComputeAll(bool is_static, const char* shorty, uint32_t shorty_len) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize> sm(this); + + // JNIEnv + sm.AdvancePointer(nullptr); + + // Class object or this as first argument + sm.AdvanceSirt(reinterpret_cast<mirror::Object*>(0x12345678)); + + for (uint32_t i = 1; i < shorty_len; ++i) { + Primitive::Type cur_type_ = Primitive::GetType(shorty[i]); + switch (cur_type_) { + case Primitive::kPrimNot: + sm.AdvanceSirt(reinterpret_cast<mirror::Object*>(0x12345678)); + break; + + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + sm.AdvanceInt(0); + break; + case Primitive::kPrimFloat: + sm.AdvanceFloat(0); + break; + case Primitive::kPrimDouble: + sm.AdvanceDouble(0); + break; + case Primitive::kPrimLong: + sm.AdvanceLong(0); + break; + default: + LOG(FATAL) << "Unexpected type: " << cur_type_ << " in " << shorty; + } + } + + num_stack_entries_ = sm.getStackEntries(); + } + + void PushGpr(uintptr_t /* val */) { + // not optimizing registers, yet + } + + void PushFpr4(float /* val */) { + // not optimizing registers, yet + } + + void PushFpr8(uint64_t /* val */) { + // not optimizing registers, yet + } + + void PushStack(uintptr_t /* val */) { + // counting is already done in the superclass + } + + uintptr_t PushSirt(mirror::Object* /* ptr */) { + num_sirt_references_++; + return reinterpret_cast<uintptr_t>(nullptr); + } + + private: + uint32_t num_sirt_references_; + uint32_t num_stack_entries_; +}; + +// Visits arguments on the stack placing them into a region lower down the stack for the benefit +// of transitioning into native code. 
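// BuildGenericJniFrameVisitor is the second delegate of
// BuildGenericJniFrameStateMachine (ComputeGenericJniFrameSize above is the
// first, counting-only delegate): the state machine decides whether each
// argument lands in a GPR, an FPR or on the stack and handles padding and
// widening, while this delegate's PushGpr/PushFpr4/PushFpr8/PushStack/PushSirt
// callbacks write the actual values into the areas laid out by ComputeLayout().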
+class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { public: BuildGenericJniFrameVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty, uint32_t shorty_len, Thread* self) : - QuickArgumentVisitor(sp, is_static, shorty, shorty_len) { - // size of cookie plus padding - uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp); - top_of_sirt_ = sp8 - 8; - cur_sirt_entry_ = reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1; + QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sm_(this) { + ComputeGenericJniFrameSize fsc; + fsc.ComputeLayout(is_static, shorty, shorty_len, sp, &cur_sirt_entry_, &sirt_, + &sirt_expected_refs_, &cur_stack_arg_, &cur_gpr_reg_, &cur_fpr_reg_, + &code_return_, &alloca_used_size_); sirt_number_of_references_ = 0; - gpr_index_ = kNumNativeGprArgs; - fpr_index_ = kNumNativeFprArgs; - - cur_gpr_reg_ = reinterpret_cast<uintptr_t*>(sp8 - kGprStackOffset); - cur_fpr_reg_ = reinterpret_cast<uint32_t*>(sp8 - kFprStackOffset); - cur_stack_arg_ = reinterpret_cast<uintptr_t*>(sp8 - kCallStackStackOffset); + top_of_sirt_ = cur_sirt_entry_; // jni environment is always first argument - PushPointer(self->GetJniEnv()); + sm_.AdvancePointer(self->GetJniEnv()); if (is_static) { - PushArgumentInSirt((*sp)->GetDeclaringClass()); + sm_.AdvanceSirt((*sp)->GetDeclaringClass()); } } @@ -911,7 +1319,7 @@ class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { } else { long_arg = *reinterpret_cast<jlong*>(GetParamAddress()); } - PushLongArgument(long_arg); + sm_.AdvanceLong(long_arg); break; } case Primitive::kPrimDouble: { @@ -922,24 +1330,24 @@ class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { } else { double_arg = *reinterpret_cast<uint64_t*>(GetParamAddress()); } - PushDoubleArgument(double_arg); + sm_.AdvanceDouble(double_arg); break; } case Primitive::kPrimNot: { StackReference<mirror::Object>* stack_ref = reinterpret_cast<StackReference<mirror::Object>*>(GetParamAddress()); - PushArgumentInSirt(stack_ref->AsMirrorPtr()); + sm_.AdvanceSirt(stack_ref->AsMirrorPtr()); break; } case Primitive::kPrimFloat: - PushFloatArgument(*reinterpret_cast<int32_t*>(GetParamAddress())); + sm_.AdvanceFloat(*reinterpret_cast<float*>(GetParamAddress())); break; case Primitive::kPrimBoolean: // Fall-through. case Primitive::kPrimByte: // Fall-through. case Primitive::kPrimChar: // Fall-through. case Primitive::kPrimShort: // Fall-through. case Primitive::kPrimInt: // Fall-through. - PushIntArgument(*reinterpret_cast<jint*>(GetParamAddress())); + sm_.AdvanceInt(*reinterpret_cast<jint*>(GetParamAddress())); break; case Primitive::kPrimVoid: LOG(FATAL) << "UNREACHABLE"; @@ -948,149 +1356,87 @@ class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor { } void FinalizeSirt(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - if (!IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_))) { - sirt_number_of_references_++; + // Initialize padding entries. 
+ while (sirt_number_of_references_ < sirt_expected_refs_) { *cur_sirt_entry_ = StackReference<mirror::Object>(); cur_sirt_entry_--; + sirt_number_of_references_++; } - CHECK(IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_))); - StackIndirectReferenceTable* sirt = reinterpret_cast<StackIndirectReferenceTable*>( - top_of_sirt_ - StackIndirectReferenceTable::SizeOf(sirt_number_of_references_)); + sirt_->SetNumberOfReferences(sirt_expected_refs_); - sirt->SetNumberOfReferences(sirt_number_of_references_); - self->PushSirt(sirt); + // Install Sirt. + self->PushSirt(sirt_); } jobject GetFirstSirtEntry() { - return reinterpret_cast<jobject>(reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1); + return reinterpret_cast<jobject>(top_of_sirt_); } - private: - void PushArgumentInSirt(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - // Do something to push into the SIRT. - uintptr_t sirt_or_null; - if (obj != nullptr) { - sirt_number_of_references_++; - *cur_sirt_entry_ = StackReference<mirror::Object>::FromMirrorPtr(obj); - sirt_or_null = reinterpret_cast<uintptr_t>(cur_sirt_entry_); - cur_sirt_entry_--; - } else { - sirt_or_null = reinterpret_cast<uintptr_t>(nullptr); - } - // Push the GPR or stack arg. - if (gpr_index_ > 0) { - *cur_gpr_reg_ = sirt_or_null; - cur_gpr_reg_++; - gpr_index_--; - } else { - *cur_stack_arg_ = sirt_or_null; - cur_stack_arg_++; - } + void PushGpr(uintptr_t val) { + *cur_gpr_reg_ = val; + cur_gpr_reg_++; } - void PushPointer(void* val) { - if (gpr_index_ > 0) { - *cur_gpr_reg_ = reinterpret_cast<uintptr_t>(val); - cur_gpr_reg_++; - gpr_index_--; - } else { - *cur_stack_arg_ = reinterpret_cast<uintptr_t>(val); - cur_stack_arg_++; - } + void PushFpr4(float val) { + *cur_fpr_reg_ = val; + cur_fpr_reg_++; } - void PushIntArgument(jint val) { - if (gpr_index_ > 0) { - *cur_gpr_reg_ = val; - cur_gpr_reg_++; - gpr_index_--; - } else { - *cur_stack_arg_ = val; - cur_stack_arg_++; - } + void PushFpr8(uint64_t val) { + uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_fpr_reg_); + *tmp = val; + cur_fpr_reg_ += 2; } - void PushLongArgument(jlong val) { - // This is an ugly hack for the following problem: - // Assume odd number of 32b registers. Then having exactly kRegsNeeded left needs to spill! - if (gpr_index_ >= kRegistersNeededForLong + (kNumNativeGprArgs % kRegistersNeededForLong)) { - if (kRegistersNeededForLong > 1 && ((kNumNativeGprArgs - gpr_index_) & 1) == 1) { - // Pad. - gpr_index_--; - cur_gpr_reg_++; - } - uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_gpr_reg_); - *tmp = val; - cur_gpr_reg_ += kRegistersNeededForLong; - gpr_index_ -= kRegistersNeededForLong; - } else { - uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_); - *tmp = val; - cur_stack_arg_ += kRegistersNeededForLong; - - gpr_index_ = 0; // can't use GPRs anymore - } + void PushStack(uintptr_t val) { + *cur_stack_arg_ = val; + cur_stack_arg_++; } - void PushFloatArgument(int32_t val) { - if (kNativeSoftFloatAbi) { - PushIntArgument(val); - } else { - if (fpr_index_ > 0) { - *cur_fpr_reg_ = val; - cur_fpr_reg_++; - if (kRegistersNeededForDouble == 1) { - // will pop 64 bits from the stack - // TODO: extend/clear bits??? - cur_fpr_reg_++; - } - fpr_index_--; - } else { - // TODO: Check ABI for floats. 
- *cur_stack_arg_ = val; - cur_stack_arg_++; - } - } + uintptr_t PushSirt(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + *cur_sirt_entry_ = StackReference<mirror::Object>::FromMirrorPtr(ref); + uintptr_t tmp = reinterpret_cast<uintptr_t>(cur_sirt_entry_); + cur_sirt_entry_--; + sirt_number_of_references_++; + return tmp; } - void PushDoubleArgument(uint64_t val) { - // See PushLongArgument for explanation - if (fpr_index_ >= kRegistersNeededForDouble + (kNumNativeFprArgs % kRegistersNeededForDouble)) { - if (kRegistersNeededForDouble > 1 && ((kNumNativeFprArgs - fpr_index_) & 1) == 1) { - // Pad. - fpr_index_--; - cur_fpr_reg_++; - } - uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_fpr_reg_); - *tmp = val; - // TODO: the whole thing doesn't make sense if we take uint32_t*... - cur_fpr_reg_ += 2; // kRegistersNeededForDouble; - fpr_index_ -= kRegistersNeededForDouble; - } else { - if (!IsAligned<8>(cur_stack_arg_)) { - cur_stack_arg_++; // Pad. - } - uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_); - *tmp = val; - cur_stack_arg_ += kRegistersNeededForDouble; + // Size of the part of the alloca that we actually need. + size_t GetAllocaUsedSize() { + return alloca_used_size_; + } - fpr_index_ = 0; // can't use FPRs anymore - } + void* GetCodeReturn() { + return code_return_; } + private: uint32_t sirt_number_of_references_; StackReference<mirror::Object>* cur_sirt_entry_; - uint32_t gpr_index_; // should be uint, but gives error because on some archs no regs + StackIndirectReferenceTable* sirt_; + uint32_t sirt_expected_refs_; uintptr_t* cur_gpr_reg_; - uint32_t fpr_index_; // ----- # ----- uint32_t* cur_fpr_reg_; uintptr_t* cur_stack_arg_; - uint8_t* top_of_sirt_; + StackReference<mirror::Object>* top_of_sirt_; + void* code_return_; + size_t alloca_used_size_; + + BuildGenericJniFrameStateMachine<BuildGenericJniFrameVisitor> sm_; DISALLOW_COPY_AND_ASSIGN(BuildGenericJniFrameVisitor); }; -extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMethod** sp) +/* + * Initializes an alloca region assumed to be directly below sp for a native call: + * Create a Sirt and call stack and fill a mini stack with values to be pushed to registers. + * The final element on the stack is a pointer to the native code. + * + * The return of this function denotes: + * 1) How many bytes of the alloca can be released, if the value is non-negative. + * 2) An error, if the value is negative. + */ +extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp); mirror::ArtMethod* called = *sp; @@ -1098,6 +1444,7 @@ extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMet // run the visitor MethodHelper mh(called); + BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(), mh.GetShortyLength(), self); visitor.VisitArguments(); @@ -1110,10 +1457,10 @@ extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMet uint32_t cookie; if (called->IsSynchronized()) { cookie = JniMethodStartSynchronized(visitor.GetFirstSirtEntry(), self); - // TODO: error checking. if (self->IsExceptionPending()) { self->PopSirt(); - return nullptr; + // A negative value denotes an error. 
+ return -1; } } else { cookie = JniMethodStart(self); @@ -1127,7 +1474,12 @@ extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMet LOG(FATAL) << "Finding native code not implemented yet."; } - return nativeCode; + uintptr_t* code_pointer = reinterpret_cast<uintptr_t*>(visitor.GetCodeReturn()); + size_t window_size = visitor.GetAllocaUsedSize(); + *code_pointer = reinterpret_cast<uintptr_t>(nativeCode); + + // 5K reserved, window_size used. + return 5*1024 - window_size; } /* @@ -1141,27 +1493,30 @@ extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, mirror::ArtMet mirror::ArtMethod* called = *sp; uint32_t cookie = *(sp32-1); - // TODO: synchronized. MethodHelper mh(called); char return_shorty_char = mh.GetShorty()[0]; if (return_shorty_char == 'L') { // the only special ending call if (called->IsSynchronized()) { - BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(), - mh.GetShortyLength(), self); - return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie, - visitor.GetFirstSirtEntry(), + ComputeGenericJniFrameSize fsc; + fsc.ComputeSirtOffset(); + uint32_t offset = fsc.GetFirstSirtEntryOffset(); + jobject tmp = reinterpret_cast<jobject>(reinterpret_cast<uint8_t*>(sp)-offset); + + return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie, tmp, self)); } else { return reinterpret_cast<uint64_t>(JniMethodEndWithReference(result.l, cookie, self)); } } else { if (called->IsSynchronized()) { - // run the visitor - BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(), - mh.GetShortyLength(), self); - JniMethodEndSynchronized(cookie, visitor.GetFirstSirtEntry(), self); + ComputeGenericJniFrameSize fsc; + fsc.ComputeSirtOffset(); + uint32_t offset = fsc.GetFirstSirtEntryOffset(); + jobject tmp = reinterpret_cast<jobject>(reinterpret_cast<uint8_t*>(sp)-offset); + + JniMethodEndSynchronized(cookie, tmp, self); } else { JniMethodEnd(cookie, self); } diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h index bb4d1d7326..8b7bfd35ef 100644 --- a/runtime/gc/accounting/card_table.h +++ b/runtime/gc/accounting/card_table.h @@ -17,8 +17,8 @@ #ifndef ART_RUNTIME_GC_ACCOUNTING_CARD_TABLE_H_ #define ART_RUNTIME_GC_ACCOUNTING_CARD_TABLE_H_ +#include "base/mutex.h" #include "globals.h" -#include "locks.h" #include "mem_map.h" #include "UniquePtr.h" diff --git a/runtime/gc/accounting/heap_bitmap.h b/runtime/gc/accounting/heap_bitmap.h index dde1425abf..7cfeb63e25 100644 --- a/runtime/gc/accounting/heap_bitmap.h +++ b/runtime/gc/accounting/heap_bitmap.h @@ -19,7 +19,6 @@ #include "base/logging.h" #include "gc_allocator.h" -#include "locks.h" #include "object_callbacks.h" #include "space_bitmap.h" diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc index 06127c11b9..887192183d 100644 --- a/runtime/gc/accounting/mod_union_table.cc +++ b/runtime/gc/accounting/mod_union_table.cc @@ -175,7 +175,6 @@ class CheckReferenceVisitor { } // Extra parameters are required since we use this same visitor signature for checking objects. - // TODO: Fixme when anotatalysis works with visitors. 
void operator()(Object* obj, Object* ref, const MemberOffset& /* offset */, bool /* is_static */) const SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h index 3c4b674fcd..5fd2bce8af 100644 --- a/runtime/gc/accounting/space_bitmap.h +++ b/runtime/gc/accounting/space_bitmap.h @@ -17,9 +17,9 @@ #ifndef ART_RUNTIME_GC_ACCOUNTING_SPACE_BITMAP_H_ #define ART_RUNTIME_GC_ACCOUNTING_SPACE_BITMAP_H_ +#include "base/mutex.h" #include "gc_allocator.h" #include "globals.h" -#include "locks.h" #include "mem_map.h" #include "object_callbacks.h" #include "UniquePtr.h" @@ -248,8 +248,7 @@ class ObjectSet { contained_ = space_set.contained_; } - void Walk(ObjectCallback* callback, void* arg) - SHARED_LOCKS_REQUIRED(GlobalSynchronization::heap_bitmap_lock_); + void Walk(ObjectCallback* callback, void* arg) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); template <typename Visitor> void Visit(const Visitor& visitor) NO_THREAD_SAFETY_ANALYSIS { diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h index 088f1d4581..8d401b8812 100644 --- a/runtime/gc/collector/garbage_collector.h +++ b/runtime/gc/collector/garbage_collector.h @@ -18,10 +18,10 @@ #define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_ #include "base/histogram.h" +#include "base/mutex.h" #include "base/timing_logger.h" #include "gc/gc_cause.h" #include "gc_type.h" -#include "locks.h" #include <stdint.h> #include <vector> diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index 4aff68a569..71424bd886 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -1347,9 +1347,6 @@ void MarkSweep::FinishPhase() { timings_.NewSplit("PostGcVerification"); heap->PostGcVerification(this); - timings_.NewSplit("RequestHeapTrim"); - heap->RequestHeapTrim(); - // Update the cumulative statistics total_freed_objects_ += GetFreedObjects() + GetFreedLargeObjects(); total_freed_bytes_ += GetFreedBytes() + GetFreedLargeObjectBytes(); diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h index 5c0a233375..8d40c34f28 100644 --- a/runtime/gc/collector/mark_sweep.h +++ b/runtime/gc/collector/mark_sweep.h @@ -114,7 +114,7 @@ class MarkSweep : public GarbageCollector { EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - bool IsImmuneSpace(const space::ContinuousSpace* space) const; + bool IsImmuneSpace(const space::ContinuousSpace* space) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); // Bind the live bits to the mark bits of bitmaps for spaces that are never collected, ie @@ -152,6 +152,7 @@ class MarkSweep : public GarbageCollector { // Sweep only pointers within an array. WARNING: Trashes objects. void SweepArray(accounting::ObjectStack* allocation_stack_, bool swap_bitmaps) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); // Blackens an object. 
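The lock annotations added and tightened throughout this change (GUARDED_BY, SHARED_LOCKS_REQUIRED, EXCLUSIVE_LOCKS_REQUIRED, ACQUIRED_AFTER, NO_THREAD_SAFETY_ANALYSIS) are checked statically by Clang's -Wthread-safety analysis. Below is a minimal standalone sketch of what that buys, written against the raw Clang attributes these macros are assumed to expand to; the class and member names are illustrative only, not ART code.

// thread_safety_sketch.cc -- compile with: clang++ -std=c++11 -Wthread-safety -c thread_safety_sketch.cc
#include <pthread.h>

class __attribute__((lockable)) SketchMutex {
 public:
  SketchMutex() { pthread_mutex_init(&mu_, nullptr); }
  ~SketchMutex() { pthread_mutex_destroy(&mu_); }
  void Lock() __attribute__((exclusive_lock_function)) { pthread_mutex_lock(&mu_); }
  void Unlock() __attribute__((unlock_function)) { pthread_mutex_unlock(&mu_); }

 private:
  pthread_mutex_t mu_;
};

class AllocCounter {
 public:
  void Increment() {
    lock_.Lock();
    ++count_;  // OK: the analysis sees lock_ held here.
    lock_.Unlock();
  }

  // Callers must already hold lock_, in the spirit of Dbg::HeadIndex() above.
  size_t CountLocked() __attribute__((exclusive_locks_required(lock_))) {
    return count_;
  }

  size_t CountRacy() {
    return count_;  // Flagged by -Wthread-safety: reading count_ requires holding lock_.
  }

 private:
  SketchMutex lock_;
  size_t count_ __attribute__((guarded_by(lock_))) = 0;
};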
diff --git a/runtime/gc/collector/partial_mark_sweep.h b/runtime/gc/collector/partial_mark_sweep.h index 44ae9e9296..ac0d068194 100644 --- a/runtime/gc/collector/partial_mark_sweep.h +++ b/runtime/gc/collector/partial_mark_sweep.h @@ -17,7 +17,6 @@ #ifndef ART_RUNTIME_GC_COLLECTOR_PARTIAL_MARK_SWEEP_H_ #define ART_RUNTIME_GC_COLLECTOR_PARTIAL_MARK_SWEEP_H_ -#include "locks.h" #include "mark_sweep.h" namespace art { diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc index a577f909de..2da360f3a0 100644 --- a/runtime/gc/collector/semi_space.cc +++ b/runtime/gc/collector/semi_space.cc @@ -678,13 +678,14 @@ void SemiSpace::DelayReferenceReferent(mirror::Class* klass, Object* obj) { heap_->DelayReferenceReferent(klass, obj, MarkedForwardingAddressCallback, this); } -// Visit all of the references of an object and update. -void SemiSpace::ScanObject(Object* obj) { - DCHECK(obj != NULL); - DCHECK(!from_space_->HasAddress(obj)) << "Scanning object " << obj << " in from space"; - MarkSweep::VisitObjectReferences(obj, [this](Object* obj, Object* ref, const MemberOffset& offset, - bool /* is_static */) ALWAYS_INLINE_LAMBDA NO_THREAD_SAFETY_ANALYSIS { - mirror::Object* new_address = MarkObject(ref); +class SemiSpaceMarkObjectVisitor { + public: + explicit SemiSpaceMarkObjectVisitor(SemiSpace* semi_space) : semi_space_(semi_space) { + } + + void operator()(Object* obj, Object* ref, const MemberOffset& offset, bool /* is_static */) + const ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS /* EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) */ { + mirror::Object* new_address = semi_space_->MarkObject(ref); if (new_address != ref) { DCHECK(new_address != nullptr); // Don't need to mark the card since we updating the object address and not changing the @@ -694,7 +695,17 @@ void SemiSpace::ScanObject(Object* obj) { // disable check as we could run inside a transaction. obj->SetFieldObjectWithoutWriteBarrier<false, false, kVerifyNone>(offset, new_address, false); } - }, kMovingClasses); + } + private: + SemiSpace* const semi_space_; +}; + +// Visit all of the references of an object and update. +void SemiSpace::ScanObject(Object* obj) { + DCHECK(obj != NULL); + DCHECK(!from_space_->HasAddress(obj)) << "Scanning object " << obj << " in from space"; + SemiSpaceMarkObjectVisitor visitor(this); + MarkSweep::VisitObjectReferences(obj, visitor, kMovingClasses); mirror::Class* klass = obj->GetClass<kVerifyNone>(); if (UNLIKELY(klass->IsReferenceClass<kVerifyNone>())) { DelayReferenceReferent(klass, obj); diff --git a/runtime/gc/collector/sticky_mark_sweep.h b/runtime/gc/collector/sticky_mark_sweep.h index 98f2b59243..934b1bd368 100644 --- a/runtime/gc/collector/sticky_mark_sweep.h +++ b/runtime/gc/collector/sticky_mark_sweep.h @@ -18,7 +18,6 @@ #define ART_RUNTIME_GC_COLLECTOR_STICKY_MARK_SWEEP_H_ #include "base/macros.h" -#include "locks.h" #include "partial_mark_sweep.h" namespace art { @@ -43,7 +42,9 @@ class StickyMarkSweep FINAL : public PartialMarkSweep { SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); - void Sweep(bool swap_bitmaps) OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); + void Sweep(bool swap_bitmaps) OVERRIDE + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); // Don't need to do anything special here since we scan all the cards which may have references // to the newly allocated objects. 
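The heap-inl.h hunk below changes CheckConcurrentGC() to take mirror::Object** and to reload the pointer from a SirtRef after RequestConcurrentGC(), because that call is a safepoint and a moving collector may relocate the newly allocated object. Here is a self-contained toy sketch of that reload-after-safepoint pattern; every type and function in it is illustrative, not ART's API.

#include <cstddef>
#include <vector>

struct Object { int payload; };

// Toy root table standing in for a thread's SIRT: the collector rewrites the
// slots it knows about when it moves objects, so reading a slot after a
// safepoint yields the object's current address.
struct RootTable {
  std::vector<Object*> slots;
  size_t Register(Object* obj) { slots.push_back(obj); return slots.size() - 1; }
  Object* Read(size_t slot) const { return slots[slot]; }
  void Unregister(size_t slot) { slots.resize(slot); }
};

// Pretend a moving collector ran at a safepoint and relocated every rooted object.
void FakeMovingGc(RootTable* roots, Object* new_location) {
  for (Object*& slot : roots->slots) {
    slot = new_location;
  }
}

// Mirrors the shape of the CheckConcurrentGC() change: root the raw pointer,
// cross the safepoint, then reload the possibly updated address.
void SlowPathThatMayMove(RootTable* roots, Object** obj, Object* new_location) {
  size_t slot = roots->Register(*obj);  // analogous to SirtRef<mirror::Object> ref(self, *obj)
  FakeMovingGc(roots, new_location);    // analogous to RequestConcurrentGC(self)
  *obj = roots->Read(slot);             // analogous to *obj = ref.get()
  roots->Unregister(slot);
}

int main() {
  RootTable roots;
  Object before{1};
  Object after{1};
  Object* obj = &before;
  SlowPathThatMayMove(&roots, &obj, &after);
  return obj == &after ? 0 : 1;  // obj now points at the relocated copy
}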
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h index 89ded0b27f..b80e72e6cf 100644 --- a/runtime/gc/heap-inl.h +++ b/runtime/gc/heap-inl.h @@ -24,8 +24,8 @@ #include "gc/space/dlmalloc_space-inl.h" #include "gc/space/large_object_space.h" #include "gc/space/rosalloc_space-inl.h" -#include "object_utils.h" #include "runtime.h" +#include "sirt_ref-inl.h" #include "thread.h" #include "thread-inl.h" #include "verify_object-inl.h" @@ -37,7 +37,9 @@ template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor> inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Class* klass, size_t byte_count, AllocatorType allocator, const PreFenceVisitor& pre_fence_visitor) { - DebugCheckPreconditionsForAllocObject(klass, byte_count); + if (kIsDebugBuild) { + CheckPreconditionsForAllocObject(klass, byte_count); + } // Since allocation can cause a GC which will need to SuspendAll, make sure all allocations are // done in the runnable state where suspension is expected. DCHECK_EQ(self->GetState(), kRunnable); @@ -107,7 +109,7 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant since // the allocator_type should be constant propagated. if (AllocatorMayHaveConcurrentGC(allocator) && concurrent_gc_) { - CheckConcurrentGC(self, new_num_bytes_allocated, obj); + CheckConcurrentGC(self, new_num_bytes_allocated, &obj); } VerifyObject(obj); self->VerifyStack(); @@ -226,13 +228,6 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator return ret; } -inline void Heap::DebugCheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) { - DCHECK(c == NULL || (c->IsClassClass() && byte_count >= sizeof(mirror::Class)) || - (c->IsVariableSize() || c->GetObjectSize() == byte_count) || - strlen(ClassHelper(c).GetDescriptor()) == 0); - DCHECK_GE(byte_count, sizeof(mirror::Object)); -} - inline Heap::AllocationTimer::AllocationTimer(Heap* heap, mirror::Object** allocated_obj_ptr) : heap_(heap), allocated_obj_ptr_(allocated_obj_ptr) { if (kMeasureAllocationTime) { @@ -280,11 +275,13 @@ inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t } inline void Heap::CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated, - mirror::Object* obj) { + mirror::Object** obj) { if (UNLIKELY(new_num_bytes_allocated >= concurrent_start_bytes_)) { // The SirtRef is necessary since the calls in RequestConcurrentGC are a safepoint. - SirtRef<mirror::Object> ref(self, obj); + SirtRef<mirror::Object> ref(self, *obj); RequestConcurrentGC(self); + // Restore obj in case it moved. 
+ *obj = ref.get(); } } diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 87ee21bb86..87b4e60c82 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -90,6 +90,11 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max collector_type_(kCollectorTypeNone), post_zygote_collector_type_(post_zygote_collector_type), background_collector_type_(background_collector_type), + desired_collector_type_(collector_type_), + heap_trim_request_lock_(nullptr), + heap_trim_target_time_(0), + heap_transition_target_time_(0), + heap_trim_request_pending_(false), parallel_gc_threads_(parallel_gc_threads), conc_gc_threads_(conc_gc_threads), low_memory_mode_(low_memory_mode), @@ -127,7 +132,6 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max verify_mod_union_table_(false), verify_pre_gc_rosalloc_(verify_pre_gc_rosalloc), verify_post_gc_rosalloc_(verify_post_gc_rosalloc), - last_trim_time_ms_(0), allocation_rate_(0), /* For GC a lot mode, we limit the allocations stacks to be kGcAlotInterval allocations. This * causes a lot of GC since we do a GC for alloc whenever the stack is full. When heap @@ -160,16 +164,17 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max // If we aren't the zygote, switch to the default non zygote allocator. This may update the // entrypoints. if (!Runtime::Current()->IsZygote()) { - ChangeCollector(post_zygote_collector_type_); + desired_collector_type_ = post_zygote_collector_type_; large_object_threshold_ = kDefaultLargeObjectThreshold; } else { if (kMovingCollector) { // We are the zygote, use bump pointer allocation + semi space collector. - ChangeCollector(kCollectorTypeSS); + desired_collector_type_ = kCollectorTypeSS; } else { - ChangeCollector(post_zygote_collector_type_); + desired_collector_type_ = post_zygote_collector_type_; } } + ChangeCollector(desired_collector_type_); live_bitmap_.reset(new accounting::HeapBitmap(this)); mark_bitmap_.reset(new accounting::HeapBitmap(this)); @@ -274,7 +279,7 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max gc_complete_lock_ = new Mutex("GC complete lock"); gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable", *gc_complete_lock_)); - last_gc_time_ns_ = NanoTime(); + heap_trim_request_lock_ = new Mutex("Heap trim request lock"); last_gc_size_ = GetBytesAllocated(); if (ignore_max_footprint_) { @@ -318,6 +323,16 @@ void Heap::ChangeAllocator(AllocatorType allocator) { } } +void Heap::DisableCompaction() { + if (IsCompactingGC(post_zygote_collector_type_)) { + post_zygote_collector_type_ = kCollectorTypeCMS; + } + if (IsCompactingGC(background_collector_type_)) { + background_collector_type_ = post_zygote_collector_type_; + } + TransitionCollector(post_zygote_collector_type_); +} + std::string Heap::SafeGetClassDescriptor(mirror::Class* klass) { if (!IsValidContinuousSpaceObjectAddress(klass)) { return StringPrintf("<non heap address klass %p>", klass); @@ -442,12 +457,12 @@ void Heap::UpdateProcessState(ProcessState process_state) { if (process_state_ != process_state) { process_state_ = process_state; if (process_state_ == kProcessStateJankPerceptible) { - TransitionCollector(post_zygote_collector_type_); + // Transition back to foreground right away to prevent jank. + RequestHeapTransition(post_zygote_collector_type_, 0); } else { - TransitionCollector(background_collector_type_); + // Don't delay for debug builds since we may want to stress test the GC. 
+ RequestHeapTransition(background_collector_type_, kIsDebugBuild ? 0 : kHeapTransitionWait); } - } else { - CollectGarbageInternal(collector::kGcTypeFull, kGcCauseBackground, false); } } @@ -844,9 +859,40 @@ void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, bool large_obj self->ThrowOutOfMemoryError(oss.str().c_str()); } +void Heap::DoPendingTransitionOrTrim() { + Thread* self = Thread::Current(); + CollectorType desired_collector_type; + // Wait until we reach the desired transition time. + while (true) { + uint64_t wait_time; + { + MutexLock mu(self, *heap_trim_request_lock_); + desired_collector_type = desired_collector_type_; + uint64_t current_time = NanoTime(); + if (current_time >= heap_transition_target_time_) { + break; + } + wait_time = heap_transition_target_time_ - current_time; + } + ScopedThreadStateChange tsc(self, kSleeping); + usleep(wait_time / 1000); // Usleep takes microseconds. + } + // Transition the heap if the desired collector type is not the same as the current collector type. + TransitionCollector(desired_collector_type); + // Do a heap trim if it is needed. + Trim(); +} + void Heap::Trim() { Thread* self = Thread::Current(); { + MutexLock mu(self, *heap_trim_request_lock_); + if (!heap_trim_request_pending_ || NanoTime() < heap_trim_target_time_) { + return; + } + heap_trim_request_pending_ = false; + } + { // Need to do this before acquiring the locks since we don't want to get suspended while // holding any locks. ScopedThreadStateChange tsc(self, kWaitingForGcToComplete); @@ -1731,6 +1777,7 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCaus collector->Run(gc_cause, clear_soft_references); total_objects_freed_ever_ += collector->GetFreedObjects(); total_bytes_freed_ever_ += collector->GetFreedBytes(); + RequestHeapTrim(Heap::kHeapTrimWait); // Enqueue cleared references. EnqueueClearedReferences(); // Grow the heap so that we know when to perform the next GC. @@ -2493,7 +2540,20 @@ void Heap::ConcurrentGC(Thread* self) { } } -void Heap::RequestHeapTrim() { +void Heap::RequestHeapTransition(CollectorType desired_collector_type, uint64_t delta_time) { + Thread* self = Thread::Current(); + { + MutexLock mu(self, *heap_trim_request_lock_); + if (desired_collector_type_ == desired_collector_type) { + return; + } + heap_transition_target_time_ = std::max(heap_transition_target_time_, NanoTime() + delta_time); + desired_collector_type_ = desired_collector_type; + } + SignalHeapTrimDaemon(self); +} + +void Heap::RequestHeapTrim(uint64_t delta_time) { // GC completed and now we must decide whether to request a heap trim (advising pages back to the // kernel) or not. Issuing a request will also cause trimming of the libc heap. As a trim scans // a space it will hold its lock and can become a cause of jank. @@ -2506,11 +2566,6 @@ void Heap::RequestHeapTrim() { // to utilization (which is probably inversely proportional to how much benefit we can expect). // We could try mincore(2) but that's only a measure of how many pages we haven't given away, // not how much use we're making of those pages. - uint64_t ms_time = MilliTime(); - // Don't bother trimming the alloc space if a heap trim occurred in the last two seconds. - if (ms_time - last_trim_time_ms_ < 2 * 1000) { - return; - } Thread* self = Thread::Current(); Runtime* runtime = Runtime::Current(); @@ -2521,19 +2576,27 @@ void Heap::RequestHeapTrim() { return; } - last_trim_time_ms_ = ms_time; - - // Trim only if we do not currently care about pause times.
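The DoPendingTransitionOrTrim() and RequestHeapTransition()/RequestHeapTrim() hunks above split trimming into a request side (record a target time under heap_trim_request_lock_ and signal the daemon) and a daemon side (sleep until the target time, then transition and/or trim). A minimal standalone sketch of that hand-off, using std::mutex and std::chrono in place of ART's Mutex and NanoTime(); the DeferredTrimmer class and its members are illustrative names, not ART's:

#include <chrono>
#include <mutex>
#include <thread>

// Illustrative stand-in for the heap trim bookkeeping above; not ART code.
class DeferredTrimmer {
 public:
  using Clock = std::chrono::steady_clock;

  // Requester side (compare RequestHeapTrim): push the target time out and mark a trim pending.
  void RequestTrim(std::chrono::milliseconds delay) {
    std::lock_guard<std::mutex> mu(request_lock_);
    Clock::time_point target = Clock::now() + delay;
    if (target > target_time_) {
      target_time_ = target;
    }
    pending_ = true;
    // The real code would now signal the heap-trim daemon (SignalHeapTrimDaemon).
  }

  // Daemon side (compare DoPendingTransitionOrTrim): sleep until the target time, then act.
  void DoPendingTrim() {
    while (true) {
      Clock::duration wait_time;
      {
        std::lock_guard<std::mutex> mu(request_lock_);
        if (!pending_) {
          return;  // Nothing requested.
        }
        Clock::time_point now = Clock::now();
        if (now >= target_time_) {
          pending_ = false;
          break;
        }
        wait_time = target_time_ - now;  // Re-read each pass; a new request may push it out.
      }
      std::this_thread::sleep_for(wait_time);  // Sleep outside the lock, like the usleep() above.
    }
    Trim();
  }

 private:
  void Trim() { /* release unused pages back to the OS */ }

  std::mutex request_lock_;
  Clock::time_point target_time_{};
  bool pending_ = false;
};

In the patch itself the daemon side is the Java heap-trim daemon: VMRuntime_trimHeap() now calls DoPendingTransitionOrTrim() instead of Trim() directly.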
+ // Request a heap trim only if we do not currently care about pause times. if (!CareAboutPauseTimes()) { - JNIEnv* env = self->GetJniEnv(); - DCHECK(WellKnownClasses::java_lang_Daemons != NULL); - DCHECK(WellKnownClasses::java_lang_Daemons_requestHeapTrim != NULL); - env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, - WellKnownClasses::java_lang_Daemons_requestHeapTrim); - CHECK(!env->ExceptionCheck()); + { + MutexLock mu(self, *heap_trim_request_lock_); + heap_trim_target_time_ = std::max(heap_trim_target_time_, NanoTime() + delta_time); + heap_trim_request_pending_ = true; + } + // Notify the daemon thread which will actually do the heap trim. + SignalHeapTrimDaemon(self); } } +void Heap::SignalHeapTrimDaemon(Thread* self) { + JNIEnv* env = self->GetJniEnv(); + DCHECK(WellKnownClasses::java_lang_Daemons != nullptr); + DCHECK(WellKnownClasses::java_lang_Daemons_requestHeapTrim != nullptr); + env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, + WellKnownClasses::java_lang_Daemons_requestHeapTrim); + CHECK(!env->ExceptionCheck()); +} + void Heap::RevokeThreadLocalBuffers(Thread* thread) { if (rosalloc_space_ != nullptr) { rosalloc_space_->RevokeThreadLocalBuffers(thread); @@ -2645,5 +2708,12 @@ void Heap::AddModUnionTable(accounting::ModUnionTable* mod_union_table) { mod_union_tables_.Put(mod_union_table->GetSpace(), mod_union_table); } +void Heap::CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) { + CHECK(c == NULL || (c->IsClassClass() && byte_count >= sizeof(mirror::Class)) || + (c->IsVariableSize() || c->GetObjectSize() == byte_count) || + strlen(ClassHelper(c).GetDescriptor()) == 0); + CHECK_GE(byte_count, sizeof(mirror::Object)); +} + } // namespace gc } // namespace art diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 88adf811c5..3a8739a020 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -31,7 +31,6 @@ #include "globals.h" #include "gtest/gtest.h" #include "jni.h" -#include "locks.h" #include "object_callbacks.h" #include "offsets.h" #include "reference_queue.h" @@ -135,6 +134,10 @@ class Heap { // Used so that we don't overflow the allocation time atomic integer. static constexpr size_t kTimeAdjust = 1024; + // How long we wait after a GC to perform a heap trim (nanoseconds). + static constexpr uint64_t kHeapTrimWait = MsToNs(5000); + static constexpr uint64_t kHeapTransitionWait = MsToNs(5000); + // Create a heap with the requested sizes. The possible empty // image_file_names names specify Spaces to load based on // ImageWriter output. @@ -189,7 +192,7 @@ class Heap { void SwapSemiSpaces() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_); - void DebugCheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) + void CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void ThrowOutOfMemoryError(size_t byte_count, bool large_object_allocation); @@ -437,8 +440,12 @@ class Heap { void DumpForSigQuit(std::ostream& os); + + // Do a pending heap transition or trim. + void DoPendingTransitionOrTrim() LOCKS_EXCLUDED(heap_trim_request_lock_); + // Trim the managed and native heaps by releasing unused memory back to the OS. - void Trim(); + void Trim() LOCKS_EXCLUDED(heap_trim_request_lock_); void RevokeThreadLocalBuffers(Thread* thread); void RevokeAllThreadLocalBuffers(); @@ -487,6 +494,9 @@ class Heap { // Assumes there is only one image space. space::ImageSpace* GetImageSpace() const; + // Permanently disable compaction.
+ void DisableCompaction(); + space::DlMallocSpace* GetDlMallocSpace() const { return dlmalloc_space_; } @@ -572,7 +582,8 @@ class Heap { bool ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); ALWAYS_INLINE void CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated, - mirror::Object* obj); + mirror::Object** obj) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); // We don't force this to be inlined since it is a slow path. template <bool kInstrumented, typename PreFenceVisitor> @@ -636,7 +647,9 @@ class Heap { collector::GcType WaitForGcToCompleteLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(gc_complete_lock_); - void RequestHeapTrim() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_); + void RequestHeapTransition(CollectorType desired_collector_type, uint64_t delta_time) + LOCKS_EXCLUDED(heap_trim_request_lock_); + void RequestHeapTrim(uint64_t delta_time) LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_); void RequestConcurrentGC(Thread* self) LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_); bool IsGCRequestPending() const; @@ -670,7 +683,7 @@ class Heap { void RemoveSpace(space::Space* space) LOCKS_EXCLUDED(Locks::heap_bitmap_lock_); static void VerificationCallback(mirror::Object* obj, void* arg) - SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_); + SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); // Swap the allocation stack with the live stack. void SwapStacks(Thread* self); @@ -678,6 +691,10 @@ class Heap { // Clear cards and update the mod union table. void ProcessCards(TimingLogger& timings); + // Signal the heap trim daemon that there is something to do, either a heap transition or heap + // trim. + void SignalHeapTrimDaemon(Thread* self); + // Push an object onto the allocation stack. void PushOnAllocationStack(Thread* self, mirror::Object* obj); @@ -730,6 +747,17 @@ class Heap { CollectorType post_zygote_collector_type_; // Which collector we will use when the app is notified of a transition to background. CollectorType background_collector_type_; + // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_. + CollectorType desired_collector_type_; + + // Lock which guards heap trim requests. + Mutex* heap_trim_request_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + // When we want to perform the next heap trim (nano seconds). + uint64_t heap_trim_target_time_ GUARDED_BY(heap_trim_request_lock_); + // When we want to perform the next heap transition (nano seconds). + uint64_t heap_transition_target_time_ GUARDED_BY(heap_trim_request_lock_); + // If we have a heap trim request pending. + bool heap_trim_request_pending_ GUARDED_BY(heap_trim_request_lock_); // How many GC threads we may use for paused parts of garbage collection. const size_t parallel_gc_threads_; @@ -851,9 +879,6 @@ class Heap { // Parallel GC data structures. UniquePtr<ThreadPool> thread_pool_; - // The last time a heap trim occurred. - uint64_t last_trim_time_ms_; - // The nanosecond time at which the last GC ended. 
uint64_t last_gc_time_ns_; diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h index e12a95f332..99314ba0ef 100644 --- a/runtime/gc/reference_queue.h +++ b/runtime/gc/reference_queue.h @@ -26,7 +26,6 @@ #include "globals.h" #include "gtest/gtest.h" #include "jni.h" -#include "locks.h" #include "object_callbacks.h" #include "offsets.h" #include "thread_pool.h" diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h index 2c9d35fa55..031fccdfcd 100644 --- a/runtime/gc/space/bump_pointer_space.h +++ b/runtime/gc/space/bump_pointer_space.h @@ -146,9 +146,6 @@ class BumpPointerSpace FINAL : public ContinuousMemMapAllocSpace { byte* AllocBlock(size_t bytes) EXCLUSIVE_LOCKS_REQUIRED(block_lock_); void RevokeThreadLocalBuffersLocked(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(block_lock_); - mirror::Object* AllocWithoutGrowthLocked(size_t num_bytes, size_t* bytes_allocated) - EXCLUSIVE_LOCKS_REQUIRED(lock_); - // The main block is an unbounded block where objects go when there are no other blocks. This // enables us to maintain tightly packed objects when you are not using thread local buffers for // allocation. The main block starts at the space Begin(). diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc index 61d1071124..dac043efbb 100644 --- a/runtime/gc/space/malloc_space.cc +++ b/runtime/gc/space/malloc_space.cc @@ -24,6 +24,7 @@ #include "mirror/class-inl.h" #include "mirror/object-inl.h" #include "runtime.h" +#include "sirt_ref-inl.h" #include "thread.h" #include "thread_list.h" #include "utils.h" diff --git a/runtime/gc/space/space_test.h b/runtime/gc/space/space_test.h index 413fc1dcf9..ea0d290b1d 100644 --- a/runtime/gc/space/space_test.h +++ b/runtime/gc/space/space_test.h @@ -75,7 +75,7 @@ class SpaceTest : public CommonRuntimeTest { void SizeFootPrintGrowthLimitAndTrimDriver(size_t object_size, CreateSpaceFn create_space); }; -static size_t test_rand(size_t* seed) { +static inline size_t test_rand(size_t* seed) { *seed = *seed * 1103515245 + 12345; return *seed; } diff --git a/runtime/globals.h b/runtime/globals.h index 83e302892a..5bc4b9146d 100644 --- a/runtime/globals.h +++ b/runtime/globals.h @@ -99,6 +99,9 @@ static constexpr bool kUseBrooksPointer = true; static constexpr bool kUseBrooksPointer = false; #endif +// If true, references within the heap are poisoned (negated). +static constexpr bool kPoisonHeapReferences = false; + } // namespace art #endif // ART_RUNTIME_GLOBALS_H_ diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc index 54c7b6e45d..ed3fb5fa0c 100644 --- a/runtime/indirect_reference_table.cc +++ b/runtime/indirect_reference_table.cc @@ -21,11 +21,38 @@ #include "scoped_thread_state_change.h" #include "thread.h" #include "utils.h" +#include "verify_object-inl.h" #include <cstdlib> namespace art { +template<typename T> +class MutatorLockedDumpable { + public: + explicit MutatorLockedDumpable(T& value) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) : value_(value) { + } + + void Dump(std::ostream& os) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + value_.Dump(os); + } + + private: + T& value_; + + DISALLOW_COPY_AND_ASSIGN(MutatorLockedDumpable); +}; + +template<typename T> +std::ostream& operator<<(std::ostream& os, const MutatorLockedDumpable<T>& rhs) +// TODO: should be SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) however annotalysis +// currently fails for this. 
+ NO_THREAD_SAFETY_ANALYSIS { + rhs.Dump(os); + return os; +} + static void AbortMaybe() { // If -Xcheck:jni is on, it'll give a more detailed error before aborting. if (!Runtime::Current()->GetJavaVM()->check_jni) { @@ -81,8 +108,7 @@ IndirectRef IndirectReferenceTable::Add(uint32_t cookie, mirror::Object* obj) { size_t topIndex = segment_state_.parts.topIndex; CHECK(obj != NULL); - // TODO: stronger sanity check on the object (such as in heap) - DCHECK_ALIGNED(reinterpret_cast<uintptr_t>(obj), 8); + VerifyObject(obj); DCHECK(table_ != NULL); DCHECK_LE(alloc_entries_, max_entries_); DCHECK_GE(segment_state_.parts.numHoles, prevState.parts.numHoles); @@ -329,4 +355,13 @@ void IndirectReferenceTable::Dump(std::ostream& os) const { ReferenceTable::Dump(os, entries); } +mirror::Object* IndirectReferenceTable::Get(IndirectRef iref) const { + if (!GetChecked(iref)) { + return kInvalidIndirectRefObject; + } + mirror::Object* obj = table_[ExtractIndex(iref)];; + VerifyObject(obj); + return obj; +} + } // namespace art diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h index 9d2fa35103..a2de726de4 100644 --- a/runtime/indirect_reference_table.h +++ b/runtime/indirect_reference_table.h @@ -23,6 +23,7 @@ #include <string> #include "base/logging.h" +#include "base/mutex.h" #include "object_callbacks.h" #include "offsets.h" @@ -266,12 +267,7 @@ class IndirectReferenceTable { * * Returns kInvalidIndirectRefObject if iref is invalid. */ - mirror::Object* Get(IndirectRef iref) const { - if (!GetChecked(iref)) { - return kInvalidIndirectRefObject; - } - return table_[ExtractIndex(iref)]; - } + mirror::Object* Get(IndirectRef iref) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); // TODO: remove when we remove work_around_app_jni_bugs support. 
bool ContainsDirectPointer(mirror::Object* direct_pointer) const; diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h index e04d7b22e7..e9356e06da 100644 --- a/runtime/instrumentation.h +++ b/runtime/instrumentation.h @@ -19,7 +19,7 @@ #include "atomic.h" #include "base/macros.h" -#include "locks.h" +#include "base/mutex.h" #include <stdint.h> #include <set> diff --git a/runtime/intern_table.h b/runtime/intern_table.h index fd921f3daf..7dd06c6f7a 100644 --- a/runtime/intern_table.h +++ b/runtime/intern_table.h @@ -17,12 +17,11 @@ #ifndef ART_RUNTIME_INTERN_TABLE_H_ #define ART_RUNTIME_INTERN_TABLE_H_ +#include <map> + #include "base/mutex.h" -#include "locks.h" #include "object_callbacks.h" -#include <map> - namespace art { enum VisitRootFlags : uint8_t; diff --git a/runtime/interpreter/interpreter.h b/runtime/interpreter/interpreter.h index efe11fc140..0750eb5c49 100644 --- a/runtime/interpreter/interpreter.h +++ b/runtime/interpreter/interpreter.h @@ -17,8 +17,8 @@ #ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_H_ #define ART_RUNTIME_INTERPRETER_INTERPRETER_H_ +#include "base/mutex.h" #include "dex_file.h" -#include "locks.h" namespace art { namespace mirror { diff --git a/runtime/jdwp/jdwp.h b/runtime/jdwp/jdwp.h index fdbdfeb3b1..fec0e31806 100644 --- a/runtime/jdwp/jdwp.h +++ b/runtime/jdwp/jdwp.h @@ -31,11 +31,13 @@ struct iovec; namespace art { - union JValue; + +union JValue; +class Thread; + namespace mirror { class ArtMethod; } // namespace mirror -class Thread; namespace JDWP { @@ -156,7 +158,7 @@ struct JdwpState { // ObjectId GetWaitForEventThread(); void SetWaitForEventThread(ObjectId threadId) LOCKS_EXCLUDED(event_thread_lock_, process_request_lock_); - void ClearWaitForEventThread() LOCKS_EXCLUDED(event_thread_lock); + void ClearWaitForEventThread() LOCKS_EXCLUDED(event_thread_lock_); /* * These notify the debug code that something interesting has happened. This @@ -334,6 +336,7 @@ struct JdwpState { // Linked list of events requested by the debugger (breakpoints, class prep, etc). Mutex event_list_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + JdwpEvent* event_list_ GUARDED_BY(event_list_lock_); size_t event_list_size_ GUARDED_BY(event_list_lock_); // Number of elements in event_list_. 
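The JdwpEvent* event_list_ field added above carries GUARDED_BY(event_list_lock_), one of the Clang thread-safety ("annotalysis") annotations this patch adds or tightens throughout the runtime. A small self-contained sketch of how these annotations are declared and what the analysis flags, written with raw __attribute__ spellings instead of ART's macros; the Mutex and EventCounter classes here are illustrative only:

#include <cstddef>

// Raw spellings of the annotations; ART wraps these in macros.
#define LOCKABLE                      __attribute__((lockable))
#define GUARDED_BY(x)                 __attribute__((guarded_by(x)))
#define EXCLUSIVE_LOCK_FUNCTION(...)  __attribute__((exclusive_lock_function(__VA_ARGS__)))
#define UNLOCK_FUNCTION(...)          __attribute__((unlock_function(__VA_ARGS__)))
#define EXCLUSIVE_LOCKS_REQUIRED(...) __attribute__((exclusive_locks_required(__VA_ARGS__)))

class LOCKABLE Mutex {
 public:
  void Lock() EXCLUSIVE_LOCK_FUNCTION() {}
  void Unlock() UNLOCK_FUNCTION() {}
};

class EventCounter {
 public:
  void Add() {
    lock_.Lock();
    ++size_;        // OK: the analysis knows lock_ is held on this path.
    lock_.Unlock();
  }

  // Callers must already hold lock_; call sites are checked, not this body.
  size_t SizeLocked() const EXCLUSIVE_LOCKS_REQUIRED(lock_) { return size_; }

  size_t SizeRacy() const {
    return size_;   // clang -Wthread-safety: reading 'size_' requires holding mutex 'lock_'.
  }

 private:
  mutable Mutex lock_;
  size_t size_ GUARDED_BY(lock_) = 0;
};

With -Wthread-safety enabled, the compiler enforces the locking contract at every access instead of relying on comments.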
size_t full_deoptimization_requests_ GUARDED_BY(event_list_lock_); // Number of events requiring diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc index 1bcb8dd3bc..f8865ea9ef 100644 --- a/runtime/jni_internal.cc +++ b/runtime/jni_internal.cc @@ -2466,8 +2466,7 @@ class JNI { return JNI_OK; } - static jint MonitorEnter(JNIEnv* env, jobject java_object) - EXCLUSIVE_LOCK_FUNCTION(monitor_lock_) { + static jint MonitorEnter(JNIEnv* env, jobject java_object) NO_THREAD_SAFETY_ANALYSIS { CHECK_NON_NULL_ARGUMENT(MonitorEnter, java_object); ScopedObjectAccess soa(env); mirror::Object* o = soa.Decode<mirror::Object*>(java_object); @@ -2479,8 +2478,7 @@ class JNI { return JNI_OK; } - static jint MonitorExit(JNIEnv* env, jobject java_object) - UNLOCK_FUNCTION(monitor_lock_) { + static jint MonitorExit(JNIEnv* env, jobject java_object) NO_THREAD_SAFETY_ANALYSIS { CHECK_NON_NULL_ARGUMENT(MonitorExit, java_object); ScopedObjectAccess soa(env); mirror::Object* o = soa.Decode<mirror::Object*>(java_object); @@ -2539,11 +2537,13 @@ class JNI { IndirectRef ref = reinterpret_cast<IndirectRef>(java_object); IndirectRefKind kind = GetIndirectRefKind(ref); switch (kind) { - case kLocal: + case kLocal: { + ScopedObjectAccess soa(env); if (static_cast<JNIEnvExt*>(env)->locals.Get(ref) != kInvalidIndirectRefObject) { return JNILocalRefType; } return JNIInvalidRefType; + } case kGlobal: return JNIGlobalRefType; case kWeakGlobal: @@ -3194,7 +3194,11 @@ mirror::Object* JavaVMExt::DecodeWeakGlobal(Thread* self, IndirectRef ref) { while (UNLIKELY(!allow_new_weak_globals_)) { weak_globals_add_condition_.WaitHoldingLocks(self); } - return const_cast<mirror::Object*>(weak_globals_.Get(ref)); + mirror::Object* obj = weak_globals_.Get(ref); + if (obj != kClearedJniWeakGlobal) { + VerifyObject(obj); + } + return obj; } void JavaVMExt::DumpReferenceTables(std::ostream& os) { diff --git a/runtime/jni_internal.h b/runtime/jni_internal.h index 606d5d1311..7b49d33625 100644 --- a/runtime/jni_internal.h +++ b/runtime/jni_internal.h @@ -25,7 +25,6 @@ #include "object_callbacks.h" #include "reference_table.h" #include "runtime.h" -#include "sirt_ref.h" #include <iosfwd> #include <string> @@ -48,6 +47,7 @@ union JValue; class Libraries; class ParsedOptions; class ScopedObjectAccess; +template<class T> class SirtRef; class Thread; void JniAbortF(const char* jni_function_name, const char* fmt, ...) @@ -101,7 +101,8 @@ class JavaVMExt : public JavaVM { void DeleteWeakGlobalRef(Thread* self, jweak obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void SweepJniWeakGlobals(IsMarkedCallback* callback, void* arg); - mirror::Object* DecodeWeakGlobal(Thread* self, IndirectRef ref); + mirror::Object* DecodeWeakGlobal(Thread* self, IndirectRef ref) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); Runtime* runtime; diff --git a/runtime/locks.cc b/runtime/locks.cc deleted file mode 100644 index 246e339ce9..0000000000 --- a/runtime/locks.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "locks.h" - -#include "base/mutex.h" - -namespace art { - -Mutex* Locks::abort_lock_ = NULL; -Mutex* Locks::breakpoint_lock_ = NULL; -Mutex* Locks::deoptimization_lock_ = NULL; -ReaderWriterMutex* Locks::classlinker_classes_lock_ = NULL; -ReaderWriterMutex* Locks::heap_bitmap_lock_ = NULL; -Mutex* Locks::logging_lock_ = NULL; -ReaderWriterMutex* Locks::mutator_lock_ = NULL; -Mutex* Locks::runtime_shutdown_lock_ = NULL; -Mutex* Locks::thread_list_lock_ = NULL; -Mutex* Locks::thread_suspend_count_lock_ = NULL; -Mutex* Locks::trace_lock_ = NULL; -Mutex* Locks::profiler_lock_ = NULL; -Mutex* Locks::unexpected_signal_lock_ = NULL; -Mutex* Locks::intern_table_lock_ = NULL; - -void Locks::Init() { - if (logging_lock_ != NULL) { - // Already initialized. - DCHECK(abort_lock_ != NULL); - DCHECK(breakpoint_lock_ != NULL); - DCHECK(deoptimization_lock_ != NULL); - DCHECK(classlinker_classes_lock_ != NULL); - DCHECK(heap_bitmap_lock_ != NULL); - DCHECK(logging_lock_ != NULL); - DCHECK(mutator_lock_ != NULL); - DCHECK(thread_list_lock_ != NULL); - DCHECK(thread_suspend_count_lock_ != NULL); - DCHECK(trace_lock_ != NULL); - DCHECK(profiler_lock_ != NULL); - DCHECK(unexpected_signal_lock_ != NULL); - DCHECK(intern_table_lock_ != NULL); - } else { - logging_lock_ = new Mutex("logging lock", kLoggingLock, true); - abort_lock_ = new Mutex("abort lock", kAbortLock, true); - - DCHECK(breakpoint_lock_ == NULL); - breakpoint_lock_ = new Mutex("breakpoint lock", kBreakpointLock); - DCHECK(deoptimization_lock_ == NULL); - deoptimization_lock_ = new Mutex("deoptimization lock", kDeoptimizationLock); - DCHECK(classlinker_classes_lock_ == NULL); - classlinker_classes_lock_ = new ReaderWriterMutex("ClassLinker classes lock", - kClassLinkerClassesLock); - DCHECK(heap_bitmap_lock_ == NULL); - heap_bitmap_lock_ = new ReaderWriterMutex("heap bitmap lock", kHeapBitmapLock); - DCHECK(mutator_lock_ == NULL); - mutator_lock_ = new ReaderWriterMutex("mutator lock", kMutatorLock); - DCHECK(runtime_shutdown_lock_ == NULL); - runtime_shutdown_lock_ = new Mutex("runtime shutdown lock", kRuntimeShutdownLock); - DCHECK(thread_list_lock_ == NULL); - thread_list_lock_ = new Mutex("thread list lock", kThreadListLock); - DCHECK(thread_suspend_count_lock_ == NULL); - thread_suspend_count_lock_ = new Mutex("thread suspend count lock", kThreadSuspendCountLock); - DCHECK(trace_lock_ == NULL); - trace_lock_ = new Mutex("trace lock", kTraceLock); - DCHECK(profiler_lock_ == NULL); - profiler_lock_ = new Mutex("profiler lock", kProfilerLock); - DCHECK(unexpected_signal_lock_ == NULL); - unexpected_signal_lock_ = new Mutex("unexpected signal lock", kUnexpectedSignalLock, true); - DCHECK(intern_table_lock_ == NULL); - intern_table_lock_ = new Mutex("InternTable lock", kInternTableLock); - } -} - -} // namespace art diff --git a/runtime/locks.h b/runtime/locks.h deleted file mode 100644 index 4343ab40ed..0000000000 --- a/runtime/locks.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ART_RUNTIME_LOCKS_H_ -#define ART_RUNTIME_LOCKS_H_ - -#include <ostream> - -#include "base/macros.h" - -namespace art { - -class LOCKABLE Mutex; -class LOCKABLE ReaderWriterMutex; - -// LockLevel is used to impose a lock hierarchy [1] where acquisition of a Mutex at a higher or -// equal level to a lock a thread holds is invalid. The lock hierarchy achieves a cycle free -// partial ordering and thereby cause deadlock situations to fail checks. -// -// [1] http://www.drdobbs.com/parallel/use-lock-hierarchies-to-avoid-deadlock/204801163 -enum LockLevel { - kLoggingLock = 0, - kUnexpectedSignalLock, - kThreadSuspendCountLock, - kAbortLock, - kJdwpSocketLock, - kRosAllocGlobalLock, - kRosAllocBracketLock, - kRosAllocBulkFreeLock, - kAllocSpaceLock, - kDexFileMethodInlinerLock, - kDexFileToMethodInlinerMapLock, - kMarkSweepMarkStackLock, - kTransactionLogLock, - kInternTableLock, - kMonitorPoolLock, - kDefaultMutexLevel, - kMarkSweepLargeObjectLock, - kPinTableLock, - kLoadLibraryLock, - kJdwpObjectRegistryLock, - kClassLinkerClassesLock, - kBreakpointLock, - kMonitorLock, - kThreadListLock, - kBreakpointInvokeLock, - kDeoptimizationLock, - kTraceLock, - kProfilerLock, - kJdwpEventListLock, - kJdwpAttachLock, - kJdwpStartLock, - kRuntimeShutdownLock, - kHeapBitmapLock, - kMutatorLock, - kZygoteCreationLock, - - kLockLevelCount // Must come last. -}; -std::ostream& operator<<(std::ostream& os, const LockLevel& rhs); - -// Global mutexes corresponding to the levels above. -class Locks { - public: - static void Init(); - - // The mutator_lock_ is used to allow mutators to execute in a shared (reader) mode or to block - // mutators by having an exclusive (writer) owner. In normal execution each mutator thread holds - // a share on the mutator_lock_. The garbage collector may also execute with shared access but - // at times requires exclusive access to the heap (not to be confused with the heap meta-data - // guarded by the heap_lock_ below). When the garbage collector requires exclusive access it asks - // the mutators to suspend themselves which also involves usage of the thread_suspend_count_lock_ - // to cover weaknesses in using ReaderWriterMutexes with ConditionVariables. We use a condition - // variable to wait upon in the suspension logic as releasing and then re-acquiring a share on - // the mutator lock doesn't necessarily allow the exclusive user (e.g the garbage collector) - // chance to acquire the lock. - // - // Thread suspension: - // Shared users | Exclusive user - // (holding mutator lock and in kRunnable state) | .. running .. - // .. running .. | Request thread suspension by: - // .. running .. | - acquiring thread_suspend_count_lock_ - // .. running .. | - incrementing Thread::suspend_count_ on - // .. running .. | all mutator threads - // .. running .. | - releasing thread_suspend_count_lock_ - // .. running .. | Block trying to acquire exclusive mutator lock - // Poll Thread::suspend_count_ and enter full | .. blocked .. - // suspend code. | .. blocked .. - // Change state to kSuspended | .. blocked .. 
- // x: Release share on mutator_lock_ | Carry out exclusive access - // Acquire thread_suspend_count_lock_ | .. exclusive .. - // while Thread::suspend_count_ > 0 | .. exclusive .. - // - wait on Thread::resume_cond_ | .. exclusive .. - // (releases thread_suspend_count_lock_) | .. exclusive .. - // .. waiting .. | Release mutator_lock_ - // .. waiting .. | Request thread resumption by: - // .. waiting .. | - acquiring thread_suspend_count_lock_ - // .. waiting .. | - decrementing Thread::suspend_count_ on - // .. waiting .. | all mutator threads - // .. waiting .. | - notifying on Thread::resume_cond_ - // - re-acquire thread_suspend_count_lock_ | - releasing thread_suspend_count_lock_ - // Release thread_suspend_count_lock_ | .. running .. - // Acquire share on mutator_lock_ | .. running .. - // - This could block but the thread still | .. running .. - // has a state of kSuspended and so this | .. running .. - // isn't an issue. | .. running .. - // Acquire thread_suspend_count_lock_ | .. running .. - // - we poll here as we're transitioning into | .. running .. - // kRunnable and an individual thread suspend | .. running .. - // request (e.g for debugging) won't try | .. running .. - // to acquire the mutator lock (which would | .. running .. - // block as we hold the mutator lock). This | .. running .. - // poll ensures that if the suspender thought | .. running .. - // we were suspended by incrementing our | .. running .. - // Thread::suspend_count_ and then reading | .. running .. - // our state we go back to waiting on | .. running .. - // Thread::resume_cond_. | .. running .. - // can_go_runnable = Thread::suspend_count_ == 0 | .. running .. - // Release thread_suspend_count_lock_ | .. running .. - // if can_go_runnable | .. running .. - // Change state to kRunnable | .. running .. - // else | .. running .. - // Goto x | .. running .. - // .. running .. | .. running .. - static ReaderWriterMutex* mutator_lock_; - - // Allow reader-writer mutual exclusion on the mark and live bitmaps of the heap. - static ReaderWriterMutex* heap_bitmap_lock_ ACQUIRED_AFTER(mutator_lock_); - - // Guards shutdown of the runtime. - static Mutex* runtime_shutdown_lock_ ACQUIRED_AFTER(heap_bitmap_lock_); - - // The thread_list_lock_ guards ThreadList::list_. It is also commonly held to stop threads - // attaching and detaching. - static Mutex* thread_list_lock_ ACQUIRED_AFTER(runtime_shutdown_lock_); - - // Guards breakpoints. - static Mutex* breakpoint_lock_ ACQUIRED_AFTER(thread_list_lock_); - - // Guards deoptimization requests. - static Mutex* deoptimization_lock_ ACQUIRED_AFTER(breakpoint_lock_); - - // Guards trace requests. - static Mutex* trace_lock_ ACQUIRED_AFTER(deoptimization_lock_); - - // Guards profile objects. - static Mutex* profiler_lock_ ACQUIRED_AFTER(trace_lock_); - - // Guards lists of classes within the class linker. - static ReaderWriterMutex* classlinker_classes_lock_ ACQUIRED_AFTER(profiler_lock_); - - // When declaring any Mutex add DEFAULT_MUTEX_ACQUIRED_AFTER to use annotalysis to check the code - // doesn't try to hold a higher level Mutex. - #define DEFAULT_MUTEX_ACQUIRED_AFTER ACQUIRED_AFTER(classlinker_classes_lock_) - - // Guards intern table. - static Mutex* intern_table_lock_ ACQUIRED_AFTER(classlinker_classes_lock_); - - // Have an exclusive aborting thread. - static Mutex* abort_lock_ ACQUIRED_AFTER(classlinker_classes_lock_); - - // Allow mutual exclusion when manipulating Thread::suspend_count_. - // TODO: Does the trade-off of a per-thread lock make sense? 
- static Mutex* thread_suspend_count_lock_ ACQUIRED_AFTER(abort_lock_); - - // One unexpected signal at a time lock. - static Mutex* unexpected_signal_lock_ ACQUIRED_AFTER(thread_suspend_count_lock_); - - // Have an exclusive logging thread. - static Mutex* logging_lock_ ACQUIRED_AFTER(unexpected_signal_lock_); -}; - -} // namespace art - -#endif // ART_RUNTIME_LOCKS_H_ diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h index a18e171960..a61698d709 100644 --- a/runtime/mirror/art_method.h +++ b/runtime/mirror/art_method.h @@ -20,7 +20,6 @@ #include "class.h" #include "dex_file.h" #include "invoke_type.h" -#include "locks.h" #include "modifiers.h" #include "object.h" #include "object_callbacks.h" diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h index f9a5ea2b91..76ab94c65e 100644 --- a/runtime/mirror/class.h +++ b/runtime/mirror/class.h @@ -17,7 +17,6 @@ #ifndef ART_RUNTIME_MIRROR_CLASS_H_ #define ART_RUNTIME_MIRROR_CLASS_H_ -#include "gc/heap.h" #include "invoke_type.h" #include "modifiers.h" #include "object.h" diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h index ded4e0ae7a..4e2c624516 100644 --- a/runtime/mirror/object.h +++ b/runtime/mirror/object.h @@ -21,6 +21,7 @@ #include "base/logging.h" #include "base/macros.h" #include "cutils/atomic-inline.h" +#include "monitor.h" #include "object_reference.h" #include "offsets.h" #include "runtime.h" @@ -30,7 +31,6 @@ namespace art { class ImageWriter; class LockWord; -class Monitor; struct ObjectOffsets; class Thread; template <typename T> class SirtRef; @@ -64,7 +64,7 @@ class Throwable; static constexpr bool kCheckFieldAssignments = false; // C++ mirror of java.lang.Object -class MANAGED Object { +class MANAGED LOCKABLE Object { public: static MemberOffset ClassOffset() { return OFFSET_OF_OBJECT_MEMBER(Object, klass_); @@ -104,9 +104,9 @@ class MANAGED Object { uint32_t GetLockOwnerThreadId(); mirror::Object* MonitorEnter(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) - EXCLUSIVE_LOCK_FUNCTION(monitor_lock_); + EXCLUSIVE_LOCK_FUNCTION(); bool MonitorExit(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) - UNLOCK_FUNCTION(monitor_lock_); + UNLOCK_FUNCTION(); void Notify(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void NotifyAll(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void Wait(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); diff --git a/runtime/mirror/object_reference.h b/runtime/mirror/object_reference.h index b30890f99a..72f281df06 100644 --- a/runtime/mirror/object_reference.h +++ b/runtime/mirror/object_reference.h @@ -17,7 +17,8 @@ #ifndef ART_RUNTIME_MIRROR_OBJECT_REFERENCE_H_ #define ART_RUNTIME_MIRROR_OBJECT_REFERENCE_H_ -#include "locks.h" +#include "base/mutex.h" +#include "globals.h" namespace art { namespace mirror { @@ -74,7 +75,7 @@ class MANAGED ObjectReference { // References between objects within the managed heap. 
template<class MirrorType> -class MANAGED HeapReference : public ObjectReference<false, MirrorType> { +class MANAGED HeapReference : public ObjectReference<kPoisonHeapReferences, MirrorType> { public: static HeapReference<MirrorType> FromMirrorPtr(MirrorType* mirror_ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { @@ -82,7 +83,7 @@ class MANAGED HeapReference : public ObjectReference<false, MirrorType> { } private: HeapReference<MirrorType>(MirrorType* mirror_ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) - : ObjectReference<false, MirrorType>(mirror_ptr) {} + : ObjectReference<kPoisonHeapReferences, MirrorType>(mirror_ptr) {} }; } // namespace mirror diff --git a/runtime/mirror/stack_trace_element.cc b/runtime/mirror/stack_trace_element.cc index 02a396acde..5217e5eda3 100644 --- a/runtime/mirror/stack_trace_element.cc +++ b/runtime/mirror/stack_trace_element.cc @@ -20,6 +20,7 @@ #include "class-inl.h" #include "gc/accounting/card_table-inl.h" #include "object-inl.h" +#include "sirt_ref-inl.h" #include "string.h" namespace art { diff --git a/runtime/mirror/stack_trace_element.h b/runtime/mirror/stack_trace_element.h index 779ec4b780..9e023c7dba 100644 --- a/runtime/mirror/stack_trace_element.h +++ b/runtime/mirror/stack_trace_element.h @@ -18,10 +18,10 @@ #define ART_RUNTIME_MIRROR_STACK_TRACE_ELEMENT_H_ #include "object.h" -#include "sirt_ref.h" namespace art { +template<class T> class SirtRef; struct StackTraceElementOffsets; namespace mirror { diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 64794feb48..332aef0b7e 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -650,9 +650,22 @@ void Monitor::InflateThinLocked(Thread* self, SirtRef<mirror::Object>& obj, Lock } } +// Fool annotalysis into thinking that the lock on obj is acquired. +static mirror::Object* FakeLock(mirror::Object* obj) + EXCLUSIVE_LOCK_FUNCTION(obj) NO_THREAD_SAFETY_ANALYSIS { + return obj; +} + +// Fool annotalysis into thinking that the lock on obj is released. +static mirror::Object* FakeUnlock(mirror::Object* obj) + UNLOCK_FUNCTION(obj) NO_THREAD_SAFETY_ANALYSIS { + return obj; +} + mirror::Object* Monitor::MonitorEnter(Thread* self, mirror::Object* obj) { DCHECK(self != NULL); DCHECK(obj != NULL); + obj = FakeLock(obj); uint32_t thread_id = self->GetThreadId(); size_t contention_count = 0; SirtRef<mirror::Object> sirt_obj(self, obj); @@ -698,24 +711,22 @@ mirror::Object* Monitor::MonitorEnter(Thread* self, mirror::Object* obj) { mon->Lock(self); return sirt_obj.get(); // Success! } - case LockWord::kHashCode: { + case LockWord::kHashCode: // Inflate with the existing hashcode. Inflate(self, nullptr, sirt_obj.get(), lock_word.GetHashCode()); - break; - } + continue; // Start from the beginning.
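The FakeLock()/FakeUnlock() helpers above exist only to keep annotalysis happy: MonitorEnter()/MonitorExit() are now annotated as acquiring and releasing a capability on obj itself (see the monitor.h hunk below), but the real work goes through lock words and inflated monitors that the analysis cannot follow. A stripped-down sketch of the idiom, with a placeholder Object class standing in for mirror::Object:

#define LOCKABLE                      __attribute__((lockable))
#define EXCLUSIVE_LOCK_FUNCTION(...)  __attribute__((exclusive_lock_function(__VA_ARGS__)))
#define UNLOCK_FUNCTION(...)          __attribute__((unlock_function(__VA_ARGS__)))
#define NO_THREAD_SAFETY_ANALYSIS     __attribute__((no_thread_safety_analysis))

class LOCKABLE Object {};  // Stand-in for mirror::Object, which object.h now marks LOCKABLE.

// No-op helpers whose only job is to tell the analysis that the capability on obj changed hands.
static Object* FakeLock(Object* obj) EXCLUSIVE_LOCK_FUNCTION(obj) NO_THREAD_SAFETY_ANALYSIS {
  return obj;
}
static Object* FakeUnlock(Object* obj) UNLOCK_FUNCTION(obj) NO_THREAD_SAFETY_ANALYSIS {
  return obj;
}

// Annotated as acquiring the lock on obj, matching the monitor.h declarations in this patch.
Object* MonitorEnter(Object* obj) EXCLUSIVE_LOCK_FUNCTION(obj) {
  obj = FakeLock(obj);
  // ... thin-lock / monitor-inflation logic that annotalysis cannot follow ...
  return obj;
}

bool MonitorExit(Object* obj) UNLOCK_FUNCTION(obj) {
  obj = FakeUnlock(obj);
  // ... real unlock logic ...
  return true;
}

The NO_THREAD_SAFETY_ANALYSIS on the helpers stops the analysis from complaining that they never actually take a lock.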
default: { LOG(FATAL) << "Invalid monitor state " << lock_word.GetState(); return sirt_obj.get(); } } } - return sirt_obj.get(); } bool Monitor::MonitorExit(Thread* self, mirror::Object* obj) { DCHECK(self != NULL); DCHECK(obj != NULL); - + obj = FakeUnlock(obj); LockWord lock_word = obj->GetLockWord(); SirtRef<mirror::Object> sirt_obj(self, obj); switch (lock_word.GetState()) { diff --git a/runtime/monitor.h b/runtime/monitor.h index d0a3a2ed2b..55504b5943 100644 --- a/runtime/monitor.h +++ b/runtime/monitor.h @@ -27,16 +27,18 @@ #include "atomic.h" #include "base/mutex.h" #include "object_callbacks.h" -#include "sirt_ref.h" #include "thread_state.h" namespace art { +template<class T> class SirtRef; + namespace mirror { class ArtMethod; class Object; } // namespace mirror class LockWord; +template<class T> class SirtRef; class Thread; class StackVisitor; @@ -58,11 +60,11 @@ class Monitor { NO_THREAD_SAFETY_ANALYSIS; // TODO: Reading lock owner without holding lock is racy. static mirror::Object* MonitorEnter(Thread* thread, mirror::Object* obj) - EXCLUSIVE_LOCK_FUNCTION(monitor_lock_) + EXCLUSIVE_LOCK_FUNCTION(obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); static bool MonitorExit(Thread* thread, mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) - UNLOCK_FUNCTION(monitor_lock_); + UNLOCK_FUNCTION(obj); static void Notify(Thread* self, mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { @@ -178,6 +180,7 @@ class Monitor { static uint32_t lock_profiling_threshold_; Mutex monitor_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + ConditionVariable monitor_contenders_ GUARDED_BY(monitor_lock_); // Number of people waiting on the condition. diff --git a/runtime/monitor_pool.cc b/runtime/monitor_pool.cc index 19e569d204..eb7525a6ad 100644 --- a/runtime/monitor_pool.cc +++ b/runtime/monitor_pool.cc @@ -18,6 +18,7 @@ #include "base/logging.h" #include "base/mutex-inl.h" +#include "thread-inl.h" #include "monitor.h" namespace art { diff --git a/runtime/monitor_pool.h b/runtime/monitor_pool.h index 32f3f4ebe3..82d0feef4d 100644 --- a/runtime/monitor_pool.h +++ b/runtime/monitor_pool.h @@ -17,11 +17,14 @@ #ifndef ART_RUNTIME_MONITOR_POOL_H_ #define ART_RUNTIME_MONITOR_POOL_H_ -#include "monitor.h" +#ifdef __LP64__ +#include <bitset> +#include <stdint.h> +#include "monitor.h" +#include "runtime.h" #include "safe_map.h" - -#include <stdint.h> +#endif namespace art { diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc index f48e8ad07a..4aa1d1011d 100644 --- a/runtime/native/dalvik_system_VMRuntime.cc +++ b/runtime/native/dalvik_system_VMRuntime.cc @@ -177,6 +177,8 @@ static void VMRuntime_setTargetSdkVersionNative(JNIEnv* env, jobject, jint targe << targetSdkVersion << "..."; vm->work_around_app_jni_bugs = true; + LOG(WARNING) << "Permanently disabling heap compaction due to jni workarounds"; + Runtime::Current()->GetHeap()->DisableCompaction(); } } } @@ -204,12 +206,11 @@ static void VMRuntime_updateProcessState(JNIEnv* env, jobject, jint process_stat } static void VMRuntime_trimHeap(JNIEnv*, jobject) { - Runtime::Current()->GetHeap()->Trim(); + Runtime::Current()->GetHeap()->DoPendingTransitionOrTrim(); } static void VMRuntime_concurrentGC(JNIEnv* env, jobject) { - Thread* self = ThreadForEnv(env); - Runtime::Current()->GetHeap()->ConcurrentGC(self); + Runtime::Current()->GetHeap()->ConcurrentGC(ThreadForEnv(env)); } typedef std::map<std::string, mirror::String*> StringTable; diff --git a/runtime/native/java_lang_Runtime.cc
b/runtime/native/java_lang_Runtime.cc index 0629f4d71c..f6149fff44 100644 --- a/runtime/native/java_lang_Runtime.cc +++ b/runtime/native/java_lang_Runtime.cc @@ -24,6 +24,7 @@ #include "runtime.h" #include "scoped_thread_state_change.h" #include "ScopedUtfChars.h" +#include "sirt_ref-inl.h" namespace art { diff --git a/runtime/native/scoped_fast_native_object_access.h b/runtime/native/scoped_fast_native_object_access.h index b5ee748425..645d78cce8 100644 --- a/runtime/native/scoped_fast_native_object_access.h +++ b/runtime/native/scoped_fast_native_object_access.h @@ -80,8 +80,6 @@ class ScopedFastNativeObjectAccess { return NULL; } - VerifyObject(obj); - DCHECK_NE((reinterpret_cast<uintptr_t>(obj) & 0xffff0000), 0xebad0000); IndirectReferenceTable& locals = Env()->locals; diff --git a/runtime/nth_caller_visitor.h b/runtime/nth_caller_visitor.h index 794878a08e..374a80ea28 100644 --- a/runtime/nth_caller_visitor.h +++ b/runtime/nth_caller_visitor.h @@ -17,8 +17,8 @@ #ifndef ART_RUNTIME_NTH_CALLER_VISITOR_H_ #define ART_RUNTIME_NTH_CALLER_VISITOR_H_ +#include "base/mutex.h" #include "mirror/art_method.h" -#include "locks.h" #include "stack.h" namespace art { diff --git a/runtime/object_utils.h b/runtime/object_utils.h index 4eac29164e..dd2bd4fafe 100644 --- a/runtime/object_utils.h +++ b/runtime/object_utils.h @@ -28,7 +28,7 @@ #include "mirror/string.h" #include "runtime.h" -#include "sirt_ref.h" +#include "sirt_ref-inl.h" #include <string> diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc index 04f1a05a3d..37db4624be 100644 --- a/runtime/parsed_options.cc +++ b/runtime/parsed_options.cc @@ -147,7 +147,13 @@ bool ParsedOptions::Parse(const Runtime::Options& options, bool ignore_unrecogni compiler_callbacks_ = nullptr; is_zygote_ = false; - interpreter_only_ = false; + if (kPoisonHeapReferences) { + // kPoisonHeapReferences currently works only with the interpreter only. + // TODO: make it work with the compiler. 
+ interpreter_only_ = true; + } else { + interpreter_only_ = false; + } is_explicit_gc_disabled_ = false; long_pause_log_threshold_ = gc::Heap::kDefaultLongPauseLogThreshold; diff --git a/runtime/profiler.h b/runtime/profiler.h index e3af47cf50..6ea6c84f08 100644 --- a/runtime/profiler.h +++ b/runtime/profiler.h @@ -22,15 +22,14 @@ #include <string> #include <vector> +#include "barrier.h" #include "base/macros.h" +#include "base/mutex.h" #include "globals.h" #include "instrumentation.h" #include "os.h" #include "safe_map.h" -#include "base/mutex.h" -#include "locks.h" #include "UniquePtr.h" -#include "barrier.h" namespace art { diff --git a/runtime/reference_table.cc b/runtime/reference_table.cc index f43a15b83d..a3119bbd12 100644 --- a/runtime/reference_table.cc +++ b/runtime/reference_table.cc @@ -40,6 +40,7 @@ ReferenceTable::~ReferenceTable() { void ReferenceTable::Add(mirror::Object* obj) { DCHECK(obj != NULL); + VerifyObject(obj); if (entries_.size() >= max_size_) { LOG(FATAL) << "ReferenceTable '" << name_ << "' " << "overflowed (" << max_size_ << " entries)"; diff --git a/runtime/reference_table.h b/runtime/reference_table.h index c9f5bc5c57..45309c9d99 100644 --- a/runtime/reference_table.h +++ b/runtime/reference_table.h @@ -22,8 +22,8 @@ #include <string> #include <vector> +#include "base/mutex.h" #include "object_callbacks.h" -#include "locks.h" namespace art { namespace mirror { diff --git a/runtime/runtime.cc b/runtime/runtime.cc index de06fb8ee0..fdbf2456a4 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -94,7 +94,7 @@ Runtime::Runtime() default_imt_(nullptr), fault_message_lock_("Fault message lock"), fault_message_(""), - method_verifiers_lock_("Method verifiers lock"), + method_verifier_lock_("Method verifiers lock"), threads_being_born_(0), shutdown_cond_(new ConditionVariable("Runtime shutdown", *Locks::runtime_shutdown_lock_)), shutting_down_(false), @@ -851,7 +851,7 @@ void Runtime::VisitNonThreadRoots(RootCallback* callback, void* arg) { } } { - MutexLock mu(Thread::Current(), method_verifiers_lock_); + MutexLock mu(Thread::Current(), method_verifier_lock_); for (verifier::MethodVerifier* verifier : method_verifiers_) { verifier->VisitRoots(callback, arg); } @@ -1043,13 +1043,13 @@ void Runtime::SetCompileTimeClassPath(jobject class_loader, void Runtime::AddMethodVerifier(verifier::MethodVerifier* verifier) { DCHECK(verifier != nullptr); - MutexLock mu(Thread::Current(), method_verifiers_lock_); + MutexLock mu(Thread::Current(), method_verifier_lock_); method_verifiers_.insert(verifier); } void Runtime::RemoveMethodVerifier(verifier::MethodVerifier* verifier) { DCHECK(verifier != nullptr); - MutexLock mu(Thread::Current(), method_verifiers_lock_); + MutexLock mu(Thread::Current(), method_verifier_lock_); auto it = method_verifiers_.find(verifier); CHECK(it != method_verifiers_.end()); method_verifiers_.erase(it); diff --git a/runtime/runtime.h b/runtime/runtime.h index 87307ae223..65d296a3dc 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -33,7 +33,6 @@ #include "instruction_set.h" #include "instrumentation.h" #include "jobject_comparator.h" -#include "locks.h" #include "object_callbacks.h" #include "runtime_stats.h" #include "safe_map.h" @@ -471,7 +470,7 @@ class Runtime { std::string fault_message_ GUARDED_BY(fault_message_lock_); // Method verifier set, used so that we can update their GC roots. 
- Mutex method_verifiers_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + Mutex method_verifier_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; std::set<verifier::MethodVerifier*> method_verifiers_; // A non-zero value indicates that a thread has been created but not yet initialized. Guarded by diff --git a/runtime/safe_map.h b/runtime/safe_map.h index 89da927cc2..393bf92ba2 100644 --- a/runtime/safe_map.h +++ b/runtime/safe_map.h @@ -33,10 +33,17 @@ class SafeMap { typedef SafeMap<K, V, Comparator, Allocator> Self; public: - typedef typename ::std::map<K, V, Comparator>::iterator iterator; - typedef typename ::std::map<K, V, Comparator>::const_iterator const_iterator; - typedef typename ::std::map<K, V, Comparator>::size_type size_type; - typedef typename ::std::map<K, V, Comparator>::value_type value_type; + typedef typename ::std::map<K, V, Comparator, Allocator>::key_compare key_compare; + typedef typename ::std::map<K, V, Comparator, Allocator>::allocator_type allocator_type; + typedef typename ::std::map<K, V, Comparator, Allocator>::iterator iterator; + typedef typename ::std::map<K, V, Comparator, Allocator>::const_iterator const_iterator; + typedef typename ::std::map<K, V, Comparator, Allocator>::size_type size_type; + typedef typename ::std::map<K, V, Comparator, Allocator>::value_type value_type; + + SafeMap() = default; + explicit SafeMap(const key_compare& cmp, const allocator_type& allocator = allocator_type()) + : map_(cmp, allocator) { + } Self& operator=(const Self& rhs) { map_ = rhs.map_; diff --git a/runtime/scoped_thread_state_change.h b/runtime/scoped_thread_state_change.h index f0f5ed263d..d9e7986efe 100644 --- a/runtime/scoped_thread_state_change.h +++ b/runtime/scoped_thread_state_change.h @@ -169,8 +169,6 @@ class ScopedObjectAccessUnchecked : public ScopedThreadStateChange { return NULL; } - VerifyObject(obj); - DCHECK_NE((reinterpret_cast<uintptr_t>(obj) & 0xffff0000), 0xebad0000); IndirectReferenceTable& locals = Env()->locals; diff --git a/runtime/sirt_ref-inl.h b/runtime/sirt_ref-inl.h new file mode 100644 index 0000000000..7f2d847fa8 --- /dev/null +++ b/runtime/sirt_ref-inl.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ART_RUNTIME_SIRT_REF_INL_H_ +#define ART_RUNTIME_SIRT_REF_INL_H_ + +#include "sirt_ref.h" + +#include "verify_object-inl.h" + +namespace art { + +template<class T> inline SirtRef<T>::SirtRef(Thread* self, T* object) : self_(self), sirt_(object) { + VerifyObject(object); + self_->PushSirt(&sirt_); +} + +template<class T> inline SirtRef<T>::~SirtRef() { + StackIndirectReferenceTable* top_sirt = self_->PopSirt(); + DCHECK_EQ(top_sirt, &sirt_); +} + +template<class T> inline T* SirtRef<T>::reset(T* object) { + VerifyObject(object); + T* old_ref = get(); + sirt_.SetReference(0, object); + return old_ref; +} + +} // namespace art + +#endif // ART_RUNTIME_SIRT_REF_INL_H_ diff --git a/runtime/sirt_ref.h b/runtime/sirt_ref.h index b22e816e2d..2226e17f56 100644 --- a/runtime/sirt_ref.h +++ b/runtime/sirt_ref.h @@ -20,6 +20,7 @@ #include "base/casts.h" #include "base/logging.h" #include "base/macros.h" +#include "stack_indirect_reference_table.h" #include "thread.h" namespace art { @@ -27,13 +28,8 @@ namespace art { template<class T> class SirtRef { public: - SirtRef(Thread* self, T* object) : self_(self), sirt_(object) { - self_->PushSirt(&sirt_); - } - ~SirtRef() { - StackIndirectReferenceTable* top_sirt = self_->PopSirt(); - DCHECK_EQ(top_sirt, &sirt_); - } + SirtRef(Thread* self, T* object); + ~SirtRef(); T& operator*() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return *get(); @@ -46,11 +42,7 @@ class SirtRef { } // Returns the old reference. - T* reset(T* object = nullptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - T* old_ref = get(); - sirt_.SetReference(0, object); - return old_ref; - } + T* reset(T* object = nullptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); private: Thread* const self_; diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h index f7e88cc75b..66077f904e 100644 --- a/runtime/thread-inl.h +++ b/runtime/thread-inl.h @@ -146,9 +146,10 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { if (UNLIKELY(!done)) { // Failed to transition to Runnable. Release shared mutator_lock_ access and try again. Locks::mutator_lock_->SharedUnlock(this); + } else { + return static_cast<ThreadState>(old_state); } - } while (UNLIKELY(!done)); - return static_cast<ThreadState>(old_state); + } while (true); } inline void Thread::VerifyStack() { diff --git a/runtime/thread.cc b/runtime/thread.cc index 0ad01906e5..6d8ede5aa5 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -864,7 +864,8 @@ void Thread::DumpStack(std::ostream& os) const { // If we're currently in native code, dump that stack before dumping the managed stack. if (dump_for_abort || ShouldShowNativeStack(this)) { DumpKernelStack(os, GetTid(), " kernel: ", false); - DumpNativeStack(os, GetTid(), " native: ", false); + SirtRef<mirror::ArtMethod> method_ref(Thread::Current(), GetCurrentMethod(nullptr)); + DumpNativeStack(os, GetTid(), " native: ", false, method_ref.get()); } UniquePtr<Context> context(Context::Create()); StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(), !throwing_OutOfMemoryError_); @@ -1196,16 +1197,18 @@ mirror::Object* Thread::DecodeJObject(jobject obj) const { // The "kinds" below are sorted by the frequency we expect to encounter them. if (kind == kLocal) { IndirectReferenceTable& locals = jni_env_->locals; - result = const_cast<mirror::Object*>(locals.Get(ref)); + result = locals.Get(ref); } else if (kind == kSirtOrInvalid) { // TODO: make stack indirect reference table lookup more efficient. // Check if this is a local reference in the SIRT. 
if (LIKELY(SirtContains(obj))) { // Read from SIRT. result = reinterpret_cast<StackReference<mirror::Object>*>(obj)->AsMirrorPtr(); + VerifyObject(result); } else if (Runtime::Current()->GetJavaVM()->work_around_app_jni_bugs) { // Assume an invalid local reference is actually a direct pointer. result = reinterpret_cast<mirror::Object*>(obj); + VerifyObject(result); } else { result = kInvalidIndirectRefObject; } @@ -1225,10 +1228,6 @@ mirror::Object* Thread::DecodeJObject(jobject obj) const { if (UNLIKELY(result == nullptr)) { JniAbortF(nullptr, "use of deleted %s %p", ToStr<IndirectRefKind>(kind).c_str(), obj); - } else { - if (result != kInvalidIndirectRefObject) { - VerifyObject(result); - } } return result; } diff --git a/runtime/thread.h b/runtime/thread.h index c7ab735465..2ebc107942 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -24,13 +24,13 @@ #include <string> #include "base/macros.h" +#include "base/mutex.h" #include "entrypoints/interpreter/interpreter_entrypoints.h" #include "entrypoints/jni/jni_entrypoints.h" #include "entrypoints/portable/portable_entrypoints.h" #include "entrypoints/quick/quick_entrypoints.h" #include "globals.h" #include "jvalue.h" -#include "locks.h" #include "object_callbacks.h" #include "offsets.h" #include "runtime_stats.h" diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index d311945180..bddebbd5e7 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -151,7 +151,8 @@ void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread #if HAVE_TIMED_RWLOCK // Attempt to rectify locks so that we dump thread list with required locks before exiting. -static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) NO_THREAD_SAFETY_ANALYSIS { +static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) NO_THREAD_SAFETY_ANALYSIS __attribute__((noreturn)); +static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) { Runtime* runtime = Runtime::Current(); std::ostringstream ss; ss << "Thread suspend timeout\n"; @@ -159,6 +160,7 @@ static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) NO_THREAD_SAF ss << "\n"; runtime->GetThreadList()->DumpLocked(ss); LOG(FATAL) << ss.str(); + exit(0); } #endif @@ -193,10 +195,10 @@ static void ThreadSuspendSleep(Thread* self, useconds_t* delay_us, useconds_t* t size_t ThreadList::RunCheckpoint(Closure* checkpoint_function) { Thread* self = Thread::Current(); - if (kIsDebugBuild) { - Locks::mutator_lock_->AssertNotExclusiveHeld(self); - Locks::thread_list_lock_->AssertNotHeld(self); - Locks::thread_suspend_count_lock_->AssertNotHeld(self); + Locks::mutator_lock_->AssertNotExclusiveHeld(self); + Locks::thread_list_lock_->AssertNotHeld(self); + Locks::thread_suspend_count_lock_->AssertNotHeld(self); + if (kDebugLocking) { CHECK_NE(self->GetState(), kRunnable); } @@ -273,41 +275,41 @@ void ThreadList::SuspendAll() { VLOG(threads) << *self << " SuspendAll starting..."; - if (kIsDebugBuild) { - Locks::mutator_lock_->AssertNotHeld(self); - Locks::thread_list_lock_->AssertNotHeld(self); - Locks::thread_suspend_count_lock_->AssertNotHeld(self); + Locks::mutator_lock_->AssertNotHeld(self); + Locks::thread_list_lock_->AssertNotHeld(self); + Locks::thread_suspend_count_lock_->AssertNotHeld(self); + if (kDebugLocking) { CHECK_NE(self->GetState(), kRunnable); } { MutexLock mu(self, *Locks::thread_list_lock_); - { - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - // Update global suspend all state for attaching threads. 
- ++suspend_all_count_; - // Increment everybody's suspend count (except our own). - for (const auto& thread : list_) { - if (thread == self) { - continue; - } - VLOG(threads) << "requesting thread suspend: " << *thread; - thread->ModifySuspendCount(self, +1, false); + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + // Update global suspend all state for attaching threads. + ++suspend_all_count_; + // Increment everybody's suspend count (except our own). + for (const auto& thread : list_) { + if (thread == self) { + continue; } + VLOG(threads) << "requesting thread suspend: " << *thread; + thread->ModifySuspendCount(self, +1, false); } } // Block on the mutator lock until all Runnable threads release their share of access. #if HAVE_TIMED_RWLOCK // Timeout if we wait more than 30 seconds. - if (UNLIKELY(!Locks::mutator_lock_->ExclusiveLockWithTimeout(self, 30 * 1000, 0))) { + if (!Locks::mutator_lock_->ExclusiveLockWithTimeout(self, 30 * 1000, 0)) { UnsafeLogFatalForThreadSuspendAllTimeout(self); } #else Locks::mutator_lock_->ExclusiveLock(self); #endif - // Debug check that all threads are suspended. - AssertThreadsAreSuspended(self, self); + if (kDebugLocking) { + // Debug check that all threads are suspended. + AssertThreadsAreSuspended(self, self); + } VLOG(threads) << *self << " SuspendAll complete"; } @@ -317,8 +319,10 @@ void ThreadList::ResumeAll() { VLOG(threads) << *self << " ResumeAll starting"; - // Debug check that all threads are suspended. - AssertThreadsAreSuspended(self, self); + if (kDebugLocking) { + // Debug check that all threads are suspended. + AssertThreadsAreSuspended(self, self); + } Locks::mutator_lock_->ExclusiveUnlock(self); { diff --git a/runtime/thread_list.h b/runtime/thread_list.h index e98aed9c5d..1a76705f76 100644 --- a/runtime/thread_list.h +++ b/runtime/thread_list.h @@ -86,7 +86,7 @@ class ThreadList { // Run a checkpoint on threads, running threads are not suspended but run the checkpoint inside // of the suspend check. Returns how many checkpoints we should expect to run. 
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index e98aed9c5d..1a76705f76 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -86,7 +86,7 @@ class ThreadList {
   // Run a checkpoint on threads, running threads are not suspended but run the checkpoint inside
   // of the suspend check. Returns how many checkpoints we should expect to run.
-  size_t RunCheckpoint(Closure* checkpoint_function);
+  size_t RunCheckpoint(Closure* checkpoint_function) LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::thread_suspend_count_lock_);
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index e8f9afe62d..b8735a3136 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -23,7 +23,6 @@
 #include "barrier.h"
 #include "base/mutex.h"
 #include "closure.h"
-#include "locks.h"
 #include "mem_map.h"
 
 namespace art {
diff --git a/runtime/throw_location.h b/runtime/throw_location.h
index f30aa4ea1e..c171b0783a 100644
--- a/runtime/throw_location.h
+++ b/runtime/throw_location.h
@@ -19,6 +19,7 @@
 
 #include "object_callbacks.h"
 #include "base/macros.h"
+#include "base/mutex.h"
 
 #include <stdint.h>
 #include <string>
diff --git a/runtime/transaction.h b/runtime/transaction.h
index 68f95402c6..cf696de32b 100644
--- a/runtime/transaction.h
+++ b/runtime/transaction.h
@@ -19,10 +19,9 @@
 
 #include "base/macros.h"
 #include "base/mutex.h"
-#include "locks.h"
+#include "object_callbacks.h"
 #include "offsets.h"
 #include "primitive.h"
-#include "object_callbacks.h"
 #include "safe_map.h"
 
 #include <list>
diff --git a/runtime/utf.h b/runtime/utf.h
index 5b2289ef19..29f84997e6 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_UTF_H_
 
 #include "base/macros.h"
+#include "base/mutex.h"
 
 #include <stddef.h>
 #include <stdint.h>
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 237d217f0b..d2d23e8de6 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -38,6 +38,7 @@
 #include "mirror/string.h"
 #include "object_utils.h"
 #include "os.h"
+#include "scoped_thread_state_change.h"
 #include "utf-inl.h"
 
 #if !defined(HAVE_POSIX_CLOCKS)
@@ -1052,7 +1053,12 @@ static std::string CleanMapName(const backtrace_map_t* map) {
   return map->name.substr(last_slash + 1);
 }
 
-void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix, bool include_count) {
+void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix, bool include_count,
+                     mirror::ArtMethod* current_method) {
+  // We may be called from contexts where current_method is not null, so we must assert this.
+  if (current_method != nullptr) {
+    Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
+  }
   UniquePtr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, tid));
   if (!backtrace->Unwind(0)) {
     os << prefix << "(backtrace::Unwind failed for thread " << tid << ")\n";
@@ -1073,7 +1079,11 @@ void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix, bool inclu
     if (!it->func_name.empty()) {
       os << it->func_name;
     } else {
-      os << "???";
+      if (current_method != nullptr && current_method->IsWithinQuickCode(it->pc)) {
+        os << JniLongName(current_method) << "+" << (it->pc - current_method->GetQuickOatCodeOffset());
+      } else {
+        os << "???";
+      }
     }
     if (it->func_offset != 0) {
       os << "+" << it->func_offset;
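The LOCKS_EXCLUDED annotation added to RunCheckpoint in thread_list.h above, like the Assert*Held() calls in these hunks, feeds Clang's thread-safety analysis (-Wthread-safety). A small self-contained sketch, using a hypothetical Mutex wrapper rather than ART's, of how the attribute lets the compiler flag a caller that already holds the excluded lock:

    // Compile with: clang++ -std=c++11 -Wthread-safety -c example.cc
    #include <mutex>

    #define CAPABILITY(x)        __attribute__((capability(x)))
    #define ACQUIRE(...)         __attribute__((acquire_capability(__VA_ARGS__)))
    #define RELEASE(...)         __attribute__((release_capability(__VA_ARGS__)))
    #define LOCKS_EXCLUDED(...)  __attribute__((locks_excluded(__VA_ARGS__)))

    class CAPABILITY("mutex") Mutex {
     public:
      void Lock() ACQUIRE() { mu_.lock(); }
      void Unlock() RELEASE() { mu_.unlock(); }
     private:
      std::mutex mu_;
    };

    Mutex thread_list_lock;

    // Documents that callers must NOT hold thread_list_lock: the function takes it itself.
    void RunCheckpoint() LOCKS_EXCLUDED(thread_list_lock) {
      thread_list_lock.Lock();
      // ... visit each thread ...
      thread_list_lock.Unlock();
    }

    void BadCaller() {
      thread_list_lock.Lock();
      RunCheckpoint();  // warning: cannot call 'RunCheckpoint' while 'thread_list_lock' is held
      thread_list_lock.Unlock();
    }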
diff --git a/runtime/utils.h b/runtime/utils.h
index bcbeb0ea63..dbc3ab7634 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -373,7 +373,9 @@ std::string GetSchedulerGroupName(pid_t tid);
 void SetThreadName(const char* thread_name);
 
 // Dumps the native stack for thread 'tid' to 'os'.
-void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix = "", bool include_count = true);
+void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix = "",
+                     bool include_count = true, mirror::ArtMethod* current_method = nullptr)
+    NO_THREAD_SAFETY_ANALYSIS;
 
 // Dumps the kernel stack for thread 'tid' to 'os'. Note that this is only available on linux-x86.
 void DumpKernelStack(std::ostream& os, pid_t tid, const char* prefix = "", bool include_count = true);
diff --git a/runtime/verifier/method_verifier-inl.h b/runtime/verifier/method_verifier-inl.h
index 74c3e33531..c5543940e8 100644
--- a/runtime/verifier/method_verifier-inl.h
+++ b/runtime/verifier/method_verifier-inl.h
@@ -21,6 +21,7 @@
 #include "method_verifier.h"
 #include "mirror/class_loader.h"
 #include "mirror/dex_cache.h"
+#include "sirt_ref-inl.h"
 
 namespace art {
 namespace verifier {
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 555714f8f5..c4c3082918 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "method_verifier.h"
+#include "method_verifier-inl.h"
 
 #include <iostream>
 
@@ -40,6 +40,7 @@
 #include "register_line-inl.h"
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
+#include "sirt_ref-inl.h"
 #include "verifier/dex_gc_map.h"
 
 namespace art {
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index 031cfec3b1..5f13191bbe 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -33,12 +33,12 @@
 #include "reg_type_cache-inl.h"
 #include "register_line.h"
 #include "safe_map.h"
-#include "sirt_ref.h"
 #include "UniquePtr.h"
 
 namespace art {
 
 struct ReferenceMap2Visitor;
+template<class T> class SirtRef;
 
 namespace verifier {
diff --git a/runtime/verify_object.h b/runtime/verify_object.h
index b39df4a374..6640e0dd4a 100644
--- a/runtime/verify_object.h
+++ b/runtime/verify_object.h
@@ -17,10 +17,10 @@
 #ifndef ART_RUNTIME_VERIFY_OBJECT_H_
 #define ART_RUNTIME_VERIFY_OBJECT_H_
 
-#include "locks.h"
-
 #include <stdint.h>
 
+#include "base/macros.h"
+
 namespace art {
 
 namespace mirror {
@@ -52,10 +52,10 @@ static constexpr VerifyObjectFlags kDefaultVerifyFlags = kVerifyNone;
 static constexpr VerifyObjectMode kVerifyObjectSupport =
     kDefaultVerifyFlags != 0 ? kVerifyObjectModeFast : kVerifyObjectModeDisabled;
 
-ALWAYS_INLINE inline void VerifyObject(mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
+void VerifyObject(mirror::Object* obj) ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS;
 
 // Check that c.getClass() == c.getClass().getClass().
-ALWAYS_INLINE inline bool VerifyClassClass(mirror::Class* c) NO_THREAD_SAFETY_ANALYSIS;
+bool VerifyClassClass(mirror::Class* c) ALWAYS_INLINE NO_THREAD_SAFETY_ANALYSIS;
 
 } // namespace art
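The method_verifier.h hunk above replaces the sirt_ref.h include with a forward declaration, since the header only needs the name SirtRef<T>, not its definition; method_verifier.cc, which does use the type, now includes sirt_ref-inl.h instead. A minimal sketch of this include-trimming pattern, with hypothetical file and type names:

    // widget.h -- only references to Handle<T> appear here, so a forward
    // declaration is enough and the heavier handle.h include can be dropped.
    #ifndef WIDGET_H_
    #define WIDGET_H_

    template<class T> class Handle;  // Mirrors "template<class T> class SirtRef;" above.
    class Thing;

    class Widget {
     public:
      // Only a reference to Handle<Thing> is named; its definition is not required
      // until the member function is actually defined.
      void Process(const Handle<Thing>& thing);
    };

    #endif  // WIDGET_H_

    // widget.cc -- the definition is needed here, so the full header is included
    // (much as method_verifier.cc now pulls in sirt_ref-inl.h):
    //   #include "handle.h"
    //   void Widget::Process(const Handle<Thing>& thing) { /* use thing */ }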