Diffstat (limited to 'compiler')
-rw-r--r--  compiler/compiled_method.cc                      |   6
-rw-r--r--  compiler/compiled_method.h                       |   6
-rw-r--r--  compiler/debug/elf_symtab_writer.h               |  29
-rw-r--r--  compiler/dex/dex_to_dex_compiler.cc              | 206
-rw-r--r--  compiler/dex/dex_to_dex_compiler.h               |  97
-rw-r--r--  compiler/driver/compiler_driver.cc               | 528
-rw-r--r--  compiler/driver/compiler_driver.h                |  13
-rw-r--r--  compiler/jit/jit_compiler.cc                     |   2
-rw-r--r--  compiler/linker/elf_builder.h                    |  20
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc   |  17
-rw-r--r--  compiler/optimizing/code_generator_mips.cc       |  66
-rw-r--r--  compiler/optimizing/code_generator_mips64.cc     |  66
-rw-r--r--  compiler/optimizing/dead_code_elimination.cc     |  64
-rw-r--r--  compiler/optimizing/load_store_elimination.cc    | 381
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc       |   2
-rw-r--r--  compiler/utils/atomic_dex_ref_map-inl.h          |  12
-rw-r--r--  compiler/utils/atomic_dex_ref_map.h              |   3
17 files changed, 924 insertions(+), 594 deletions(-)
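The largest change below, in compiler/driver/compiler_driver.cc, replaces the single CompileMethod() that branched on driver->GetCompilingDexToDex() with a CompileMethodHarness() template that keeps the shared timing and result bookkeeping and takes the pass-specific logic as a callable, plus two thin wrappers, CompileMethodQuick() and CompileMethodDex2Dex(). A minimal standalone sketch of that shape (names and the timing threshold are illustrative stand-ins, not ART's real API):

// Sketch of the harness-plus-callable split used by the compiler_driver.cc
// refactor below. All names and the warning threshold are stand-ins.
#include <chrono>
#include <cstdio>

struct CompiledMethod;                        // opaque result of a successful compile
struct MethodReference { const char* pretty_name; };

template <typename CompileFn>
CompiledMethod* CompileWithHarness(const MethodReference& method, CompileFn compile_fn) {
  const auto start = std::chrono::steady_clock::now();
  CompiledMethod* compiled = compile_fn(method);        // pass-specific work only
  const auto duration = std::chrono::steady_clock::now() - start;
  if (duration > std::chrono::milliseconds(100)) {      // shared slow-compile warning
    std::fprintf(stderr, "slow compilation of %s\n", method.pretty_name);
  }
  return compiled;                                      // may be null if the pass declined
}

// Each pass then becomes a thin wrapper supplying only its own lambda, e.g.:
//   CompiledMethod* quick = CompileWithHarness(ref, [&](const MethodReference& m) {
//     return RunOptimizingOrJniCompile(m);             // hypothetical quick path
//   });
//   CompiledMethod* d2d = CompileWithHarness(ref, [&](const MethodReference& m) {
//     return RunDexToDexQuickening(m);                 // hypothetical dex-to-dex path
//   });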
diff --git a/compiler/compiled_method.cc b/compiler/compiled_method.cc index 0f69dbab94..e41371855d 100644 --- a/compiler/compiled_method.cc +++ b/compiler/compiled_method.cc @@ -159,10 +159,4 @@ CompiledMethod::~CompiledMethod() { storage->ReleaseMethodInfo(method_info_); } -void CompiledMethod::ReleaseVMapTable() { - CompiledMethodStorage* storage = GetCompilerDriver()->GetCompiledMethodStorage(); - storage->ReleaseVMapTable(vmap_table_); - vmap_table_ = nullptr; -} - } // namespace art diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h index 4e8f3efe5a..acdce260e5 100644 --- a/compiler/compiled_method.h +++ b/compiler/compiled_method.h @@ -168,10 +168,6 @@ class CompiledMethod FINAL : public CompiledCode { ArrayRef<const linker::LinkerPatch> GetPatches() const; - // The compiler sometimes unquickens shared code items. In that case, we need to clear the vmap - // table to avoid writing the quicken info to the vdex file. - void ReleaseVMapTable(); - private: static constexpr size_t kIsIntrinsicLsb = kNumberOfCompiledCodePackedBits; static constexpr size_t kIsIntrinsicSize = 1u; @@ -190,7 +186,7 @@ class CompiledMethod FINAL : public CompiledCode { // For quick code, method specific information that is not very dedupe friendly (method indices). const LengthPrefixedArray<uint8_t>* const method_info_; // For quick code, holds code infos which contain stack maps, inline information, and etc. - const LengthPrefixedArray<uint8_t>* vmap_table_; + const LengthPrefixedArray<uint8_t>* const vmap_table_; // For quick code, a FDE entry for the debug_frame section. const LengthPrefixedArray<uint8_t>* const cfi_info_; // For quick code, linker patches needed by the method. diff --git a/compiler/debug/elf_symtab_writer.h b/compiler/debug/elf_symtab_writer.h index 9c9e8b35b8..4b19547d28 100644 --- a/compiler/debug/elf_symtab_writer.h +++ b/compiler/debug/elf_symtab_writer.h @@ -101,7 +101,7 @@ static void WriteDebugSymbols(linker::ElfBuilder<ElfTypes>* builder, } } } - // Add symbols for interpreted methods (with address range of the method's bytecode). + // Add symbols for dex files. if (!debug_info.dex_files.empty() && builder->GetDex()->Exists()) { auto dex = builder->GetDex(); for (auto it : debug_info.dex_files) { @@ -109,33 +109,6 @@ static void WriteDebugSymbols(linker::ElfBuilder<ElfTypes>* builder, const DexFile* dex_file = it.second; typename ElfTypes::Word dex_name = strtab->Write(kDexFileSymbolName); symtab->Add(dex_name, dex, dex_address, dex_file->Size(), STB_GLOBAL, STT_FUNC); - if (mini_debug_info) { - continue; // Don't add interpreter method names to mini-debug-info for now. 
- } - for (uint32_t i = 0; i < dex_file->NumClassDefs(); ++i) { - const DexFile::ClassDef& class_def = dex_file->GetClassDef(i); - const uint8_t* class_data = dex_file->GetClassData(class_def); - if (class_data == nullptr) { - continue; - } - for (ClassDataItemIterator item(*dex_file, class_data); item.HasNext(); item.Next()) { - if (!item.IsAtMethod()) { - continue; - } - const DexFile::CodeItem* code_item = item.GetMethodCodeItem(); - if (code_item == nullptr) { - continue; - } - CodeItemInstructionAccessor code(*dex_file, code_item); - DCHECK(code.HasCodeItem()); - std::string name = dex_file->PrettyMethod(item.GetMemberIndex(), !mini_debug_info); - size_t name_offset = strtab->Write(name); - uint64_t offset = reinterpret_cast<const uint8_t*>(code.Insns()) - dex_file->Begin(); - uint64_t address = dex_address + offset; - size_t size = code.InsnsSizeInCodeUnits() * sizeof(uint16_t); - symtab->Add(name_offset, dex, address, size, STB_GLOBAL, STT_FUNC); - } - } } } strtab->End(); diff --git a/compiler/dex/dex_to_dex_compiler.cc b/compiler/dex/dex_to_dex_compiler.cc index 28c7fe2c34..9f0aaa4e10 100644 --- a/compiler/dex/dex_to_dex_compiler.cc +++ b/compiler/dex/dex_to_dex_compiler.cc @@ -45,6 +45,85 @@ const bool kEnableQuickening = true; // Control check-cast elision. const bool kEnableCheckCastEllision = true; +// Holds the state for compiling a single method. +struct DexToDexCompiler::CompilationState { + struct QuickenedInfo { + QuickenedInfo(uint32_t pc, uint16_t index) : dex_pc(pc), dex_member_index(index) {} + + uint32_t dex_pc; + uint16_t dex_member_index; + }; + + CompilationState(DexToDexCompiler* compiler, + const DexCompilationUnit& unit, + const CompilationLevel compilation_level, + const std::vector<uint8_t>* quicken_data); + + const std::vector<QuickenedInfo>& GetQuickenedInfo() const { + return quickened_info_; + } + + // Returns the quickening info, or an empty array if it was not quickened. + // If already_quickened is true, then don't change anything but still return what the quicken + // data would have been. + std::vector<uint8_t> Compile(); + + const DexFile& GetDexFile() const; + + // Compiles a RETURN-VOID into a RETURN-VOID-BARRIER within a constructor where + // a barrier is required. + void CompileReturnVoid(Instruction* inst, uint32_t dex_pc); + + // Compiles a CHECK-CAST into 2 NOP instructions if it is known to be safe. In + // this case, returns the second NOP instruction pointer. Otherwise, returns + // the given "inst". + Instruction* CompileCheckCast(Instruction* inst, uint32_t dex_pc); + + // Compiles a field access into a quick field access. + // The field index is replaced by an offset within an Object where we can read + // from / write to this field. Therefore, this does not involve any resolution + // at runtime. + // Since the field index is encoded with 16 bits, we can replace it only if the + // field offset can be encoded with 16 bits too. + void CompileInstanceFieldAccess(Instruction* inst, uint32_t dex_pc, + Instruction::Code new_opcode, bool is_put); + + // Compiles a virtual method invocation into a quick virtual method invocation. + // The method index is replaced by the vtable index where the corresponding + // executable can be found. Therefore, this does not involve any resolution + // at runtime. + // Since the method index is encoded with 16 bits, we can replace it only if the + // vtable index can be encoded with 16 bits too. 
+ void CompileInvokeVirtual(Instruction* inst, uint32_t dex_pc, + Instruction::Code new_opcode, bool is_range); + + // Return the next index. + uint16_t NextIndex(); + + // Returns the dequickened index if an instruction is quickened, otherwise return index. + uint16_t GetIndexForInstruction(const Instruction* inst, uint32_t index); + + DexToDexCompiler* const compiler_; + CompilerDriver& driver_; + const DexCompilationUnit& unit_; + const CompilationLevel compilation_level_; + + // Filled by the compiler when quickening, in order to encode that information + // in the .oat file. The runtime will use that information to get to the original + // opcodes. + std::vector<QuickenedInfo> quickened_info_; + + // True if we optimized a return void to a return void no barrier. + bool optimized_return_void_ = false; + + // If the code item was already quickened previously. + const bool already_quickened_; + const QuickenInfoTable existing_quicken_info_; + uint32_t quicken_index_ = 0u; + + DISALLOW_COPY_AND_ASSIGN(CompilationState); +}; + DexToDexCompiler::DexToDexCompiler(CompilerDriver* driver) : driver_(driver), lock_("Quicken lock", kDexToDexCompilerLock) { @@ -55,16 +134,13 @@ void DexToDexCompiler::ClearState() { MutexLock lock(Thread::Current(), lock_); active_dex_file_ = nullptr; active_bit_vector_ = nullptr; - seen_code_items_.clear(); should_quicken_.clear(); - shared_code_items_.clear(); - blacklisted_code_items_.clear(); shared_code_item_quicken_info_.clear(); } -size_t DexToDexCompiler::NumUniqueCodeItems(Thread* self) const { +size_t DexToDexCompiler::NumCodeItemsToQuicken(Thread* self) const { MutexLock lock(self, lock_); - return seen_code_items_.size(); + return num_code_items_; } BitVector* DexToDexCompiler::GetOrAddBitVectorForDex(const DexFile* dex_file) { @@ -80,17 +156,13 @@ BitVector* DexToDexCompiler::GetOrAddBitVectorForDex(const DexFile* dex_file) { } void DexToDexCompiler::MarkForCompilation(Thread* self, - const MethodReference& method_ref, - const DexFile::CodeItem* code_item) { + const MethodReference& method_ref) { MutexLock lock(self, lock_); BitVector* const bitmap = GetOrAddBitVectorForDex(method_ref.dex_file); DCHECK(bitmap != nullptr); DCHECK(!bitmap->IsBitSet(method_ref.index)); bitmap->SetBit(method_ref.index); - // Detect the shared code items. - if (!seen_code_items_.insert(code_item).second) { - shared_code_items_.insert(code_item); - } + ++num_code_items_; } DexToDexCompiler::CompilationState::CompilationState(DexToDexCompiler* compiler, @@ -316,6 +388,7 @@ void DexToDexCompiler::CompilationState::CompileReturnVoid(Instruction* inst, ui << " at dex pc " << StringPrintf("0x%x", dex_pc) << " in method " << GetDexFile().PrettyMethod(unit_.GetDexMethodIndex(), true); inst->SetOpcode(Instruction::RETURN_VOID_NO_BARRIER); + optimized_return_void_ = true; } Instruction* DexToDexCompiler::CompilationState::CompileCheckCast(Instruction* inst, @@ -464,51 +537,48 @@ CompiledMethod* DexToDexCompiler::CompileMethod( // If the code item is shared with multiple different method ids, make sure that we quicken only // once and verify that all the dequicken maps match. if (UNLIKELY(shared_code_items_.find(code_item) != shared_code_items_.end())) { - // For shared code items, use a lock to prevent races. - MutexLock mu(soa.Self(), lock_); - // Blacklisted means there was a quickening conflict previously, bail early. 
- if (blacklisted_code_items_.find(code_item) != blacklisted_code_items_.end()) { + // Avoid quickening the shared code items for now because the existing conflict detection logic + // does not currently handle cases where the code item is quickened in one place but + // compiled in another. + static constexpr bool kAvoidQuickeningSharedCodeItems = true; + if (kAvoidQuickeningSharedCodeItems) { return nullptr; } + // For shared code items, use a lock to prevent races. + MutexLock mu(soa.Self(), lock_); auto existing = shared_code_item_quicken_info_.find(code_item); - const bool already_quickened = existing != shared_code_item_quicken_info_.end(); + QuickenState* existing_data = nullptr; + std::vector<uint8_t>* existing_quicken_data = nullptr; + if (existing != shared_code_item_quicken_info_.end()) { + existing_data = &existing->second; + if (existing_data->conflict_) { + return nullptr; + } + existing_quicken_data = &existing_data->quicken_data_; + } + bool optimized_return_void; { - CompilationState state(this, - unit, - compilation_level, - already_quickened ? &existing->second.quicken_data_ : nullptr); + CompilationState state(this, unit, compilation_level, existing_quicken_data); quicken_data = state.Compile(); + optimized_return_void = state.optimized_return_void_; } // Already quickened, check that the data matches what was previously seen. MethodReference method_ref(&dex_file, method_idx); - if (already_quickened) { - QuickenState* const existing_data = &existing->second; - if (existing_data->quicken_data_ != quicken_data) { - VLOG(compiler) << "Quicken data mismatch, dequickening method " + if (existing_data != nullptr) { + if (*existing_quicken_data != quicken_data || + existing_data->optimized_return_void_ != optimized_return_void) { + VLOG(compiler) << "Quicken data mismatch, for method " << dex_file.PrettyMethod(method_idx); - // Unquicken using the existing quicken data. - optimizer::ArtDecompileDEX(dex_file, - *code_item, - ArrayRef<const uint8_t>(existing_data->quicken_data_), - /* decompile_return_instruction*/ false); - // Go clear the vmaps for all the methods that were already quickened to avoid writing them - // out during oat writing. - for (const MethodReference& ref : existing_data->methods_) { - CompiledMethod* method = driver_->GetCompiledMethod(ref); - DCHECK(method != nullptr); - method->ReleaseVMapTable(); - } - // Blacklist the method to never attempt to quicken it in the future. - blacklisted_code_items_.insert(code_item); - shared_code_item_quicken_info_.erase(existing); - return nullptr; + // Mark the method as a conflict to never attempt to quicken it in the future. + existing_data->conflict_ = true; } existing_data->methods_.push_back(method_ref); } else { QuickenState new_state; new_state.methods_.push_back(method_ref); new_state.quicken_data_ = quicken_data; + new_state.optimized_return_void_ = optimized_return_void; bool inserted = shared_code_item_quicken_info_.emplace(code_item, new_state).second; CHECK(inserted) << "Failed to insert " << dex_file.PrettyMethod(method_idx); } @@ -556,9 +626,65 @@ CompiledMethod* DexToDexCompiler::CompileMethod( ArrayRef<const uint8_t>(quicken_data), // vmap_table ArrayRef<const uint8_t>(), // cfi data ArrayRef<const linker::LinkerPatch>()); + DCHECK(ret != nullptr); return ret; } +void DexToDexCompiler::SetDexFiles(const std::vector<const DexFile*>& dex_files) { + // Record what code items are already seen to detect when multiple methods have the same code + // item. 
+ std::unordered_set<const DexFile::CodeItem*> seen_code_items; + for (const DexFile* dex_file : dex_files) { + for (size_t i = 0; i < dex_file->NumClassDefs(); ++i) { + const DexFile::ClassDef& class_def = dex_file->GetClassDef(i); + const uint8_t* class_data = dex_file->GetClassData(class_def); + if (class_data == nullptr) { + continue; + } + ClassDataItemIterator it(*dex_file, class_data); + it.SkipAllFields(); + for (; it.HasNextMethod(); it.Next()) { + const DexFile::CodeItem* code_item = it.GetMethodCodeItem(); + // Detect the shared code items. + if (!seen_code_items.insert(code_item).second) { + shared_code_items_.insert(code_item); + } + } + } + } + VLOG(compiler) << "Shared code items " << shared_code_items_.size(); +} + +void DexToDexCompiler::UnquickenConflictingMethods() { + MutexLock mu(Thread::Current(), lock_); + size_t unquicken_count = 0; + for (const auto& pair : shared_code_item_quicken_info_) { + const DexFile::CodeItem* code_item = pair.first; + const QuickenState& state = pair.second; + CHECK_GE(state.methods_.size(), 1u); + if (state.conflict_) { + // Unquicken using the existing quicken data. + // TODO: Do we really need to pass a dex file in? + optimizer::ArtDecompileDEX(*state.methods_[0].dex_file, + *code_item, + ArrayRef<const uint8_t>(state.quicken_data_), + /* decompile_return_instruction*/ true); + ++unquicken_count; + // Go clear the vmaps for all the methods that were already quickened to avoid writing them + // out during oat writing. + for (const MethodReference& ref : state.methods_) { + CompiledMethod* method = driver_->RemoveCompiledMethod(ref); + if (method != nullptr) { + // There is up to one compiled method for each method ref. Releasing it leaves the + // deduped data intact, this means its safe to do even when other threads might be + // compiling. + CompiledMethod::ReleaseSwapAllocatedCompiledMethod(driver_, method); + } + } + } + } +} + } // namespace optimizer } // namespace art diff --git a/compiler/dex/dex_to_dex_compiler.h b/compiler/dex/dex_to_dex_compiler.h index 2105a9ded4..7df09f140c 100644 --- a/compiler/dex/dex_to_dex_compiler.h +++ b/compiler/dex/dex_to_dex_compiler.h @@ -59,99 +59,35 @@ class DexToDexCompiler { const CompilationLevel compilation_level) WARN_UNUSED; void MarkForCompilation(Thread* self, - const MethodReference& method_ref, - const DexFile::CodeItem* code_item); + const MethodReference& method_ref); void ClearState(); + // Unquicken all methods that have conflicting quicken info. This is not done during the + // quickening process to avoid race conditions. + void UnquickenConflictingMethods(); + CompilerDriver* GetDriver() { return driver_; } bool ShouldCompileMethod(const MethodReference& ref); - size_t NumUniqueCodeItems(Thread* self) const; + // Return the number of code items to quicken. + size_t NumCodeItemsToQuicken(Thread* self) const; + + void SetDexFiles(const std::vector<const DexFile*>& dex_files); private: // Holds the state for compiling a single method. - struct CompilationState { - struct QuickenedInfo { - QuickenedInfo(uint32_t pc, uint16_t index) : dex_pc(pc), dex_member_index(index) {} - - uint32_t dex_pc; - uint16_t dex_member_index; - }; - - CompilationState(DexToDexCompiler* compiler, - const DexCompilationUnit& unit, - const CompilationLevel compilation_level, - const std::vector<uint8_t>* quicken_data); - - const std::vector<QuickenedInfo>& GetQuickenedInfo() const { - return quickened_info_; - } - - // Returns the quickening info, or an empty array if it was not quickened. 
- // If already_quickened is true, then don't change anything but still return what the quicken - // data would have been. - std::vector<uint8_t> Compile(); - - const DexFile& GetDexFile() const; - - // Compiles a RETURN-VOID into a RETURN-VOID-BARRIER within a constructor where - // a barrier is required. - void CompileReturnVoid(Instruction* inst, uint32_t dex_pc); - - // Compiles a CHECK-CAST into 2 NOP instructions if it is known to be safe. In - // this case, returns the second NOP instruction pointer. Otherwise, returns - // the given "inst". - Instruction* CompileCheckCast(Instruction* inst, uint32_t dex_pc); - - // Compiles a field access into a quick field access. - // The field index is replaced by an offset within an Object where we can read - // from / write to this field. Therefore, this does not involve any resolution - // at runtime. - // Since the field index is encoded with 16 bits, we can replace it only if the - // field offset can be encoded with 16 bits too. - void CompileInstanceFieldAccess(Instruction* inst, uint32_t dex_pc, - Instruction::Code new_opcode, bool is_put); - - // Compiles a virtual method invocation into a quick virtual method invocation. - // The method index is replaced by the vtable index where the corresponding - // executable can be found. Therefore, this does not involve any resolution - // at runtime. - // Since the method index is encoded with 16 bits, we can replace it only if the - // vtable index can be encoded with 16 bits too. - void CompileInvokeVirtual(Instruction* inst, uint32_t dex_pc, - Instruction::Code new_opcode, bool is_range); - - // Return the next index. - uint16_t NextIndex(); - - // Returns the dequickened index if an instruction is quickened, otherwise return index. - uint16_t GetIndexForInstruction(const Instruction* inst, uint32_t index); - - DexToDexCompiler* const compiler_; - CompilerDriver& driver_; - const DexCompilationUnit& unit_; - const CompilationLevel compilation_level_; - - // Filled by the compiler when quickening, in order to encode that information - // in the .oat file. The runtime will use that information to get to the original - // opcodes. - std::vector<QuickenedInfo> quickened_info_; - - // If the code item was already quickened previously. - const bool already_quickened_; - const QuickenInfoTable existing_quicken_info_; - uint32_t quicken_index_ = 0u; - - DISALLOW_COPY_AND_ASSIGN(CompilationState); - }; + struct CompilationState; + // Quicken state for a code item, may be referenced by multiple methods. struct QuickenState { std::vector<MethodReference> methods_; std::vector<uint8_t> quicken_data_; + bool optimized_return_void_ = false; + bool conflict_ = false; }; BitVector* GetOrAddBitVectorForDex(const DexFile* dex_file) REQUIRES(lock_); @@ -166,15 +102,14 @@ class DexToDexCompiler { mutable Mutex lock_; // Record what method references are going to get quickened. std::unordered_map<const DexFile*, BitVector> should_quicken_; - // Record what code items are already seen to detect when multiple methods have the same code - // item. - std::unordered_set<const DexFile::CodeItem*> seen_code_items_ GUARDED_BY(lock_); // Guarded by lock_ during writing, accessed without a lock during quickening. // This is safe because no thread is adding to the shared code items during the quickening phase. 
std::unordered_set<const DexFile::CodeItem*> shared_code_items_; - std::unordered_set<const DexFile::CodeItem*> blacklisted_code_items_ GUARDED_BY(lock_); + // Blacklisted code items are unquickened in UnquickenConflictingMethods. std::unordered_map<const DexFile::CodeItem*, QuickenState> shared_code_item_quicken_info_ GUARDED_BY(lock_); + // Number of added code items. + size_t num_code_items_ GUARDED_BY(lock_) = 0u; }; std::ostream& operator<<(std::ostream& os, const DexToDexCompiler::CompilationLevel& rhs); diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index c617f54f55..fb428b8d9a 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -289,7 +289,6 @@ CompilerDriver::CompilerDriver( compiled_method_storage_(swap_fd), profile_compilation_info_(profile_compilation_info), max_arena_alloc_(0), - compiling_dex_to_dex_(false), dex_to_dex_compiler_(this) { DCHECK(compiler_options_ != nullptr); @@ -450,99 +449,39 @@ static bool InstructionSetHasGenericJniStub(InstructionSet isa) { } } -static void CompileMethod(Thread* self, - CompilerDriver* driver, - const DexFile::CodeItem* code_item, - uint32_t access_flags, - InvokeType invoke_type, - uint16_t class_def_idx, - uint32_t method_idx, - Handle<mirror::ClassLoader> class_loader, - const DexFile& dex_file, - optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, - bool compilation_enabled, - Handle<mirror::DexCache> dex_cache) { +template <typename CompileFn> +static void CompileMethodHarness( + Thread* self, + CompilerDriver* driver, + const DexFile::CodeItem* code_item, + uint32_t access_flags, + InvokeType invoke_type, + uint16_t class_def_idx, + uint32_t method_idx, + Handle<mirror::ClassLoader> class_loader, + const DexFile& dex_file, + optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, + bool compilation_enabled, + Handle<mirror::DexCache> dex_cache, + CompileFn compile_fn) { DCHECK(driver != nullptr); - CompiledMethod* compiled_method = nullptr; + CompiledMethod* compiled_method; uint64_t start_ns = kTimeCompileMethod ? NanoTime() : 0; MethodReference method_ref(&dex_file, method_idx); - if (driver->GetCompilingDexToDex()) { - optimizer::DexToDexCompiler* const compiler = &driver->GetDexToDexCompiler(); - // This is the second pass when we dex-to-dex compile previously marked methods. - // TODO: Refactor the compilation to avoid having to distinguish the two passes - // here. That should be done on a higher level. http://b/29089975 - if (compiler->ShouldCompileMethod(method_ref)) { - VerificationResults* results = driver->GetVerificationResults(); - DCHECK(results != nullptr); - const VerifiedMethod* verified_method = results->GetVerifiedMethod(method_ref); - // Do not optimize if a VerifiedMethod is missing. SafeCast elision, - // for example, relies on it. - compiled_method = compiler->CompileMethod( - code_item, - access_flags, - invoke_type, - class_def_idx, - method_idx, - class_loader, - dex_file, - (verified_method != nullptr) - ? dex_to_dex_compilation_level - : optimizer::DexToDexCompiler::CompilationLevel::kDontDexToDexCompile); - } - } else if ((access_flags & kAccNative) != 0) { - // Are we extracting only and have support for generic JNI down calls? 
- if (!driver->GetCompilerOptions().IsJniCompilationEnabled() && - InstructionSetHasGenericJniStub(driver->GetInstructionSet())) { - // Leaving this empty will trigger the generic JNI version - } else { - // Query any JNI optimization annotations such as @FastNative or @CriticalNative. - access_flags |= annotations::GetNativeMethodAnnotationAccessFlags( - dex_file, dex_file.GetClassDef(class_def_idx), method_idx); + compiled_method = compile_fn(self, + driver, + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + dex_file, + dex_to_dex_compilation_level, + compilation_enabled, + dex_cache); - compiled_method = driver->GetCompiler()->JniCompile( - access_flags, method_idx, dex_file, dex_cache); - CHECK(compiled_method != nullptr); - } - } else if ((access_flags & kAccAbstract) != 0) { - // Abstract methods don't have code. - } else { - VerificationResults* results = driver->GetVerificationResults(); - DCHECK(results != nullptr); - const VerifiedMethod* verified_method = results->GetVerifiedMethod(method_ref); - bool compile = compilation_enabled && - // Basic checks, e.g., not <clinit>. - results->IsCandidateForCompilation(method_ref, access_flags) && - // Did not fail to create VerifiedMethod metadcata. - verified_method != nullptr && - // Do not have failures that should punt to the interpreter. - !verified_method->HasRuntimeThrow() && - (verified_method->GetEncounteredVerificationFailures() & - (verifier::VERIFY_ERROR_FORCE_INTERPRETER | verifier::VERIFY_ERROR_LOCKING)) == 0 && - // Is eligable for compilation by methods-to-compile filter. - driver->IsMethodToCompile(method_ref) && - driver->ShouldCompileBasedOnProfile(method_ref); - - if (compile) { - // NOTE: if compiler declines to compile this method, it will return null. - compiled_method = driver->GetCompiler()->Compile(code_item, - access_flags, - invoke_type, - class_def_idx, - method_idx, - class_loader, - dex_file, - dex_cache); - } - if (compiled_method == nullptr && - dex_to_dex_compilation_level != - optimizer::DexToDexCompiler::CompilationLevel::kDontDexToDexCompile) { - DCHECK(!Runtime::Current()->UseJitCompilation()); - DCHECK(!driver->GetCompilingDexToDex()); - // TODO: add a command-line option to disable DEX-to-DEX compilation ? 
- driver->GetDexToDexCompiler().MarkForCompilation(self, method_ref, code_item); - } - } if (kTimeCompileMethod) { uint64_t duration_ns = NanoTime() - start_ns; if (duration_ns > MsToNs(driver->GetCompiler()->GetMaximumCompilationTimeBeforeWarning())) { @@ -573,6 +512,170 @@ static void CompileMethod(Thread* self, } } +static void CompileMethodDex2Dex( + Thread* self, + CompilerDriver* driver, + const DexFile::CodeItem* code_item, + uint32_t access_flags, + InvokeType invoke_type, + uint16_t class_def_idx, + uint32_t method_idx, + Handle<mirror::ClassLoader> class_loader, + const DexFile& dex_file, + optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, + bool compilation_enabled, + Handle<mirror::DexCache> dex_cache) { + auto dex_2_dex_fn = [](Thread* self ATTRIBUTE_UNUSED, + CompilerDriver* driver, + const DexFile::CodeItem* code_item, + uint32_t access_flags, + InvokeType invoke_type, + uint16_t class_def_idx, + uint32_t method_idx, + Handle<mirror::ClassLoader> class_loader, + const DexFile& dex_file, + optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, + bool compilation_enabled ATTRIBUTE_UNUSED, + Handle<mirror::DexCache> dex_cache ATTRIBUTE_UNUSED) -> CompiledMethod* { + DCHECK(driver != nullptr); + MethodReference method_ref(&dex_file, method_idx); + + optimizer::DexToDexCompiler* const compiler = &driver->GetDexToDexCompiler(); + + if (compiler->ShouldCompileMethod(method_ref)) { + VerificationResults* results = driver->GetVerificationResults(); + DCHECK(results != nullptr); + const VerifiedMethod* verified_method = results->GetVerifiedMethod(method_ref); + // Do not optimize if a VerifiedMethod is missing. SafeCast elision, + // for example, relies on it. + return compiler->CompileMethod( + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + dex_file, + (verified_method != nullptr) + ? dex_to_dex_compilation_level + : optimizer::DexToDexCompiler::CompilationLevel::kDontDexToDexCompile); + } + return nullptr; + }; + CompileMethodHarness(self, + driver, + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + dex_file, + dex_to_dex_compilation_level, + compilation_enabled, + dex_cache, + dex_2_dex_fn); +} + +static void CompileMethodQuick( + Thread* self, + CompilerDriver* driver, + const DexFile::CodeItem* code_item, + uint32_t access_flags, + InvokeType invoke_type, + uint16_t class_def_idx, + uint32_t method_idx, + Handle<mirror::ClassLoader> class_loader, + const DexFile& dex_file, + optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, + bool compilation_enabled, + Handle<mirror::DexCache> dex_cache) { + auto quick_fn = []( + Thread* self, + CompilerDriver* driver, + const DexFile::CodeItem* code_item, + uint32_t access_flags, + InvokeType invoke_type, + uint16_t class_def_idx, + uint32_t method_idx, + Handle<mirror::ClassLoader> class_loader, + const DexFile& dex_file, + optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level, + bool compilation_enabled, + Handle<mirror::DexCache> dex_cache) { + DCHECK(driver != nullptr); + CompiledMethod* compiled_method = nullptr; + MethodReference method_ref(&dex_file, method_idx); + + if ((access_flags & kAccNative) != 0) { + // Are we extracting only and have support for generic JNI down calls? 
+ if (!driver->GetCompilerOptions().IsJniCompilationEnabled() && + InstructionSetHasGenericJniStub(driver->GetInstructionSet())) { + // Leaving this empty will trigger the generic JNI version + } else { + // Query any JNI optimization annotations such as @FastNative or @CriticalNative. + access_flags |= annotations::GetNativeMethodAnnotationAccessFlags( + dex_file, dex_file.GetClassDef(class_def_idx), method_idx); + + compiled_method = driver->GetCompiler()->JniCompile( + access_flags, method_idx, dex_file, dex_cache); + CHECK(compiled_method != nullptr); + } + } else if ((access_flags & kAccAbstract) != 0) { + // Abstract methods don't have code. + } else { + VerificationResults* results = driver->GetVerificationResults(); + DCHECK(results != nullptr); + const VerifiedMethod* verified_method = results->GetVerifiedMethod(method_ref); + bool compile = compilation_enabled && + // Basic checks, e.g., not <clinit>. + results->IsCandidateForCompilation(method_ref, access_flags) && + // Did not fail to create VerifiedMethod metadata. + verified_method != nullptr && + // Do not have failures that should punt to the interpreter. + !verified_method->HasRuntimeThrow() && + (verified_method->GetEncounteredVerificationFailures() & + (verifier::VERIFY_ERROR_FORCE_INTERPRETER | verifier::VERIFY_ERROR_LOCKING)) == 0 && + // Is eligable for compilation by methods-to-compile filter. + driver->IsMethodToCompile(method_ref) && + driver->ShouldCompileBasedOnProfile(method_ref); + + if (compile) { + // NOTE: if compiler declines to compile this method, it will return null. + compiled_method = driver->GetCompiler()->Compile(code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + dex_file, + dex_cache); + } + if (compiled_method == nullptr && + dex_to_dex_compilation_level != + optimizer::DexToDexCompiler::CompilationLevel::kDontDexToDexCompile) { + DCHECK(!Runtime::Current()->UseJitCompilation()); + // TODO: add a command-line option to disable DEX-to-DEX compilation ? 
+ driver->GetDexToDexCompiler().MarkForCompilation(self, method_ref); + } + } + return compiled_method; + }; + CompileMethodHarness(self, + driver, + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + dex_file, + dex_to_dex_compilation_level, + compilation_enabled, + dex_cache, + quick_fn); +} + void CompilerDriver::CompileOne(Thread* self, ArtMethod* method, TimingLogger* timings) { DCHECK(!Runtime::Current()->IsStarted()); jobject jclass_loader; @@ -614,37 +717,34 @@ void CompilerDriver::CompileOne(Thread* self, ArtMethod* method, TimingLogger* t *dex_file, dex_file->GetClassDef(class_def_idx)); - DCHECK(!compiling_dex_to_dex_); - CompileMethod(self, - this, - code_item, - access_flags, - invoke_type, - class_def_idx, - method_idx, - class_loader, - *dex_file, - dex_to_dex_compilation_level, - true, - dex_cache); - - const size_t num_methods = dex_to_dex_compiler_.NumUniqueCodeItems(self); + CompileMethodQuick(self, + this, + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + *dex_file, + dex_to_dex_compilation_level, + true, + dex_cache); + + const size_t num_methods = dex_to_dex_compiler_.NumCodeItemsToQuicken(self); if (num_methods != 0) { DCHECK_EQ(num_methods, 1u); - compiling_dex_to_dex_ = true; - CompileMethod(self, - this, - code_item, - access_flags, - invoke_type, - class_def_idx, - method_idx, - class_loader, - *dex_file, - dex_to_dex_compilation_level, - true, - dex_cache); - compiling_dex_to_dex_ = false; + CompileMethodDex2Dex(self, + this, + code_item, + access_flags, + invoke_type, + class_def_idx, + method_idx, + class_loader, + *dex_file, + dex_to_dex_compilation_level, + true, + dex_cache); dex_to_dex_compiler_.ClearState(); } @@ -1444,13 +1544,19 @@ class ParallelCompilationManager { void ForAll(size_t begin, size_t end, CompilationVisitor* visitor, size_t work_units) REQUIRES(!*Locks::mutator_lock_) { + ForAllLambda(begin, end, [visitor](size_t index) { visitor->Visit(index); }, work_units); + } + + template <typename Fn> + void ForAllLambda(size_t begin, size_t end, Fn fn, size_t work_units) + REQUIRES(!*Locks::mutator_lock_) { Thread* self = Thread::Current(); self->AssertNoPendingException(); CHECK_GT(work_units, 0U); index_.StoreRelaxed(begin); for (size_t i = 0; i < work_units; ++i) { - thread_pool_->AddTask(self, new ForAllClosure(this, end, visitor)); + thread_pool_->AddTask(self, new ForAllClosureLambda<Fn>(this, end, fn)); } thread_pool_->StartWorkers(self); @@ -1470,32 +1576,33 @@ class ParallelCompilationManager { } private: - class ForAllClosure : public Task { + template <typename Fn> + class ForAllClosureLambda : public Task { public: - ForAllClosure(ParallelCompilationManager* manager, size_t end, CompilationVisitor* visitor) + ForAllClosureLambda(ParallelCompilationManager* manager, size_t end, Fn fn) : manager_(manager), end_(end), - visitor_(visitor) {} + fn_(fn) {} - virtual void Run(Thread* self) { + void Run(Thread* self) OVERRIDE { while (true) { const size_t index = manager_->NextIndex(); if (UNLIKELY(index >= end_)) { break; } - visitor_->Visit(index); + fn_(index); self->AssertNoPendingException(); } } - virtual void Finalize() { + void Finalize() OVERRIDE { delete this; } private: ParallelCompilationManager* const manager_; const size_t end_; - CompilationVisitor* const visitor_; + Fn fn_; }; AtomicInteger index_; @@ -2574,64 +2681,33 @@ void CompilerDriver::InitializeClasses(jobject class_loader, } } -void CompilerDriver::Compile(jobject class_loader, - const 
std::vector<const DexFile*>& dex_files, - TimingLogger* timings) { - if (kDebugProfileGuidedCompilation) { - LOG(INFO) << "[ProfileGuidedCompilation] " << - ((profile_compilation_info_ == nullptr) - ? "null" - : profile_compilation_info_->DumpInfo(&dex_files)); - } - - dex_to_dex_compiler_.ClearState(); - compiling_dex_to_dex_ = false; - for (const DexFile* dex_file : dex_files) { - CHECK(dex_file != nullptr); - CompileDexFile(class_loader, - *dex_file, - dex_files, - parallel_thread_pool_.get(), - parallel_thread_count_, - timings); - const ArenaPool* const arena_pool = Runtime::Current()->GetArenaPool(); - const size_t arena_alloc = arena_pool->GetBytesAllocated(); - max_arena_alloc_ = std::max(arena_alloc, max_arena_alloc_); - Runtime::Current()->ReclaimArenaPoolMemory(); - } - - if (dex_to_dex_compiler_.NumUniqueCodeItems(Thread::Current()) > 0u) { - compiling_dex_to_dex_ = true; - // TODO: Not visit all of the dex files, its probably rare that only one would have quickened - // methods though. - for (const DexFile* dex_file : dex_files) { - CompileDexFile(class_loader, - *dex_file, - dex_files, - parallel_thread_pool_.get(), - parallel_thread_count_, - timings); - } - dex_to_dex_compiler_.ClearState(); - compiling_dex_to_dex_ = false; - } - - VLOG(compiler) << "Compile: " << GetMemoryUsageString(false); -} - -class CompileClassVisitor : public CompilationVisitor { - public: - explicit CompileClassVisitor(const ParallelCompilationManager* manager) : manager_(manager) {} +template <typename CompileFn> +static void CompileDexFile(CompilerDriver* driver, + jobject class_loader, + const DexFile& dex_file, + const std::vector<const DexFile*>& dex_files, + ThreadPool* thread_pool, + size_t thread_count, + TimingLogger* timings, + const char* timing_name, + CompileFn compile_fn) { + TimingLogger::ScopedTiming t(timing_name, timings); + ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), + class_loader, + driver, + &dex_file, + dex_files, + thread_pool); - virtual void Visit(size_t class_def_index) REQUIRES(!Locks::mutator_lock_) OVERRIDE { + auto compile = [&context, &compile_fn](size_t class_def_index) { ScopedTrace trace(__FUNCTION__); - const DexFile& dex_file = *manager_->GetDexFile(); + const DexFile& dex_file = *context.GetDexFile(); const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index); - ClassLinker* class_linker = manager_->GetClassLinker(); - jobject jclass_loader = manager_->GetClassLoader(); + ClassLinker* class_linker = context.GetClassLinker(); + jobject jclass_loader = context.GetClassLoader(); ClassReference ref(&dex_file, class_def_index); // Skip compiling classes with generic verifier failures since they will still fail at runtime - if (manager_->GetCompiler()->verification_results_->IsClassRejected(ref)) { + if (context.GetCompiler()->GetVerificationResults()->IsClassRejected(ref)) { return; } // Use a scoped object access to perform to the quick SkipClass check. @@ -2662,7 +2738,7 @@ class CompileClassVisitor : public CompilationVisitor { // Go to native so that we don't block GC during compilation. ScopedThreadSuspension sts(soa.Self(), kNative); - CompilerDriver* const driver = manager_->GetCompiler(); + CompilerDriver* const driver = context.GetCompiler(); // Can we run DEX-to-DEX compiler on this class ? 
optimizer::DexToDexCompiler::CompilationLevel dex_to_dex_compilation_level = @@ -2685,38 +2761,71 @@ class CompileClassVisitor : public CompilationVisitor { continue; } previous_method_idx = method_idx; - CompileMethod(soa.Self(), - driver, - it.GetMethodCodeItem(), - it.GetMethodAccessFlags(), - it.GetMethodInvokeType(class_def), - class_def_index, - method_idx, - class_loader, - dex_file, - dex_to_dex_compilation_level, - compilation_enabled, - dex_cache); + compile_fn(soa.Self(), + driver, + it.GetMethodCodeItem(), + it.GetMethodAccessFlags(), + it.GetMethodInvokeType(class_def), + class_def_index, + method_idx, + class_loader, + dex_file, + dex_to_dex_compilation_level, + compilation_enabled, + dex_cache); it.Next(); } DCHECK(!it.HasNext()); + }; + context.ForAllLambda(0, dex_file.NumClassDefs(), compile, thread_count); +} + +void CompilerDriver::Compile(jobject class_loader, + const std::vector<const DexFile*>& dex_files, + TimingLogger* timings) { + if (kDebugProfileGuidedCompilation) { + LOG(INFO) << "[ProfileGuidedCompilation] " << + ((profile_compilation_info_ == nullptr) + ? "null" + : profile_compilation_info_->DumpInfo(&dex_files)); } - private: - const ParallelCompilationManager* const manager_; -}; + dex_to_dex_compiler_.ClearState(); + for (const DexFile* dex_file : dex_files) { + CHECK(dex_file != nullptr); + CompileDexFile(this, + class_loader, + *dex_file, + dex_files, + parallel_thread_pool_.get(), + parallel_thread_count_, + timings, + "Compile Dex File Quick", + CompileMethodQuick); + const ArenaPool* const arena_pool = Runtime::Current()->GetArenaPool(); + const size_t arena_alloc = arena_pool->GetBytesAllocated(); + max_arena_alloc_ = std::max(arena_alloc, max_arena_alloc_); + Runtime::Current()->ReclaimArenaPoolMemory(); + } -void CompilerDriver::CompileDexFile(jobject class_loader, - const DexFile& dex_file, - const std::vector<const DexFile*>& dex_files, - ThreadPool* thread_pool, - size_t thread_count, - TimingLogger* timings) { - TimingLogger::ScopedTiming t("Compile Dex File", timings); - ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), class_loader, this, - &dex_file, dex_files, thread_pool); - CompileClassVisitor visitor(&context); - context.ForAll(0, dex_file.NumClassDefs(), &visitor, thread_count); + if (dex_to_dex_compiler_.NumCodeItemsToQuicken(Thread::Current()) > 0u) { + // TODO: Not visit all of the dex files, its probably rare that only one would have quickened + // methods though. + for (const DexFile* dex_file : dex_files) { + CompileDexFile(this, + class_loader, + *dex_file, + dex_files, + parallel_thread_pool_.get(), + parallel_thread_count_, + timings, + "Compile Dex File Dex2Dex", + CompileMethodDex2Dex); + } + dex_to_dex_compiler_.ClearState(); + } + + VLOG(compiler) << "Compile: " << GetMemoryUsageString(false); } void CompilerDriver::AddCompiledMethod(const MethodReference& method_ref, @@ -2731,6 +2840,12 @@ void CompilerDriver::AddCompiledMethod(const MethodReference& method_ref, DCHECK(GetCompiledMethod(method_ref) != nullptr) << method_ref.PrettyMethod(); } +CompiledMethod* CompilerDriver::RemoveCompiledMethod(const MethodReference& method_ref) { + CompiledMethod* ret = nullptr; + CHECK(compiled_methods_.Remove(method_ref, &ret)); + return ret; +} + bool CompilerDriver::GetCompiledClass(const ClassReference& ref, ClassStatus* status) const { DCHECK(status != nullptr); // The table doesn't know if something wasn't inserted. 
For this case it will return @@ -2904,6 +3019,7 @@ void CompilerDriver::FreeThreadPools() { void CompilerDriver::SetDexFilesForOatFile(const std::vector<const DexFile*>& dex_files) { dex_files_for_oat_file_ = dex_files; compiled_classes_.AddDexFiles(dex_files); + dex_to_dex_compiler_.SetDexFiles(dex_files); } void CompilerDriver::SetClasspathDexFiles(const std::vector<const DexFile*>& dex_files) { diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index b51e0debbb..2b524a347d 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -165,6 +165,7 @@ class CompilerDriver { void AddCompiledMethod(const MethodReference& method_ref, CompiledMethod* const compiled_method, size_t non_relative_linker_patch_count); + CompiledMethod* RemoveCompiledMethod(const MethodReference& method_ref); void SetRequiresConstructorBarrier(Thread* self, const DexFile* dex_file, @@ -374,10 +375,6 @@ class CompilerDriver { || android::base::EndsWith(boot_image_filename, "core-optimizing.art"); } - bool GetCompilingDexToDex() const { - return compiling_dex_to_dex_; - } - optimizer::DexToDexCompiler& GetDexToDexCompiler() { return dex_to_dex_compiler_; } @@ -449,13 +446,6 @@ class CompilerDriver { void Compile(jobject class_loader, const std::vector<const DexFile*>& dex_files, TimingLogger* timings); - void CompileDexFile(jobject class_loader, - const DexFile& dex_file, - const std::vector<const DexFile*>& dex_files, - ThreadPool* thread_pool, - size_t thread_count, - TimingLogger* timings) - REQUIRES(!Locks::mutator_lock_); bool MayInlineInternal(const DexFile* inlined_from, const DexFile* inlined_into) const; @@ -541,7 +531,6 @@ class CompilerDriver { size_t max_arena_alloc_; // Compiler for dex to dex (quickening). - bool compiling_dex_to_dex_; optimizer::DexToDexCompiler dex_to_dex_compiler_; friend class CompileClassVisitor; diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc index 2c62095458..17b94d3bdf 100644 --- a/compiler/jit/jit_compiler.cc +++ b/compiler/jit/jit_compiler.cc @@ -76,7 +76,7 @@ extern "C" void jit_types_loaded(void* handle, mirror::Class** types, size_t cou const ArrayRef<mirror::Class*> types_array(types, count); std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForClasses( kRuntimeISA, jit_compiler->GetCompilerDriver()->GetInstructionSetFeatures(), types_array); - MutexLock mu(Thread::Current(), g_jit_debug_mutex); + MutexLock mu(Thread::Current(), *Locks::native_debug_interface_lock_); CreateJITCodeEntry(std::move(elf_file)); } } diff --git a/compiler/linker/elf_builder.h b/compiler/linker/elf_builder.h index 1c875189c5..3145497091 100644 --- a/compiler/linker/elf_builder.h +++ b/compiler/linker/elf_builder.h @@ -18,7 +18,6 @@ #define ART_COMPILER_LINKER_ELF_BUILDER_H_ #include <vector> -#include <unordered_map> #include "arch/instruction_set.h" #include "arch/mips/instruction_set_features_mips.h" @@ -310,24 +309,27 @@ class ElfBuilder FINAL { /* info */ 0, align, /* entsize */ 0), - current_offset_(0) { + current_offset_(0), + last_offset_(0) { } Elf_Word Write(const std::string& name) { if (current_offset_ == 0) { DCHECK(name.empty()); + } else if (name == last_name_) { + return last_offset_; // Very simple string de-duplication. } - auto res = written_names_.emplace(name, current_offset_); - if (res.second) { // Inserted. - this->WriteFully(name.c_str(), name.length() + 1); - current_offset_ += name.length() + 1; - } - return res.first->second; // Offset. 
+ last_name_ = name; + last_offset_ = current_offset_; + this->WriteFully(name.c_str(), name.length() + 1); + current_offset_ += name.length() + 1; + return last_offset_; } private: Elf_Word current_offset_; - std::unordered_map<std::string, Elf_Word> written_names_; // Dedup strings. + std::string last_name_; + Elf_Word last_offset_; }; // Writer of .dynsym and .symtab sections. diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 704a0d3b87..6d49b32dbc 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -2498,8 +2498,23 @@ void CodeGeneratorARMVIXL::GenerateFrameEntry() { } if (!skip_overflow_check) { + // Using r4 instead of IP saves 2 bytes. UseScratchRegisterScope temps(GetVIXLAssembler()); - vixl32::Register temp = temps.Acquire(); + vixl32::Register temp; + // TODO: Remove this check when R4 is made a callee-save register + // in ART compiled code (b/72801708). Currently we need to make + // sure r4 is not blocked, e.g. in special purpose + // TestCodeGeneratorARMVIXL; also asserting that r4 is available + // here. + if (!blocked_core_registers_[R4]) { + for (vixl32::Register reg : kParameterCoreRegistersVIXL) { + DCHECK(!reg.Is(r4)); + } + DCHECK(!kCoreCalleeSaves.Includes(r4)); + temp = r4; + } else { + temp = temps.Acquire(); + } __ Sub(temp, sp, Operand::From(GetStackOverflowReservedBytes(InstructionSet::kArm))); // The load must immediately precede RecordPcInfo. ExactAssemblyScope aas(GetVIXLAssembler(), diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 36c921986b..855da2b18f 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -395,7 +395,7 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); __ Bind(GetEntryLabel()); - if (!is_fatal_) { + if (!is_fatal_ || instruction_->CanThrowIntoCatchBlock()) { SaveLiveRegisters(codegen, locations); } @@ -3283,26 +3283,8 @@ static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { } void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) { - LocationSummary::CallKind call_kind = LocationSummary::kNoCall; - bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); - TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); - switch (type_check_kind) { - case TypeCheckKind::kExactCheck: - case TypeCheckKind::kAbstractClassCheck: - case TypeCheckKind::kClassHierarchyCheck: - case TypeCheckKind::kArrayObjectCheck: - call_kind = (throws_into_catch || kEmitCompilerReadBarrier) - ? LocationSummary::kCallOnSlowPath - : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. 
- break; - case TypeCheckKind::kArrayCheck: - case TypeCheckKind::kUnresolvedCheck: - case TypeCheckKind::kInterfaceCheck: - call_kind = LocationSummary::kCallOnSlowPath; - break; - } - + LocationSummary::CallKind call_kind = CodeGenerator::GetCheckCastCallKind(instruction); LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); @@ -3331,18 +3313,7 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); MipsLabel done; - // Always false for read barriers since we may need to go to the entrypoint for non-fatal cases - // from false negatives. The false negatives may come from avoiding read barriers below. Avoiding - // read barriers is done for performance and code size reasons. - bool is_type_check_slow_path_fatal = false; - if (!kEmitCompilerReadBarrier) { - is_type_check_slow_path_fatal = - (type_check_kind == TypeCheckKind::kExactCheck || - type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck) && - !instruction->CanThrowIntoCatchBlock(); - } + bool is_type_check_slow_path_fatal = CodeGenerator::IsTypeCheckSlowPathFatal(instruction); SlowPathCodeMIPS* slow_path = new (codegen_->GetScopedAllocator()) TypeCheckSlowPathMIPS( instruction, is_type_check_slow_path_fatal); @@ -7213,11 +7184,12 @@ void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: - case TypeCheckKind::kArrayObjectCheck: - call_kind = - kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; - baker_read_barrier_slow_path = kUseBakerReadBarrier; + case TypeCheckKind::kArrayObjectCheck: { + bool needs_read_barrier = CodeGenerator::InstanceOfNeedsReadBarrier(instruction); + call_kind = needs_read_barrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + baker_read_barrier_slow_path = kUseBakerReadBarrier && needs_read_barrier; break; + } case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: case TypeCheckKind::kInterfaceCheck: @@ -7265,13 +7237,15 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { switch (type_check_kind) { case TypeCheckKind::kExactCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Classes must be equal for the instanceof to succeed. __ Xor(out, out, cls); __ Sltiu(out, out, 1); @@ -7279,13 +7253,15 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kAbstractClassCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. 
MipsLabel loop; @@ -7295,7 +7271,7 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { out_loc, super_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If `out` is null, we use it for the result, and jump to `done`. __ Beqz(out, &done); __ Bne(out, cls, &loop); @@ -7304,13 +7280,15 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kClassHierarchyCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Walk over the class hierarchy to find a match. MipsLabel loop, success; __ Bind(&loop); @@ -7320,7 +7298,7 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { out_loc, super_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); __ Bnez(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. __ B(&done); @@ -7330,13 +7308,15 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kArrayObjectCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Do an exact check. MipsLabel success; __ Beq(out, cls, &success); @@ -7346,7 +7326,7 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { out_loc, component_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If `out` is null, we use it for the result, and jump to `done`. __ Beqz(out, &done); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 6657582e2a..8a06061c6a 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -352,7 +352,7 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); - if (!is_fatal_) { + if (!is_fatal_ || instruction_->CanThrowIntoCatchBlock()) { SaveLiveRegisters(codegen, locations); } @@ -2836,26 +2836,8 @@ static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { } void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) { - LocationSummary::CallKind call_kind = LocationSummary::kNoCall; - bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); - TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); - switch (type_check_kind) { - case TypeCheckKind::kExactCheck: - case TypeCheckKind::kAbstractClassCheck: - case TypeCheckKind::kClassHierarchyCheck: - case TypeCheckKind::kArrayObjectCheck: - call_kind = (throws_into_catch || kEmitCompilerReadBarrier) - ? LocationSummary::kCallOnSlowPath - : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. 
- break; - case TypeCheckKind::kArrayCheck: - case TypeCheckKind::kUnresolvedCheck: - case TypeCheckKind::kInterfaceCheck: - call_kind = LocationSummary::kCallOnSlowPath; - break; - } - + LocationSummary::CallKind call_kind = CodeGenerator::GetCheckCastCallKind(instruction); LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); @@ -2884,18 +2866,7 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); Mips64Label done; - // Always false for read barriers since we may need to go to the entrypoint for non-fatal cases - // from false negatives. The false negatives may come from avoiding read barriers below. Avoiding - // read barriers is done for performance and code size reasons. - bool is_type_check_slow_path_fatal = false; - if (!kEmitCompilerReadBarrier) { - is_type_check_slow_path_fatal = - (type_check_kind == TypeCheckKind::kExactCheck || - type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck) && - !instruction->CanThrowIntoCatchBlock(); - } + bool is_type_check_slow_path_fatal = CodeGenerator::IsTypeCheckSlowPathFatal(instruction); SlowPathCodeMIPS64* slow_path = new (codegen_->GetScopedAllocator()) TypeCheckSlowPathMIPS64( instruction, is_type_check_slow_path_fatal); @@ -5528,11 +5499,12 @@ void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: - case TypeCheckKind::kArrayObjectCheck: - call_kind = - kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; - baker_read_barrier_slow_path = kUseBakerReadBarrier; + case TypeCheckKind::kArrayObjectCheck: { + bool needs_read_barrier = CodeGenerator::InstanceOfNeedsReadBarrier(instruction); + call_kind = needs_read_barrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + baker_read_barrier_slow_path = kUseBakerReadBarrier && needs_read_barrier; break; + } case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: case TypeCheckKind::kInterfaceCheck: @@ -5580,13 +5552,15 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { switch (type_check_kind) { case TypeCheckKind::kExactCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Classes must be equal for the instanceof to succeed. __ Xor(out, out, cls); __ Sltiu(out, out, 1); @@ -5594,13 +5568,15 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kAbstractClassCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. 
Mips64Label loop; @@ -5610,7 +5586,7 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { out_loc, super_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If `out` is null, we use it for the result, and jump to `done`. __ Beqzc(out, &done); __ Bnec(out, cls, &loop); @@ -5619,13 +5595,15 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kClassHierarchyCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Walk over the class hierarchy to find a match. Mips64Label loop, success; __ Bind(&loop); @@ -5635,7 +5613,7 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { out_loc, super_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); __ Bnezc(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. __ Bc(&done); @@ -5645,13 +5623,15 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { } case TypeCheckKind::kArrayObjectCheck: { + ReadBarrierOption read_barrier_option = + CodeGenerator::ReadBarrierOptionForInstanceOf(instruction); // /* HeapReference<Class> */ out = obj->klass_ GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // Do an exact check. Mips64Label success; __ Beqc(out, cls, &success); @@ -5661,7 +5641,7 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { out_loc, component_offset, maybe_temp_loc, - kCompilerReadBarrierOption); + read_barrier_option); // If `out` is null, we use it for the result, and jump to `done`. __ Beqzc(out, &done); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index cca1055ac8..9fa0f72e80 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ b/compiler/optimizing/dead_code_elimination.cc @@ -146,6 +146,65 @@ static HConstant* Evaluate(HCondition* condition, HInstruction* left, HInstructi } } +static bool RemoveNonNullControlDependences(HBasicBlock* block, HBasicBlock* throws) { + // Test for an if as last statement. + if (!block->EndsWithIf()) { + return false; + } + HIf* ifs = block->GetLastInstruction()->AsIf(); + // Find either: + // if obj == null + // throws + // else + // not_throws + // or: + // if obj != null + // not_throws + // else + // throws + HInstruction* cond = ifs->InputAt(0); + HBasicBlock* not_throws = nullptr; + if (throws == ifs->IfTrueSuccessor() && cond->IsEqual()) { + not_throws = ifs->IfFalseSuccessor(); + } else if (throws == ifs->IfFalseSuccessor() && cond->IsNotEqual()) { + not_throws = ifs->IfTrueSuccessor(); + } else { + return false; + } + DCHECK(cond->IsEqual() || cond->IsNotEqual()); + HInstruction* obj = cond->InputAt(1); + if (obj->IsNullConstant()) { + obj = cond->InputAt(0); + } else if (!cond->InputAt(0)->IsNullConstant()) { + return false; + } + // Scan all uses of obj and find null check under control dependence. 
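Editor's note: RemoveNonNullControlDependences() above pattern-matches an explicit null test whose throwing successor has just been rewired to the exit block, then replaces dominated null checks of the same object with a single non-null HBoundType. A rough C++ analogue of the shape it targets (illustrative only; the pass works on ART's HIR, not on C++ source, and Widget/ThrowNpe are invented for the example):

#include <stdexcept>

struct Widget {
  void DoSomething() {}
  void DoSomethingElse() {}
};

// Stand-in for a call that is known to always throw (the "throws" block).
[[noreturn]] void ThrowNpe() { throw std::runtime_error("NullPointerException"); }

void Example(Widget* obj) {
  if (obj == nullptr) {
    ThrowNpe();              // throwing successor; SimplifyAlwaysThrows() rewires it to Exit
  }
  // Every block below is dominated by the non-null branch, so the null checks that
  // would normally guard these accesses can be redirected to one HBoundType(obj)
  // marked as non-null, and then removed.
  obj->DoSomething();
  obj->DoSomethingElse();
}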
+ HBoundType* bound = nullptr; + const HUseList<HInstruction*>& uses = obj->GetUses(); + for (auto it = uses.begin(), end = uses.end(); it != end;) { + HInstruction* user = it->GetUser(); + ++it; // increment before possibly replacing + if (user->IsNullCheck()) { + HBasicBlock* user_block = user->GetBlock(); + if (user_block != block && + user_block != throws && + block->Dominates(user_block)) { + if (bound == nullptr) { + ReferenceTypeInfo ti = obj->GetReferenceTypeInfo(); + bound = new (obj->GetBlock()->GetGraph()->GetAllocator()) HBoundType(obj); + bound->SetUpperBound(ti, /*can_be_null*/ false); + bound->SetReferenceTypeInfo(ti); + bound->SetCanBeNull(false); + not_throws->InsertInstructionBefore(bound, not_throws->GetFirstInstruction()); + } + user->ReplaceWith(bound); + user_block->RemoveInstruction(user); + } + } + } + return bound != nullptr; +} + // Simplify the pattern: // // B1 @@ -203,6 +262,11 @@ bool HDeadCodeElimination::SimplifyAlwaysThrows() { block->ReplaceSuccessor(succ, exit); rerun_dominance_and_loop_analysis = true; MaybeRecordStat(stats_, MethodCompilationStat::kSimplifyThrowingInvoke); + // Perform a quick follow-up optimization on object != null control dependences + // that is much cheaper to perform now than in a later phase. + if (RemoveNonNullControlDependences(pred, block)) { + MaybeRecordStat(stats_, MethodCompilationStat::kRemovedNullCheck); + } } } } diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc index 88326d321b..8b4eae1780 100644 --- a/compiler/optimizing/load_store_elimination.cc +++ b/compiler/optimizing/load_store_elimination.cc @@ -25,6 +25,51 @@ #include <iostream> +/** + * The general algorithm of load-store elimination (LSE). + * Load-store analysis in the previous pass collects a list of heap locations + * and does alias analysis of those heap locations. + * LSE keeps track of a list of heap values corresponding to the heap + * locations. It visits basic blocks in reverse post order and for + * each basic block, visits instructions sequentially, and processes + * instructions as follows: + * - If the instruction is a load, and the heap location for that load has a + * valid heap value, the load can be eliminated. In order to maintain the + * validity of all heap locations during the optimization phase, the real + * elimination is delayed till the end of LSE. + * - If the instruction is a store, it updates the heap value for the heap + * location of the store with the store instruction. The real heap value + * can be fetched from the store instruction. Heap values are invalidated + * for heap locations that may alias with the store instruction's heap + * location. The store instruction can be eliminated unless the value stored + * is later needed e.g. by a load from the same/aliased heap location or + * the heap location persists at method return/deoptimization. + * The store instruction is also needed if it's not used to track the heap + * value anymore, e.g. when it fails to merge with the heap values from other + * predecessors. + * - A store that stores the same value as the heap value is eliminated. + * - The list of heap values is merged at basic block entry from the basic + * block's predecessors. The algorithm is single-pass, so loop side-effects are + * used as a best effort to decide if a heap location is stored inside the loop. + * - A special type of objects called singletons are instantiated in the method + * and have a single name, i.e. no aliases.
Singletons have exclusive heap + * locations since they have no aliases. Singletons are helpful in narrowing + * down the life span of a heap location such that they do not always + * need to participate in merging heap values. Allocation of a singleton + * can be eliminated if that singleton is not used and does not persist + * at method return/deoptimization. + * - For newly instantiated instances, their heap values are initialized to + * language-defined default values. + * - Some instructions such as invokes are treated as loading and invalidating + * all the heap values, depending on the instruction's side effects. + * - Finalizable objects are considered to persist at method + * return/deoptimization. + * - Currently this LSE algorithm doesn't handle SIMD graphs, e.g. with VecLoad + * and VecStore instructions. + * - Currently this LSE algorithm doesn't handle graphs with try-catch, due to + * the special block merging structure. + */ + namespace art { // An unknown heap value. Loads with such a value in the heap location cannot be eliminated. @@ -59,8 +104,7 @@ class LSEVisitor : public HGraphDelegateVisitor { removed_loads_(allocator_.Adapter(kArenaAllocLSE)), substitute_instructions_for_loads_(allocator_.Adapter(kArenaAllocLSE)), possibly_removed_stores_(allocator_.Adapter(kArenaAllocLSE)), - singleton_new_instances_(allocator_.Adapter(kArenaAllocLSE)), - singleton_new_arrays_(allocator_.Adapter(kArenaAllocLSE)) { + singleton_new_instances_(allocator_.Adapter(kArenaAllocLSE)) { } void VisitBasicBlock(HBasicBlock* block) OVERRIDE { @@ -88,19 +132,26 @@ class LSEVisitor : public HGraphDelegateVisitor { return type_conversion; } - // Find an instruction's substitute if it should be removed. + // Find an instruction's substitute if it's a removed load. // Return the same instruction if it should not be removed. HInstruction* FindSubstitute(HInstruction* instruction) { + if (!IsLoad(instruction)) { + return instruction; + } size_t size = removed_loads_.size(); for (size_t i = 0; i < size; i++) { if (removed_loads_[i] == instruction) { - return substitute_instructions_for_loads_[i]; + HInstruction* substitute = substitute_instructions_for_loads_[i]; + // The substitute list is a flat hierarchy. + DCHECK_EQ(FindSubstitute(substitute), substitute); + return substitute; } } return instruction; } void AddRemovedLoad(HInstruction* load, HInstruction* heap_value) { + DCHECK(IsLoad(load)); DCHECK_EQ(FindSubstitute(heap_value), heap_value) << "Unexpected heap_value that has a substitute " << heap_value->DebugName(); removed_loads_.push_back(load); @@ -207,28 +258,59 @@ class LSEVisitor : public HGraphDelegateVisitor { new_instance->GetBlock()->RemoveInstruction(new_instance); } } - for (HInstruction* new_array : singleton_new_arrays_) { - size_t removed = HConstructorFence::RemoveConstructorFences(new_array); - MaybeRecordStat(stats_, - MethodCompilationStat::kConstructorFenceRemovedLSE, - removed); + } - if (!new_array->HasNonEnvironmentUses()) { - new_array->RemoveEnvironmentUsers(); - new_array->GetBlock()->RemoveInstruction(new_array); - } + private: + static bool IsLoad(HInstruction* instruction) { + if (instruction == kUnknownHeapValue || instruction == kDefaultHeapValue) { + return false; } + // Unresolved load is not treated as a load. + return instruction->IsInstanceFieldGet() || + instruction->IsStaticFieldGet() || + instruction->IsArrayGet(); } - private: - // If heap_values[index] is an instance field store, need to keep the store.
- // This is necessary if a heap value is killed due to merging, or loop side - // effects (which is essentially merging also), since a load later from the - // location won't be eliminated. + static bool IsStore(HInstruction* instruction) { + if (instruction == kUnknownHeapValue || instruction == kDefaultHeapValue) { + return false; + } + // Unresolved store is not treated as a store. + return instruction->IsInstanceFieldSet() || + instruction->IsArraySet() || + instruction->IsStaticFieldSet(); + } + + // Returns the real heap value by finding its substitute or by "peeling" + // a store instruction. + HInstruction* GetRealHeapValue(HInstruction* heap_value) { + if (IsLoad(heap_value)) { + return FindSubstitute(heap_value); + } + if (!IsStore(heap_value)) { + return heap_value; + } + + // We keep track of store instructions as the heap values which might be + // eliminated if the stores are later found not necessary. The real stored + // value needs to be fetched from the store instruction. + if (heap_value->IsInstanceFieldSet()) { + heap_value = heap_value->AsInstanceFieldSet()->GetValue(); + } else if (heap_value->IsStaticFieldSet()) { + heap_value = heap_value->AsStaticFieldSet()->GetValue(); + } else { + DCHECK(heap_value->IsArraySet()); + heap_value = heap_value->AsArraySet()->GetValue(); + } + // heap_value may already be a removed load. + return FindSubstitute(heap_value); + } + + // If heap_value is a store, need to keep the store. + // This is necessary if a heap value is killed or replaced by another value, + // so that the store is no longer used to track heap value. void KeepIfIsStore(HInstruction* heap_value) { - if (heap_value == kDefaultHeapValue || - heap_value == kUnknownHeapValue || - !(heap_value->IsInstanceFieldSet() || heap_value->IsArraySet())) { + if (!IsStore(heap_value)) { return; } auto idx = std::find(possibly_removed_stores_.begin(), @@ -239,26 +321,41 @@ class LSEVisitor : public HGraphDelegateVisitor { } } + // If a heap location X may alias with heap location at `loc_index` + // and heap_values of that heap location X holds a store, keep that store. + // It's needed for a dependent load that's not eliminated since any store + // that may put value into the load's heap location needs to be kept. + void KeepStoresIfAliasedToLocation(ScopedArenaVector<HInstruction*>& heap_values, + size_t loc_index) { + for (size_t i = 0; i < heap_values.size(); i++) { + if ((i == loc_index) || heap_location_collector_.MayAlias(i, loc_index)) { + KeepIfIsStore(heap_values[i]); + } + } + } + void HandleLoopSideEffects(HBasicBlock* block) { DCHECK(block->IsLoopHeader()); int block_id = block->GetBlockId(); ScopedArenaVector<HInstruction*>& heap_values = heap_values_for_[block_id]; + HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader(); + ScopedArenaVector<HInstruction*>& pre_header_heap_values = + heap_values_for_[pre_header->GetBlockId()]; - // Don't eliminate loads in irreducible loops. This is safe for singletons, because - // they are always used by the non-eliminated loop-phi. + // Don't eliminate loads in irreducible loops. + // Also keep the stores before the loop. 
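Editor's note: with stores now tracked as heap values and "peeled" on demand by GetRealHeapValue(), a load can be replaced by the value of the store that dominates it, and the store itself survives only if something still needs it. A small C++ analogue of the effect (illustrative only; LSE runs on HIR, and the struct and function names here are invented for the example):

struct Point { int x; };

int Example() {
  Point storage{0};
  Point* p = &storage;   // stand-in for a non-escaping "singleton" allocation
  p->x = 1;              // the store instruction itself becomes the tracked heap value
  int a = p->x;          // load eliminated: GetRealHeapValue() peels the store and yields 1
  p->x = a;              // stores what the location already holds, removed right away
  return a;              // if nothing else observes p->x, the first store is dropped too
}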
if (block->GetLoopInformation()->IsIrreducible()) { if (kIsDebugBuild) { for (size_t i = 0; i < heap_values.size(); i++) { DCHECK_EQ(heap_values[i], kUnknownHeapValue); } } + for (size_t i = 0; i < heap_values.size(); i++) { + KeepIfIsStore(pre_header_heap_values[i]); + } return; } - HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader(); - ScopedArenaVector<HInstruction*>& pre_header_heap_values = - heap_values_for_[pre_header->GetBlockId()]; - // Inherit the values from pre-header. for (size_t i = 0; i < heap_values.size(); i++) { heap_values[i] = pre_header_heap_values[i]; @@ -270,18 +367,17 @@ class LSEVisitor : public HGraphDelegateVisitor { for (size_t i = 0; i < heap_values.size(); i++) { HeapLocation* location = heap_location_collector_.GetHeapLocation(i); ReferenceInfo* ref_info = location->GetReferenceInfo(); - if (ref_info->IsSingletonAndRemovable() && - !location->IsValueKilledByLoopSideEffects()) { - // A removable singleton's field that's not stored into inside a loop is + if (ref_info->IsSingleton() && !location->IsValueKilledByLoopSideEffects()) { + // A singleton's field that's not stored into inside a loop is // invariant throughout the loop. Nothing to do. } else { - // heap value is killed by loop side effects (stored into directly, or - // due to aliasing). Or the heap value may be needed after method return - // or deoptimization. + // heap value is killed by loop side effects. KeepIfIsStore(pre_header_heap_values[i]); heap_values[i] = kUnknownHeapValue; } } + } else { + // The loop doesn't kill any value. } } @@ -300,45 +396,73 @@ class LSEVisitor : public HGraphDelegateVisitor { ScopedArenaVector<HInstruction*>& heap_values = heap_values_for_[block->GetBlockId()]; for (size_t i = 0; i < heap_values.size(); i++) { HInstruction* merged_value = nullptr; + // If we can merge the store itself from the predecessors, we keep + // the store as the heap value as long as possible. In case we cannot + // merge the store, we try to merge the values of the stores. + HInstruction* merged_store_value = nullptr; // Whether merged_value is a result that's merged from all predecessors. bool from_all_predecessors = true; ReferenceInfo* ref_info = heap_location_collector_.GetHeapLocation(i)->GetReferenceInfo(); + HInstruction* ref = ref_info->GetReference(); HInstruction* singleton_ref = nullptr; if (ref_info->IsSingleton()) { - // We do more analysis of liveness when merging heap values for such - // cases since stores into such references may potentially be eliminated. - singleton_ref = ref_info->GetReference(); + // We do more analysis based on singleton's liveness when merging + // heap values for such cases. + singleton_ref = ref; } for (HBasicBlock* predecessor : predecessors) { HInstruction* pred_value = heap_values_for_[predecessor->GetBlockId()][i]; + if (!IsStore(pred_value)) { + pred_value = FindSubstitute(pred_value); + } + DCHECK(pred_value != nullptr); + HInstruction* pred_store_value = GetRealHeapValue(pred_value); if ((singleton_ref != nullptr) && !singleton_ref->GetBlock()->Dominates(predecessor)) { - // singleton_ref is not live in this predecessor. Skip this predecessor since - // it does not really have the location. + // singleton_ref is not live in this predecessor. No need to merge + // since singleton_ref is not live at the beginning of this block. DCHECK_EQ(pred_value, kUnknownHeapValue); from_all_predecessors = false; - continue; + break; } if (merged_value == nullptr) { // First seen heap value. 
+ DCHECK(pred_value != nullptr); merged_value = pred_value; } else if (pred_value != merged_value) { // There are conflicting values. merged_value = kUnknownHeapValue; + // We may still be able to merge store values. + } + + // Conflicting stores may be storing the same value. We do another merge + // of real stored values. + if (merged_store_value == nullptr) { + // First seen store value. + DCHECK(pred_store_value != nullptr); + merged_store_value = pred_store_value; + } else if (pred_store_value != merged_store_value) { + // There are conflicting store values. + merged_store_value = kUnknownHeapValue; + // There must be conflicting stores also. + DCHECK_EQ(merged_value, kUnknownHeapValue); + // No need to merge anymore. break; } } - if (ref_info->IsSingleton()) { - if (ref_info->IsSingletonAndNonRemovable() || - (merged_value == kUnknownHeapValue && - !block->IsSingleReturnOrReturnVoidAllowingPhis())) { - // The heap value may be needed after method return or deoptimization, - // or there are conflicting heap values from different predecessors and - // this block is not a single return, - // keep the last store in each predecessor since future loads may not - // be eliminated. + if (merged_value == nullptr) { + DCHECK(!from_all_predecessors); + DCHECK(singleton_ref != nullptr); + } + if (from_all_predecessors) { + if (ref_info->IsSingletonAndRemovable() && + block->IsSingleReturnOrReturnVoidAllowingPhis()) { + // Values in the singleton are not needed anymore. + } else if (!IsStore(merged_value)) { + // We don't track merged value as a store anymore. We have to + // hold the stores in predecessors live here. for (HBasicBlock* predecessor : predecessors) { ScopedArenaVector<HInstruction*>& pred_values = heap_values_for_[predecessor->GetBlockId()]; @@ -346,18 +470,33 @@ class LSEVisitor : public HGraphDelegateVisitor { } } } else { - // Currenctly we don't eliminate stores to non-singletons. + DCHECK(singleton_ref != nullptr); + // singleton_ref is non-existing at the beginning of the block. There is + // no need to keep the stores. } - if ((merged_value == nullptr) || !from_all_predecessors) { + if (!from_all_predecessors) { DCHECK(singleton_ref != nullptr); DCHECK((singleton_ref->GetBlock() == block) || - !singleton_ref->GetBlock()->Dominates(block)); + !singleton_ref->GetBlock()->Dominates(block)) + << "method: " << GetGraph()->GetMethodName(); // singleton_ref is not defined before block or defined only in some of its // predecessors, so block doesn't really have the location at its entry. heap_values[i] = kUnknownHeapValue; - } else { + } else if (predecessors.size() == 1) { + // Inherit heap value from the single predecessor. + DCHECK_EQ(heap_values_for_[predecessors[0]->GetBlockId()][i], merged_value); heap_values[i] = merged_value; + } else { + DCHECK(merged_value == kUnknownHeapValue || + merged_value == kDefaultHeapValue || + merged_value->GetBlock()->Dominates(block)); + if (merged_value != kUnknownHeapValue) { + heap_values[i] = merged_value; + } else { + // Stores in different predecessors may be storing the same value. + heap_values[i] = merged_store_value; + } } } } @@ -423,23 +562,12 @@ class LSEVisitor : public HGraphDelegateVisitor { heap_values[idx] = constant; return; } - if (heap_value != kUnknownHeapValue) { - if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) { - HInstruction* store = heap_value; - // This load must be from a singleton since it's from the same - // field/element that a "removed" store puts the value. 
That store - // must be to a singleton's field/element. - DCHECK(ref_info->IsSingleton()); - // Get the real heap value of the store. - heap_value = heap_value->IsInstanceFieldSet() ? store->InputAt(1) : store->InputAt(2); - // heap_value may already have a substitute. - heap_value = FindSubstitute(heap_value); - } - } + heap_value = GetRealHeapValue(heap_value); if (heap_value == kUnknownHeapValue) { // Load isn't eliminated. Put the load as the value into the HeapLocation. // This acts like GVN but with better aliasing analysis. heap_values[idx] = instruction; + KeepStoresIfAliasedToLocation(heap_values, idx); } else { if (DataType::Kind(heap_value->GetType()) != DataType::Kind(instruction->GetType())) { // The only situation where the same heap location has different type is when @@ -452,6 +580,10 @@ class LSEVisitor : public HGraphDelegateVisitor { DCHECK(heap_value->IsArrayGet()) << heap_value->DebugName(); DCHECK(instruction->IsArrayGet()) << instruction->DebugName(); } + // Load isn't eliminated. Put the load as the value into the HeapLocation. + // This acts like GVN but with better aliasing analysis. + heap_values[idx] = instruction; + KeepStoresIfAliasedToLocation(heap_values, idx); return; } AddRemovedLoad(instruction, heap_value); @@ -460,12 +592,21 @@ class LSEVisitor : public HGraphDelegateVisitor { } bool Equal(HInstruction* heap_value, HInstruction* value) { + DCHECK(!IsStore(value)) << value->DebugName(); + if (heap_value == kUnknownHeapValue) { + // Don't compare kUnknownHeapValue with other values. + return false; + } if (heap_value == value) { return true; } if (heap_value == kDefaultHeapValue && GetDefaultValue(value->GetType()) == value) { return true; } + HInstruction* real_heap_value = GetRealHeapValue(heap_value); + if (real_heap_value != heap_value) { + return Equal(real_heap_value, value); + } return false; } @@ -476,6 +617,7 @@ class LSEVisitor : public HGraphDelegateVisitor { size_t vector_length, int16_t declaring_class_def_index, HInstruction* value) { + DCHECK(!IsStore(value)) << value->DebugName(); // value may already have a substitute. value = FindSubstitute(value); HInstruction* original_ref = heap_location_collector_.HuntForOriginalReference(ref); @@ -486,59 +628,47 @@ class LSEVisitor : public HGraphDelegateVisitor { ScopedArenaVector<HInstruction*>& heap_values = heap_values_for_[instruction->GetBlock()->GetBlockId()]; HInstruction* heap_value = heap_values[idx]; - bool same_value = false; bool possibly_redundant = false; + if (Equal(heap_value, value)) { // Store into the heap location with the same value. - same_value = true; - } else if (index != nullptr && - heap_location_collector_.GetHeapLocation(idx)->HasAliasedLocations()) { - // For array element, don't eliminate stores if the location can be aliased - // (due to either ref or index aliasing). - } else if (ref_info->IsSingleton()) { - // Store into a field/element of a singleton. The value cannot be killed due to - // aliasing/invocation. It can be redundant since future loads can - // directly get the value set by this instruction. The value can still be killed due to - // merging or loop side effects. Stores whose values are killed due to merging/loop side - // effects later will be removed from possibly_removed_stores_ when that is detected. - // Stores whose values may be needed after method return or deoptimization - // are also removed from possibly_removed_stores_ when that is detected. - possibly_redundant = true; + // This store can be eliminated right away. 
+ instruction->GetBlock()->RemoveInstruction(instruction); + return; + } else { HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation(); - if (loop_info != nullptr) { - // instruction is a store in the loop so the loop must does write. + if (loop_info == nullptr) { + // Store is not in a loop. We try to precisely track the heap value by + // the store. + possibly_redundant = true; + } else if (!loop_info->IsIrreducible()) { + // instruction is a store in the loop so the loop must write. DCHECK(side_effects_.GetLoopEffects(loop_info->GetHeader()).DoesAnyWrite()); - - if (loop_info->IsDefinedOutOfTheLoop(original_ref)) { - DCHECK(original_ref->GetBlock()->Dominates(loop_info->GetPreHeader())); - // Keep the store since its value may be needed at the loop header. - possibly_redundant = false; - } else { - // The singleton is created inside the loop. Value stored to it isn't needed at + if (ref_info->IsSingleton() && !loop_info->IsDefinedOutOfTheLoop(original_ref)) { + // original_ref is created inside the loop. Value stored to it isn't needed at // the loop header. This is true for outer loops also. + possibly_redundant = true; + } else { + // Keep the store since its value may be needed at the loop header. + } + } else { + // Keep the store inside irreducible loops. + } } } - if (same_value || possibly_redundant) { + if (possibly_redundant) { possibly_removed_stores_.push_back(instruction); } - if (!same_value) { - if (possibly_redundant) { - DCHECK(instruction->IsInstanceFieldSet() || instruction->IsArraySet()); - // Put the store as the heap value. If the value is loaded from heap - // by a load later, this store isn't really redundant. - heap_values[idx] = instruction; - } else { - heap_values[idx] = value; - } - } + // Put the store as the heap value. If the value is loaded or needed after + // return/deoptimization later, this store isn't really redundant. + heap_values[idx] = instruction; + // This store may kill values in other heap locations due to aliasing. for (size_t i = 0; i < heap_values.size(); i++) { if (i == idx) { continue; } - if (heap_values[i] == value) { + if (Equal(heap_values[i], value)) { // Same value should be kept even if aliasing happens. continue; } @@ -547,7 +677,9 @@ class LSEVisitor : public HGraphDelegateVisitor { continue; } if (heap_location_collector_.MayAlias(i, idx)) { - // Kill heap locations that may alias. + // Kill heap locations that may alias and as a result if the heap value + // is a store, the store needs to be kept. + KeepIfIsStore(heap_values[i]); heap_values[i] = kUnknownHeapValue; } } @@ -633,24 +765,35 @@ class LSEVisitor : public HGraphDelegateVisitor { const ScopedArenaVector<HInstruction*>& heap_values = heap_values_for_[instruction->GetBlock()->GetBlockId()]; for (HInstruction* heap_value : heap_values) { - // Filter out fake instructions before checking instruction kind below. - if (heap_value == kUnknownHeapValue || heap_value == kDefaultHeapValue) { - continue; - } // A store is kept as the heap value for possibly removed stores. - if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) { - // Check whether the reference for a store is used by an environment local of - // HDeoptimize. + // That value stored is generally observable after deoptimization, except + // for singletons that don't escape after deoptimization.
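Editor's note: the deoptimization handling above keeps a tracked store alive whenever the interpreter could observe the stored value after the deopt, and lets it die only for non-finalizable singletons that are invisible to the HDeoptimize environment. A C++ analogue of the two situations (illustrative only; Deoptimize() stands in for an HDeoptimize point and Box is invented for the example):

struct Box { int value; };

// Stand-in for an HDeoptimize whose environment may or may not keep `b` alive.
void Deoptimize() {}

int Example(bool speculation_holds) {
  Box* b = new Box{0};    // non-escaping, non-finalizable singleton in this method
  b->value = 42;          // candidate store
  if (!speculation_holds) {
    Deoptimize();         // if `b` is referenced by this environment, the interpreter can
                          // observe b->value afterwards, so the store must be kept
  }
  int result = b->value;  // otherwise the store only feeds this load and both can go
  delete b;
  return result;
}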
+ if (IsStore(heap_value)) { + if (heap_value->IsStaticFieldSet()) { + KeepIfIsStore(heap_value); + continue; + } HInstruction* reference = heap_value->InputAt(0); - DCHECK(heap_location_collector_.FindReferenceInfoOf(reference)->IsSingleton()); - for (const HUseListNode<HEnvironment*>& use : reference->GetEnvUses()) { - HEnvironment* user = use.GetUser(); - if (user->GetHolder() == instruction) { - // The singleton for the store is visible at this deoptimization - // point. Need to keep the store so that the heap value is - // seen by the interpreter. + if (heap_location_collector_.FindReferenceInfoOf(reference)->IsSingleton()) { + if (reference->IsNewInstance() && reference->AsNewInstance()->IsFinalizable()) { + // Finalizable objects always escape. KeepIfIsStore(heap_value); + continue; + } + // Check whether the reference for a store is used by an environment local of + // HDeoptimize. If not, the singleton is not observed after + // deoptimization. + for (const HUseListNode<HEnvironment*>& use : reference->GetEnvUses()) { + HEnvironment* user = use.GetUser(); + if (user->GetHolder() == instruction) { + // The singleton for the store is visible at this deoptimization + // point. Need to keep the store so that the heap value is + // seen by the interpreter. + KeepIfIsStore(heap_value); + } } + } else { + KeepIfIsStore(heap_value); } } } @@ -691,9 +834,12 @@ class LSEVisitor : public HGraphDelegateVisitor { // Singleton references cannot be seen by the callee. } else { if (side_effects.DoesAnyRead()) { + // Invocation may read the heap value. KeepIfIsStore(heap_values[i]); } if (side_effects.DoesAnyWrite()) { + // Keep the store since it's not used to track the heap value anymore. + KeepIfIsStore(heap_values[i]); heap_values[i] = kUnknownHeapValue; } } @@ -758,7 +904,7 @@ class LSEVisitor : public HGraphDelegateVisitor { return; } if (ref_info->IsSingletonAndRemovable()) { - singleton_new_arrays_.push_back(new_array); + singleton_new_instances_.push_back(new_array); } ScopedArenaVector<HInstruction*>& heap_values = heap_values_for_[new_array->GetBlock()->GetBlockId()]; @@ -791,7 +937,6 @@ class LSEVisitor : public HGraphDelegateVisitor { ScopedArenaVector<HInstruction*> possibly_removed_stores_; ScopedArenaVector<HInstruction*> singleton_new_instances_; - ScopedArenaVector<HInstruction*> singleton_new_arrays_; DISALLOW_COPY_AND_ASSIGN(LSEVisitor); }; diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 47ef194574..b3f23a0dcd 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -1410,7 +1410,7 @@ void OptimizingCompiler::GenerateJitDebugInfo(ArtMethod* method, debug::MethodDe GetCompilerDriver()->GetInstructionSetFeatures(), mini_debug_info, ArrayRef<const debug::MethodDebugInfo>(&info, 1)); - MutexLock mu(Thread::Current(), g_jit_debug_mutex); + MutexLock mu(Thread::Current(), *Locks::native_debug_interface_lock_); JITCodeEntry* entry = CreateJITCodeEntry(elf_file); IncrementJITCodeEntryRefcount(entry, info.code_address); diff --git a/compiler/utils/atomic_dex_ref_map-inl.h b/compiler/utils/atomic_dex_ref_map-inl.h index 203e484fb7..7023b9a0e8 100644 --- a/compiler/utils/atomic_dex_ref_map-inl.h +++ b/compiler/utils/atomic_dex_ref_map-inl.h @@ -75,6 +75,18 @@ inline bool AtomicDexRefMap<DexFileReferenceType, Value>::Get(const DexFileRefer } template <typename DexFileReferenceType, typename Value> +inline bool AtomicDexRefMap<DexFileReferenceType, Value>::Remove(const
DexFileReferenceType& ref, + Value* out) { + ElementArray* const array = GetArray(ref.dex_file); + if (array == nullptr) { + return false; + } + *out = (*array)[ref.index].LoadRelaxed(); + (*array)[ref.index].StoreSequentiallyConsistent(nullptr); + return true; +} + +template <typename DexFileReferenceType, typename Value> inline void AtomicDexRefMap<DexFileReferenceType, Value>::AddDexFile(const DexFile* dex_file) { arrays_.Put(dex_file, std::move(ElementArray(NumberOfDexIndices(dex_file)))); } diff --git a/compiler/utils/atomic_dex_ref_map.h b/compiler/utils/atomic_dex_ref_map.h index 9ff506d6a4..3474e16b8d 100644 --- a/compiler/utils/atomic_dex_ref_map.h +++ b/compiler/utils/atomic_dex_ref_map.h @@ -45,6 +45,9 @@ class AtomicDexRefMap { // Retreive an item, returns false if the dex file is not added. bool Get(const DexFileReferenceType& ref, Value* out) const; + // Remove an item and return the existing value. Returns false if the dex file is not added. + bool Remove(const DexFileReferenceType& ref, Value* out); + // Dex files must be added before method references belonging to them can be used as keys. Not // thread safe. void AddDexFile(const DexFile* dex_file); |
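Editor's note: the new AtomicDexRefMap::Remove() reads the current value of a slot and then clears it. A hypothetical call site is sketched below; MethodReference and CompiledMethod* are assumptions about how the map is instantiated (only the Remove(ref, &out) contract and the need to AddDexFile() first come from this diff), and since the read-then-clear is two separate atomic operations, callers presumably provide their own synchronization against concurrent writers.

// Hypothetical usage sketch (assumes the ART headers that define these types).
void DiscardCompiledMethod(AtomicDexRefMap<MethodReference, CompiledMethod*>* map,
                           const MethodReference& ref) {
  // Precondition: AddDexFile(ref.dex_file) was called earlier, otherwise Remove()
  // returns false and leaves `method` untouched.
  CompiledMethod* method = nullptr;
  if (map->Remove(ref, &method) && method != nullptr) {
    // The slot now holds nullptr; the caller owns `method` and can release it exactly once.
  }
}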