78 files changed, 6464 insertions, 3189 deletions
diff --git a/compiler/Android.mk b/compiler/Android.mk
index e74a68f608..42ddfd83ab 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -67,7 +67,6 @@ LIBART_COMPILER_SRC_FILES := \
 	optimizing/builder.cc \
 	optimizing/code_generator.cc \
 	optimizing/code_generator_utils.cc \
-	optimizing/constant_area_fixups_x86.cc \
 	optimizing/constant_folding.cc \
 	optimizing/dead_code_elimination.cc \
 	optimizing/graph_checker.cc \
@@ -85,6 +84,7 @@ LIBART_COMPILER_SRC_FILES := \
 	optimizing/optimization.cc \
 	optimizing/optimizing_compiler.cc \
 	optimizing/parallel_move_resolver.cc \
+	optimizing/pc_relative_fixups_x86.cc \
 	optimizing/prepare_for_register_allocation.cc \
 	optimizing/primitive_type_propagation.cc \
 	optimizing/reference_type_propagation.cc \
diff --git a/compiler/buffered_output_stream.cc b/compiler/buffered_output_stream.cc
index 0940a80cc1..3ca518b686 100644
--- a/compiler/buffered_output_stream.cc
+++ b/compiler/buffered_output_stream.cc
@@ -25,12 +25,13 @@ BufferedOutputStream::BufferedOutputStream(OutputStream* out)
 
 bool BufferedOutputStream::WriteFully(const void* buffer, size_t byte_count) {
   if (byte_count > kBufferSize) {
-    Flush();
+    if (!Flush()) {
+      return false;
+    }
     return out_->WriteFully(buffer, byte_count);
   }
   if (used_ + byte_count > kBufferSize) {
-    bool success = Flush();
-    if (!success) {
+    if (!Flush()) {
       return false;
     }
   }
diff --git a/compiler/buffered_output_stream.h b/compiler/buffered_output_stream.h
index 15fc0335a9..b447f41e21 100644
--- a/compiler/buffered_output_stream.h
+++ b/compiler/buffered_output_stream.h
@@ -36,11 +36,11 @@ class BufferedOutputStream FINAL : public OutputStream {
 
   virtual off_t Seek(off_t offset, Whence whence);
 
+  bool Flush();
+
  private:
   static const size_t kBufferSize = 8 * KB;
 
-  bool Flush();
-
   OutputStream* const out_;
 
   uint8_t buffer_[kBufferSize];
diff --git a/compiler/cfi_test.h b/compiler/cfi_test.h
index 6fd457599f..508b04a16f 100644
--- a/compiler/cfi_test.h
+++ b/compiler/cfi_test.h
@@ -48,11 +48,11 @@ class CFITest : public dwarf::DwarfTest {
     // Pretty-print CFI opcodes.
     constexpr bool is64bit = false;
     dwarf::DebugFrameOpCodeWriter<> initial_opcodes;
-    dwarf::WriteDebugFrameCIE(is64bit, dwarf::DW_EH_PE_absptr, dwarf::Reg(8),
-                              initial_opcodes, kCFIFormat, &debug_frame_data_);
+    dwarf::WriteCIE(is64bit, dwarf::Reg(8),
+                    initial_opcodes, kCFIFormat, &debug_frame_data_);
     std::vector<uintptr_t> debug_frame_patches;
-    dwarf::WriteDebugFrameFDE(is64bit, 0, 0, actual_asm.size(), ArrayRef<const uint8_t>(actual_cfi),
-                              kCFIFormat, &debug_frame_data_, &debug_frame_patches);
+    dwarf::WriteFDE(is64bit, 0, 0, 0, actual_asm.size(), ArrayRef<const uint8_t>(actual_cfi),
+                    kCFIFormat, 0, &debug_frame_data_, &debug_frame_patches);
     ReformatCfi(Objdump(false, "-W"), &lines);
     // Pretty-print assembly.
     auto* opts = new DisassemblerOptions(false, actual_asm.data(), true);
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index c37cecaeac..e6cc50cc5e 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -223,6 +223,11 @@ void CommonCompilerTest::SetCompilerKind(Compiler::Kind compiler_kind) {
   compiler_kind_ = compiler_kind;
 }
 
+InstructionSet CommonCompilerTest::GetInstructionSet() const {
+  DCHECK(compiler_driver_.get() != nullptr);
+  return compiler_driver_->GetInstructionSet();
+}
+
 void CommonCompilerTest::TearDown() {
   timer_.reset();
   compiler_driver_.reset();
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 67b4428324..7b0e5af246 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -61,6 +61,8 @@ class CommonCompilerTest : public CommonRuntimeTest {
   Compiler::Kind GetCompilerKind() const;
   void SetCompilerKind(Compiler::Kind compiler_kind);
 
+  InstructionSet GetInstructionSet() const;
+
   // Get the set of image classes given to the compiler-driver in SetUp. Note: the compiler
   // driver assumes ownership of the set, so the test should properly release the set.
   virtual std::unordered_set<std::string>* GetImageClasses();
@@ -115,6 +117,31 @@ class CommonCompilerTest : public CommonRuntimeTest {
     return; \
   }
 
+// TODO: When read barrier works with all compilers in use, get rid of this.
+#define TEST_DISABLED_FOR_READ_BARRIER_WITH_QUICK() \
+  if (kUseReadBarrier && GetCompilerKind() == Compiler::kQuick) { \
+    printf("WARNING: TEST DISABLED FOR READ BARRIER WITH QUICK\n"); \
+    return; \
+  }
+
+// TODO: When read barrier works with all Optimizing back ends, get rid of this.
+#define TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS() \
+  if (kUseReadBarrier && GetCompilerKind() == Compiler::kOptimizing) {                    \
+    switch (GetInstructionSet()) {                                                        \
+      case kThumb2:                                                                       \
+      case kX86:                                                                          \
+      case kX86_64:                                                                       \
+        /* Instruction set has read barrier support. */                                   \
+        break;                                                                            \
+                                                                                          \
+      default:                                                                            \
+        /* Instruction set does not have barrier support. */                              \
+        printf("WARNING: TEST DISABLED FOR READ BARRIER WITH OPTIMIZING "                 \
+               "FOR THIS INSTRUCTION SET\n");                                             \
+        return;                                                                           \
+    }                                                                                     \
+  }
+
 // TODO: When non-PIC works with all compilers in use, get rid of this.
 #define TEST_DISABLED_FOR_NON_PIC_COMPILING_WITH_OPTIMIZING() \
   if (GetCompilerKind() == Compiler::kOptimizing) { \
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 2b60a51e22..5da72147b0 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1104,7 +1104,11 @@ void Mir2Lir::GenNewInstance(uint32_t type_idx, RegLocation rl_dest) {
   // access because the verifier was unable to?
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
-  if (driver->CanAccessInstantiableTypeWithoutChecks(cu_->method_idx, *dex_file, type_idx)) {
+  bool finalizable;
+  if (driver->CanAccessInstantiableTypeWithoutChecks(cu_->method_idx,
+                                                     *dex_file,
+                                                     type_idx,
+                                                     &finalizable)) {
     bool is_type_initialized;
     bool use_direct_type_ptr;
     uintptr_t direct_type_ptr;
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index 6673ea8ac5..05dde9f649 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -673,6 +673,12 @@ CompiledMethod* QuickCompiler::Compile(const DexFile::CodeItem* code_item,
     return nullptr;
   }
 
+  if (kEmitCompilerReadBarrier) {
+    VLOG(compiler) << "Skipping method : " << PrettyMethod(method_idx, dex_file)
+                   << "  Reason = Quick does not support read barrier.";
+    return nullptr;
+  }
+
   // TODO: check method fingerprint here to determine appropriate backend type.  Until then, use
   // build default.
   CompilerDriver* driver = GetCompilerDriver();
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index 8eb37cf3dc..0355f116f1 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -313,8 +313,9 @@ void VerifiedMethod::GenerateDevirtMap(verifier::MethodVerifier* method_verifier
       concrete_method = reg_type.GetClass()->FindVirtualMethodForVirtual(
           abstract_method, pointer_size);
     }
-    if (concrete_method == nullptr || concrete_method->IsAbstract()) {
-      // In cases where concrete_method is not found, or is abstract, continue to the next invoke.
+    if (concrete_method == nullptr || !concrete_method->IsInvokable()) {
+      // In cases where concrete_method is not found, or is not invokable, continue to the next
+      // invoke.
       continue;
     }
     if (reg_type.IsPreciseReference() || concrete_method->IsFinal() ||
diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h
index 14ba81d193..10841e6700 100644
--- a/compiler/driver/compiler_driver-inl.h
+++ b/compiler/driver/compiler_driver-inl.h
@@ -329,7 +329,7 @@ inline int CompilerDriver::IsFastInvoke(
       resolved_method->GetMethodIndex() < methods_class->GetVTableLength() &&
       (methods_class->GetVTableEntry(
           resolved_method->GetMethodIndex(), pointer_size) == resolved_method) &&
-      !resolved_method->IsAbstract();
+      resolved_method->IsInvokable();
 
   if (can_sharpen_virtual_based_on_type || can_sharpen_super_based_on_type) {
     // Sharpen a virtual call into a direct call. The method_idx is into referrer's
@@ -374,7 +374,7 @@ inline int CompilerDriver::IsFastInvoke(
           class_loader, nullptr, kVirtual);
     }
     CHECK(called_method != nullptr);
-    CHECK(!called_method->IsAbstract());
+    CHECK(called_method->IsInvokable());
     int stats_flags = kFlagMethodResolved;
     GetCodeAndMethodForDirectCall(/*out*/invoke_type,
                                   kDirect,  // Sharp type
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index aa5e411ba8..e42a73723b 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1208,7 +1208,8 @@ bool CompilerDriver::CanAccessTypeWithoutChecks(uint32_t referrer_idx, const Dex
 
 bool CompilerDriver::CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx,
                                                             const DexFile& dex_file,
-                                                            uint32_t type_idx) {
+                                                            uint32_t type_idx,
+                                                            bool* finalizable) {
   ScopedObjectAccess soa(Thread::Current());
   mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
       soa.Self(), dex_file, false);
@@ -1216,8 +1217,11 @@ bool CompilerDriver::CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_id
   mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
   if (resolved_class == nullptr) {
     stats_->TypeNeedsAccessCheck();
+    // Be conservative.
+    *finalizable = true;
     return false;  // Unknown class needs access checks.
   }
+  *finalizable = resolved_class->IsFinalizable();
   const DexFile::MethodId& method_id = dex_file.GetMethodId(referrer_idx);
   mirror::Class* referrer_class = dex_cache->GetResolvedType(method_id.class_idx_);
   if (referrer_class == nullptr) {
@@ -1552,7 +1556,7 @@ void CompilerDriver::GetCodeAndMethodForDirectCall(InvokeType* type, InvokeType
       *type = sharp_type;
     }
   } else {
-    auto* image_space = heap->GetImageSpace();
+    auto* image_space = heap->GetBootImageSpace();
     bool method_in_image = false;
     if (image_space != nullptr) {
       const auto& method_section = image_space->GetImageHeader().GetMethodsSection();
@@ -2034,8 +2038,14 @@ class VerifyClassVisitor : public CompilationVisitor {
       Handle<mirror::DexCache> dex_cache(hs.NewHandle(class_linker->FindDexCache(
           soa.Self(), dex_file, false)));
       std::string error_msg;
-      if (verifier::MethodVerifier::VerifyClass(soa.Self(), &dex_file, dex_cache, class_loader,
-                                                &class_def, true, &error_msg) ==
+      if (verifier::MethodVerifier::VerifyClass(soa.Self(),
+                                                &dex_file,
+                                                dex_cache,
+                                                class_loader,
+                                                &class_def,
+                                                true /* allow soft failures */,
+                                                true /* log hard failures */,
+                                                &error_msg) ==
                                                     verifier::MethodVerifier::kHardFailure) {
         LOG(ERROR) << "Verification failed on class " << PrettyDescriptor(descriptor)
                    << " because: " << error_msg;
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 5683b03a71..dae785b688 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -211,8 +211,11 @@ class CompilerDriver {
       REQUIRES(!Locks::mutator_lock_);
 
   // Are runtime access and instantiable checks necessary in the code?
-  bool CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx, const DexFile& dex_file,
-                                              uint32_t type_idx)
+  // out_is_finalizable is set to whether the type is finalizable.
+  bool CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx,
+                                              const DexFile& dex_file,
+                                              uint32_t type_idx,
+                                              bool* out_is_finalizable)
       REQUIRES(!Locks::mutator_lock_);
 
   bool CanEmbedTypeInCode(const DexFile& dex_file, uint32_t type_idx,
diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc
index 1107599779..f8de9fa4a1 100644
--- a/compiler/driver/compiler_driver_test.cc
+++ b/compiler/driver/compiler_driver_test.cc
@@ -147,6 +147,8 @@ TEST_F(CompilerDriverTest, DISABLED_LARGE_CompileDexLibCore) {
 
 TEST_F(CompilerDriverTest, AbstractMethodErrorStub) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING_WITH_QUICK();
+  TEST_DISABLED_FOR_READ_BARRIER_WITH_QUICK();
+  TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS();
   jobject class_loader;
   {
     ScopedObjectAccess soa(Thread::Current());
@@ -193,6 +195,8 @@ class CompilerDriverMethodsTest : public CompilerDriverTest {
 
 TEST_F(CompilerDriverMethodsTest, Selection) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING_WITH_QUICK();
+  TEST_DISABLED_FOR_READ_BARRIER_WITH_QUICK();
+  TEST_DISABLED_FOR_READ_BARRIER_WITH_OPTIMIZING_FOR_UNSUPPORTED_INSTRUCTION_SETS();
   Thread* self = Thread::Current();
   jobject class_loader;
   {
diff --git a/compiler/dwarf/debug_info_entry_writer.h b/compiler/dwarf/debug_info_entry_writer.h
index d9b367bdf1..aa31036c8b 100644
--- a/compiler/dwarf/debug_info_entry_writer.h
+++ b/compiler/dwarf/debug_info_entry_writer.h
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <unordered_map>
 
+#include "base/casts.h"
 #include "dwarf/dwarf_constants.h"
 #include "dwarf/writer.h"
 #include "leb128.h"
@@ -47,9 +48,9 @@ struct FNVHash {
  * It also handles generation of abbreviations.
  *
  * Usage:
- *   StartTag(DW_TAG_compile_unit, DW_CHILDREN_yes);
+ *   StartTag(DW_TAG_compile_unit);
  *     WriteStrp(DW_AT_producer, "Compiler name", debug_str);
- *     StartTag(DW_TAG_subprogram, DW_CHILDREN_no);
+ *     StartTag(DW_TAG_subprogram);
  *       WriteStrp(DW_AT_name, "Foo", debug_str);
  *     EndTag();
  *   EndTag();
@@ -59,36 +60,40 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
   static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
 
  public:
+  static constexpr size_t kCompilationUnitHeaderSize = 11;
+
   // Start debugging information entry.
-  void StartTag(Tag tag, Children children) {
-    DCHECK(has_children) << "This tag can not have nested tags";
+  // Returns offset of the entry in compilation unit.
+  size_t StartTag(Tag tag) {
     if (inside_entry_) {
       // Write abbrev code for the previous entry.
-      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev());
+      // Parent entry is finalized before any children are written.
+      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev(DW_CHILDREN_yes));
       inside_entry_ = false;
     }
-    StartAbbrev(tag, children);
+    StartAbbrev(tag);
     // Abbrev code placeholder of sufficient size.
     abbrev_code_offset_ = this->data()->size();
     this->PushUleb128(NextAbbrevCode());
     depth_++;
     inside_entry_ = true;
-    has_children = (children == DW_CHILDREN_yes);
+    return abbrev_code_offset_ + kCompilationUnitHeaderSize;
   }
 
   // End debugging information entry.
   void EndTag() {
     DCHECK_GT(depth_, 0);
     if (inside_entry_) {
-      // Write abbrev code for this tag.
-      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev());
+      // Write abbrev code for this entry.
+      this->UpdateUleb128(abbrev_code_offset_, EndAbbrev(DW_CHILDREN_no));
       inside_entry_ = false;
-    }
-    if (has_children) {
-      this->PushUint8(0);  // End of children.
+      // This entry has no children and so there is no terminator.
+    } else {
+      // The entry has been already finalized so it must be parent entry
+      // and we need to write the terminator required by DW_CHILDREN_yes.
+      this->PushUint8(0);
     }
     depth_--;
-    has_children = true;  // Parent tag obviously has children.
   }
 
   void WriteAddr(Attribute attrib, uint64_t value) {
@@ -101,10 +106,10 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
     }
   }
 
-  void WriteBlock(Attribute attrib, const void* ptr, int size) {
+  void WriteBlock(Attribute attrib, const void* ptr, size_t num_bytes) {
     AddAbbrevAttribute(attrib, DW_FORM_block);
-    this->PushUleb128(size);
-    this->PushData(ptr, size);
+    this->PushUleb128(num_bytes);
+    this->PushData(ptr, num_bytes);
   }
 
   void WriteData1(Attribute attrib, uint8_t value) {
@@ -147,12 +152,12 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
     this->PushUint8(value ? 1 : 0);
   }
 
-  void WriteRef4(Attribute attrib, int cu_offset) {
+  void WriteRef4(Attribute attrib, uint32_t cu_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_ref4);
     this->PushUint32(cu_offset);
   }
 
-  void WriteRef(Attribute attrib, int cu_offset) {
+  void WriteRef(Attribute attrib, uint32_t cu_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_ref_udata);
     this->PushUleb128(cu_offset);
   }
@@ -162,16 +167,21 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
     this->PushString(value);
   }
 
-  void WriteStrp(Attribute attrib, int address) {
+  void WriteStrp(Attribute attrib, size_t debug_str_offset) {
     AddAbbrevAttribute(attrib, DW_FORM_strp);
-    this->PushUint32(address);
+    this->PushUint32(dchecked_integral_cast<uint32_t>(debug_str_offset));
   }
 
-  void WriteStrp(Attribute attrib, const char* value, std::vector<uint8_t>* debug_str) {
+  void WriteStrp(Attribute attrib, const char* str, size_t len,
+                 std::vector<uint8_t>* debug_str) {
     AddAbbrevAttribute(attrib, DW_FORM_strp);
-    int address = debug_str->size();
-    debug_str->insert(debug_str->end(), value, value + strlen(value) + 1);
-    this->PushUint32(address);
+    this->PushUint32(dchecked_integral_cast<uint32_t>(debug_str->size()));
+    debug_str->insert(debug_str->end(), str, str + len);
+    debug_str->push_back(0);
+  }
+
+  void WriteStrp(Attribute attrib, const char* str, std::vector<uint8_t>* debug_str) {
+    WriteStrp(attrib, str, strlen(str), debug_str);
   }
 
   bool Is64bit() const { return is64bit_; }
@@ -180,7 +190,11 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
     return patch_locations_;
   }
 
+  int Depth() const { return depth_; }
+
   using Writer<Vector>::data;
+  using Writer<Vector>::size;
+  using Writer<Vector>::UpdateUint32;
 
   DebugInfoEntryWriter(bool is64bitArch,
                        Vector* debug_abbrev,
@@ -196,16 +210,17 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
   }
 
   ~DebugInfoEntryWriter() {
+    DCHECK(!inside_entry_);
     DCHECK_EQ(depth_, 0);
   }
 
  private:
   // Start abbreviation declaration.
-  void StartAbbrev(Tag tag, Children children) {
-    DCHECK(!inside_entry_);
+  void StartAbbrev(Tag tag) {
     current_abbrev_.clear();
     EncodeUnsignedLeb128(&current_abbrev_, tag);
-    current_abbrev_.push_back(children);
+    has_children_offset_ = current_abbrev_.size();
+    current_abbrev_.push_back(0);  // Place-holder for DW_CHILDREN.
   }
 
   // Add attribute specification.
@@ -220,8 +235,9 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
   }
 
   // End abbreviation declaration and return its code.
-  int EndAbbrev() {
-    DCHECK(inside_entry_);
+  int EndAbbrev(Children has_children) {
+    DCHECK(!current_abbrev_.empty());
+    current_abbrev_[has_children_offset_] = has_children;
     auto it = abbrev_codes_.insert(std::make_pair(std::move(current_abbrev_),
                                                   NextAbbrevCode()));
     int abbrev_code = it.first->second;
@@ -241,6 +257,7 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
   // Fields for writing and deduplication of abbrevs.
   Writer<Vector> debug_abbrev_;
   Vector current_abbrev_;
+  size_t has_children_offset_ = 0;
   std::unordered_map<Vector, int,
                      FNVHash<Vector> > abbrev_codes_;
 
@@ -250,7 +267,6 @@ class DebugInfoEntryWriter FINAL : private Writer<Vector> {
   int depth_ = 0;
   size_t abbrev_code_offset_ = 0;  // Location to patch once we know the code.
   bool inside_entry_ = false;  // Entry ends at first child (if any).
-  bool has_children = true;
   std::vector<uintptr_t> patch_locations_;
 };
 
diff --git a/compiler/dwarf/dedup_vector.h b/compiler/dwarf/dedup_vector.h
new file mode 100644
index 0000000000..7fb21b76e2
--- /dev/null
+++ b/compiler/dwarf/dedup_vector.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_DWARF_DEDUP_VECTOR_H_
+#define ART_COMPILER_DWARF_DEDUP_VECTOR_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include <unordered_map>
+
+namespace art {
+namespace dwarf {
+  // Append-only byte buffer that reuses the offset of an identical, previously inserted block.
+  class DedupVector {
+   public:
+    // Returns an offset to previously inserted identical block of data,
+    // or appends the data at the end of the vector and returns offset to it.
+    size_t Insert(const uint8_t* ptr, size_t num_bytes) {
+      // 32-bit FNV-1a hash of the block (xor each byte in, then multiply by the prime).
+      // See http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+      uint32_t hash = 2166136261u;
+      for (size_t i = 0; i < num_bytes; i++) {
+        hash = (hash ^ ptr[i]) * 16777619u;
+      }
+      // Try to find existing copy of the data.
+      const auto& range = hash_to_offset_.equal_range(hash);
+      for (auto it = range.first; it != range.second; ++it) {
+        const size_t offset = it->second;
+        // Different blocks can share a hash, so check that num_bytes bytes are
+        // available at the candidate offset and compare the actual contents.
+        if (offset + num_bytes <= vector_.size() &&
+            memcmp(vector_.data() + offset, ptr, num_bytes) == 0) {
+          return offset;
+        }
+      }
+      // Append the data at the end of the vector.
+      const size_t new_offset = vector_.size();
+      hash_to_offset_.emplace(hash, new_offset);
+      vector_.insert(vector_.end(), ptr, ptr + num_bytes);
+      return new_offset;
+    }
+
+    // Returns the underlying byte vector that Insert() appends to.
+    const std::vector<uint8_t>& Data() const { return vector_; }
+
+   private:
+    // The map keys are already hashes, so they are returned unchanged as the bucket hash.
+    struct IdentityHash {
+      size_t operator()(uint32_t v) const { return v; }
+    };
+
+    // We store the full hash as the key to simplify growing of the table.
+    // It avoids storing or referencing the actual data in the hash-table.
+    std::unordered_multimap<uint32_t, size_t, IdentityHash> hash_to_offset_;
+
+    // Bytes of every block that Insert() had to append, in insertion order.
+    std::vector<uint8_t> vector_;
+  };
+}  // namespace dwarf
+}  // namespace art
+
+#endif  // ART_COMPILER_DWARF_DEDUP_VECTOR_H_
diff --git a/compiler/dwarf/dwarf_test.cc b/compiler/dwarf/dwarf_test.cc
index 3ba380e9db..e9cd421da9 100644
--- a/compiler/dwarf/dwarf_test.cc
+++ b/compiler/dwarf/dwarf_test.cc
@@ -122,12 +122,12 @@ TEST_F(DwarfTest, DebugFrame) {
   DW_CHECK_NEXT("DW_CFA_restore: r5 (ebp)");
 
   DebugFrameOpCodeWriter<> initial_opcodes;
-  WriteDebugFrameCIE(is64bit, DW_EH_PE_absptr, Reg(is64bit ? 16 : 8),
-                     initial_opcodes, kCFIFormat, &debug_frame_data_);
+  WriteCIE(is64bit, Reg(is64bit ? 16 : 8),
+           initial_opcodes, kCFIFormat, &debug_frame_data_);
   std::vector<uintptr_t> debug_frame_patches;
   std::vector<uintptr_t> expected_patches { 28 };  // NOLINT
-  WriteDebugFrameFDE(is64bit, 0, 0x01000000, 0x01000000, ArrayRef<const uint8_t>(*opcodes.data()),
-                     kCFIFormat, &debug_frame_data_, &debug_frame_patches);
+  WriteFDE(is64bit, 0, 0, 0x01000000, 0x01000000, ArrayRef<const uint8_t>(*opcodes.data()),
+           kCFIFormat, 0, &debug_frame_data_, &debug_frame_patches);
 
   EXPECT_EQ(expected_patches, debug_frame_patches);
   CheckObjdumpOutput(is64bit, "-W");
@@ -136,14 +136,14 @@ TEST_F(DwarfTest, DebugFrame) {
 TEST_F(DwarfTest, DebugFrame64) {
   constexpr bool is64bit = true;
   DebugFrameOpCodeWriter<> initial_opcodes;
-  WriteDebugFrameCIE(is64bit, DW_EH_PE_absptr, Reg(16),
-                     initial_opcodes, kCFIFormat, &debug_frame_data_);
+  WriteCIE(is64bit, Reg(16),
+           initial_opcodes, kCFIFormat, &debug_frame_data_);
   DebugFrameOpCodeWriter<> opcodes;
   std::vector<uintptr_t> debug_frame_patches;
   std::vector<uintptr_t> expected_patches { 32 };  // NOLINT
-  WriteDebugFrameFDE(is64bit, 0, 0x0100000000000000, 0x0200000000000000,
-                     ArrayRef<const uint8_t>(*opcodes.data()),
-                     kCFIFormat, &debug_frame_data_, &debug_frame_patches);
+  WriteFDE(is64bit, 0, 0, 0x0100000000000000, 0x0200000000000000,
+           ArrayRef<const uint8_t>(*opcodes.data()),
+                     kCFIFormat, 0, &debug_frame_data_, &debug_frame_patches);
   DW_CHECK("FDE cie=00000000 pc=100000000000000..300000000000000");
 
   EXPECT_EQ(expected_patches, debug_frame_patches);
@@ -176,12 +176,12 @@ TEST_F(DwarfTest, x86_64_RegisterMapping) {
   DW_CHECK_NEXT("DW_CFA_offset: r14 (r14)");
   DW_CHECK_NEXT("DW_CFA_offset: r15 (r15)");
   DebugFrameOpCodeWriter<> initial_opcodes;
-  WriteDebugFrameCIE(is64bit, DW_EH_PE_absptr, Reg(16),
-                     initial_opcodes, kCFIFormat, &debug_frame_data_);
+  WriteCIE(is64bit, Reg(16),
+           initial_opcodes, kCFIFormat, &debug_frame_data_);
   std::vector<uintptr_t> debug_frame_patches;
-  WriteDebugFrameFDE(is64bit, 0, 0x0100000000000000, 0x0200000000000000,
-                     ArrayRef<const uint8_t>(*opcodes.data()),
-                     kCFIFormat, &debug_frame_data_, &debug_frame_patches);
+  WriteFDE(is64bit, 0, 0, 0x0100000000000000, 0x0200000000000000,
+           ArrayRef<const uint8_t>(*opcodes.data()),
+                     kCFIFormat, 0, &debug_frame_data_, &debug_frame_patches);
 
   CheckObjdumpOutput(is64bit, "-W");
 }
@@ -237,7 +237,7 @@ TEST_F(DwarfTest, DebugLine) {
   std::vector<uintptr_t> debug_line_patches;
   std::vector<uintptr_t> expected_patches { 87 };  // NOLINT
   WriteDebugLineTable(include_directories, files, opcodes,
-                      &debug_line_data_, &debug_line_patches);
+                      0, &debug_line_data_, &debug_line_patches);
 
   EXPECT_EQ(expected_patches, debug_line_patches);
   CheckObjdumpOutput(is64bit, "-W");
@@ -276,7 +276,7 @@ TEST_F(DwarfTest, DebugLineSpecialOpcodes) {
   std::vector<FileEntry> files { { "file.c", 0, 1000, 2000 } };  // NOLINT
   std::vector<uintptr_t> debug_line_patches;
   WriteDebugLineTable(directories, files, opcodes,
-                      &debug_line_data_, &debug_line_patches);
+                      0, &debug_line_data_, &debug_line_patches);
 
   CheckObjdumpOutput(is64bit, "-W -WL");
 }
@@ -285,7 +285,7 @@ TEST_F(DwarfTest, DebugInfo) {
   constexpr bool is64bit = false;
   DebugInfoEntryWriter<> info(is64bit, &debug_abbrev_data_);
   DW_CHECK("Contents of the .debug_info section:");
-  info.StartTag(dwarf::DW_TAG_compile_unit, dwarf::DW_CHILDREN_yes);
+  info.StartTag(dwarf::DW_TAG_compile_unit);
   DW_CHECK("Abbrev Number: 1 (DW_TAG_compile_unit)");
   info.WriteStrp(dwarf::DW_AT_producer, "Compiler name", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_producer    : (indirect string, offset: 0x0): Compiler name");
@@ -293,7 +293,7 @@ TEST_F(DwarfTest, DebugInfo) {
   DW_CHECK_NEXT("DW_AT_low_pc      : 0x1000000");
   info.WriteAddr(dwarf::DW_AT_high_pc, 0x02000000);
   DW_CHECK_NEXT("DW_AT_high_pc     : 0x2000000");
-  info.StartTag(dwarf::DW_TAG_subprogram, dwarf::DW_CHILDREN_no);
+  info.StartTag(dwarf::DW_TAG_subprogram);
   DW_CHECK("Abbrev Number: 2 (DW_TAG_subprogram)");
   info.WriteStrp(dwarf::DW_AT_name, "Foo", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_name        : (indirect string, offset: 0xe): Foo");
@@ -302,7 +302,7 @@ TEST_F(DwarfTest, DebugInfo) {
   info.WriteAddr(dwarf::DW_AT_high_pc, 0x01020000);
   DW_CHECK_NEXT("DW_AT_high_pc     : 0x1020000");
   info.EndTag();  // DW_TAG_subprogram
-  info.StartTag(dwarf::DW_TAG_subprogram, dwarf::DW_CHILDREN_no);
+  info.StartTag(dwarf::DW_TAG_subprogram);
   DW_CHECK("Abbrev Number: 2 (DW_TAG_subprogram)");
   info.WriteStrp(dwarf::DW_AT_name, "Bar", &debug_str_data_);
   DW_CHECK_NEXT("DW_AT_name        : (indirect string, offset: 0x12): Bar");
@@ -313,7 +313,7 @@ TEST_F(DwarfTest, DebugInfo) {
   info.EndTag();  // DW_TAG_subprogram
   info.EndTag();  // DW_TAG_compile_unit
   // Test that previous list was properly terminated and empty children.
-  info.StartTag(dwarf::DW_TAG_compile_unit, dwarf::DW_CHILDREN_yes);
+  info.StartTag(dwarf::DW_TAG_compile_unit);
   info.EndTag();  // DW_TAG_compile_unit
 
   // The abbrev table is just side product, but check it as well.
@@ -327,12 +327,12 @@ TEST_F(DwarfTest, DebugInfo) {
   DW_CHECK_NEXT("DW_AT_name         DW_FORM_strp");
   DW_CHECK_NEXT("DW_AT_low_pc       DW_FORM_addr");
   DW_CHECK_NEXT("DW_AT_high_pc      DW_FORM_addr");
-  DW_CHECK("3      DW_TAG_compile_unit    [has children]");
+  DW_CHECK("3      DW_TAG_compile_unit    [no children]");
 
   std::vector<uintptr_t> debug_info_patches;
   std::vector<uintptr_t> expected_patches { 16, 20, 29, 33, 42, 46 };  // NOLINT
   dwarf::WriteDebugInfoCU(0 /* debug_abbrev_offset */, info,
-                          &debug_info_data_, &debug_info_patches);
+                          0, &debug_info_data_, &debug_info_patches);
 
   EXPECT_EQ(expected_patches, debug_info_patches);
   CheckObjdumpOutput(is64bit, "-W");
diff --git a/compiler/dwarf/dwarf_test.h b/compiler/dwarf/dwarf_test.h
index f819c49cee..5464ed9c49 100644
--- a/compiler/dwarf/dwarf_test.h
+++ b/compiler/dwarf/dwarf_test.h
@@ -59,38 +59,27 @@ class DwarfTest : public CommonRuntimeTest {
   std::vector<std::string> Objdump(const char* args) {
     // Write simple elf file with just the DWARF sections.
     InstructionSet isa = (sizeof(typename ElfTypes::Addr) == 8) ? kX86_64 : kX86;
-    class NoCode : public CodeOutput {
-      bool Write(OutputStream*) OVERRIDE { return true; }  // NOLINT
-    } no_code;
-    ElfBuilder<ElfTypes> builder(isa, 0, &no_code, 0, &no_code, 0);
-    typedef typename ElfBuilder<ElfTypes>::RawSection RawSection;
-    RawSection debug_info(".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    RawSection debug_abbrev(".debug_abbrev", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    RawSection debug_str(".debug_str", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    RawSection debug_line(".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0);
-    RawSection debug_frame(".debug_frame", SHT_PROGBITS, 0, nullptr, 0, 8, 0);
+    ScratchFile file;
+    FileOutputStream output_stream(file.GetFile());
+    ElfBuilder<ElfTypes> builder(isa, &output_stream);
+    builder.Start();
     if (!debug_info_data_.empty()) {
-      debug_info.SetBuffer(debug_info_data_);
-      builder.RegisterSection(&debug_info);
+      builder.WriteSection(".debug_info", &debug_info_data_);
     }
     if (!debug_abbrev_data_.empty()) {
-      debug_abbrev.SetBuffer(debug_abbrev_data_);
-      builder.RegisterSection(&debug_abbrev);
+      builder.WriteSection(".debug_abbrev", &debug_abbrev_data_);
     }
     if (!debug_str_data_.empty()) {
-      debug_str.SetBuffer(debug_str_data_);
-      builder.RegisterSection(&debug_str);
+      builder.WriteSection(".debug_str", &debug_str_data_);
     }
     if (!debug_line_data_.empty()) {
-      debug_line.SetBuffer(debug_line_data_);
-      builder.RegisterSection(&debug_line);
+      builder.WriteSection(".debug_line", &debug_line_data_);
     }
     if (!debug_frame_data_.empty()) {
-      debug_frame.SetBuffer(debug_frame_data_);
-      builder.RegisterSection(&debug_frame);
+      builder.WriteSection(".debug_frame", &debug_frame_data_);
     }
-    ScratchFile file;
-    builder.Write(file.GetFile());
+    builder.End();
+    EXPECT_TRUE(builder.Good());
 
     // Read the elf file back using objdump.
     std::vector<std::string> lines;
diff --git a/compiler/dwarf/headers.h b/compiler/dwarf/headers.h
index f3fba4b1fa..c75aeacabd 100644
--- a/compiler/dwarf/headers.h
+++ b/compiler/dwarf/headers.h
@@ -38,15 +38,14 @@ namespace dwarf {
 
 // Write common information entry (CIE) to .debug_frame or .eh_frame section.
 template<typename Vector>
-void WriteDebugFrameCIE(bool is64bit,
-                        ExceptionHeaderValueApplication address_type,
-                        Reg return_address_register,
-                        const DebugFrameOpCodeWriter<Vector>& opcodes,
-                        CFIFormat format,
-                        std::vector<uint8_t>* debug_frame) {
+void WriteCIE(bool is64bit,
+              Reg return_address_register,
+              const DebugFrameOpCodeWriter<Vector>& opcodes,
+              CFIFormat format,
+              std::vector<uint8_t>* buffer) {
   static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
 
-  Writer<> writer(debug_frame);
+  Writer<> writer(buffer);
   size_t cie_header_start_ = writer.data()->size();
   writer.PushUint32(0);  // Length placeholder.
   writer.PushUint32((format == DW_EH_FRAME_FORMAT) ? 0 : 0xFFFFFFFF);  // CIE id.
@@ -57,17 +56,17 @@ void WriteDebugFrameCIE(bool is64bit,
   writer.PushUleb128(return_address_register.num());  // ubyte in DWARF2.
   writer.PushUleb128(1);  // z: Augmentation data size.
   if (is64bit) {
-    if (address_type == DW_EH_PE_pcrel) {
+    if (format == DW_EH_FRAME_FORMAT) {
       writer.PushUint8(DW_EH_PE_pcrel | DW_EH_PE_sdata8);   // R: Pointer encoding.
     } else {
-      DCHECK(address_type == DW_EH_PE_absptr);
+      DCHECK(format == DW_DEBUG_FRAME_FORMAT);
       writer.PushUint8(DW_EH_PE_absptr | DW_EH_PE_udata8);  // R: Pointer encoding.
     }
   } else {
-    if (address_type == DW_EH_PE_pcrel) {
+    if (format == DW_EH_FRAME_FORMAT) {
       writer.PushUint8(DW_EH_PE_pcrel | DW_EH_PE_sdata4);   // R: Pointer encoding.
     } else {
-      DCHECK(address_type == DW_EH_PE_absptr);
+      DCHECK(format == DW_DEBUG_FRAME_FORMAT);
       writer.PushUint8(DW_EH_PE_absptr | DW_EH_PE_udata4);  // R: Pointer encoding.
     }
   }
@@ -78,30 +77,44 @@ void WriteDebugFrameCIE(bool is64bit,
 
 // Write frame description entry (FDE) to .debug_frame or .eh_frame section.
 inline
-void WriteDebugFrameFDE(bool is64bit, size_t cie_offset,
-                        uint64_t initial_address, uint64_t address_range,
-                        const ArrayRef<const uint8_t>& opcodes,
-                        CFIFormat format,
-                        std::vector<uint8_t>* debug_frame,
-                        std::vector<uintptr_t>* debug_frame_patches) {
-  Writer<> writer(debug_frame);
+void WriteFDE(bool is64bit,
+              uint64_t section_address,  // Absolute address of the section.
+              uint64_t cie_address,  // Absolute address of last CIE.
+              uint64_t code_address,
+              uint64_t code_size,
+              const ArrayRef<const uint8_t>& opcodes,
+              CFIFormat format,
+              uint64_t buffer_address,  // Address of buffer in linked application.
+              std::vector<uint8_t>* buffer,
+              std::vector<uintptr_t>* patch_locations) {
+  CHECK_GE(cie_address, section_address);
+  CHECK_GE(buffer_address, section_address);
+
+  Writer<> writer(buffer);
   size_t fde_header_start = writer.data()->size();
   writer.PushUint32(0);  // Length placeholder.
   if (format == DW_EH_FRAME_FORMAT) {
-    uint32_t cie_pointer = writer.data()->size() - cie_offset;
+    uint32_t cie_pointer = (buffer_address + buffer->size()) - cie_address;
     writer.PushUint32(cie_pointer);
   } else {
-    uint32_t cie_pointer = cie_offset;
+    DCHECK(format == DW_DEBUG_FRAME_FORMAT);
+    uint32_t cie_pointer = cie_address - section_address;
     writer.PushUint32(cie_pointer);
   }
-  // Relocate initial_address, but not address_range (it is size).
-  debug_frame_patches->push_back(writer.data()->size());
+  if (format == DW_EH_FRAME_FORMAT) {
+    // .eh_frame encodes the location as a pc-relative address.
+    code_address -= buffer_address + buffer->size();
+  } else {
+    DCHECK(format == DW_DEBUG_FRAME_FORMAT);
+    // Relocate code_address if it has absolute value.
+    patch_locations->push_back(buffer_address + buffer->size() - section_address);
+  }
   if (is64bit) {
-    writer.PushUint64(initial_address);
-    writer.PushUint64(address_range);
+    writer.PushUint64(code_address);
+    writer.PushUint64(code_size);
   } else {
-    writer.PushUint32(initial_address);
-    writer.PushUint32(address_range);
+    writer.PushUint32(code_address);
+    writer.PushUint32(code_size);
   }
   writer.PushUleb128(0);  // Augmentation data size.
   writer.PushData(opcodes);
@@ -113,6 +126,7 @@ void WriteDebugFrameFDE(bool is64bit, size_t cie_offset,
 template<typename Vector>
 void WriteDebugInfoCU(uint32_t debug_abbrev_offset,
                       const DebugInfoEntryWriter<Vector>& entries,
+                      size_t debug_info_offset,  // offset from start of .debug_info.
                       std::vector<uint8_t>* debug_info,
                       std::vector<uintptr_t>* debug_info_patches) {
   static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
@@ -124,11 +138,12 @@ void WriteDebugInfoCU(uint32_t debug_abbrev_offset,
   writer.PushUint32(debug_abbrev_offset);
   writer.PushUint8(entries.Is64bit() ? 8 : 4);
   size_t entries_offset = writer.data()->size();
+  DCHECK_EQ(entries_offset, DebugInfoEntryWriter<Vector>::kCompilationUnitHeaderSize);
   writer.PushData(*entries.data());
   writer.UpdateUint32(start, writer.data()->size() - start - 4);
   // Copy patch locations and make them relative to .debug_info section.
   for (uintptr_t patch_location : entries.GetPatchLocations()) {
-    debug_info_patches->push_back(entries_offset + patch_location);
+    debug_info_patches->push_back(debug_info_offset + entries_offset + patch_location);
   }
 }
 
@@ -144,6 +159,7 @@ template<typename Vector>
 void WriteDebugLineTable(const std::vector<std::string>& include_directories,
                          const std::vector<FileEntry>& files,
                          const DebugLineOpCodeWriter<Vector>& opcodes,
+                         size_t debug_line_offset,  // offset from start of .debug_line.
                          std::vector<uint8_t>* debug_line,
                          std::vector<uintptr_t>* debug_line_patches) {
   static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
@@ -184,7 +200,7 @@ void WriteDebugLineTable(const std::vector<std::string>& include_directories,
   writer.UpdateUint32(header_start, writer.data()->size() - header_start - 4);
   // Copy patch locations and make them relative to .debug_line section.
   for (uintptr_t patch_location : opcodes.GetPatchLocations()) {
-    debug_line_patches->push_back(opcodes_offset + patch_location);
+    debug_line_patches->push_back(debug_line_offset + opcodes_offset + patch_location);
   }
 }
 
diff --git a/compiler/dwarf/writer.h b/compiler/dwarf/writer.h
index 00b9dfa303..d2add7f026 100644
--- a/compiler/dwarf/writer.h
+++ b/compiler/dwarf/writer.h
@@ -114,9 +114,9 @@ class Writer {
     data_->insert(data_->end(), value, value + strlen(value) + 1);
   }
 
-  void PushData(const void* ptr, size_t size) {
+  void PushData(const void* ptr, size_t num_bytes) {
     const char* p = reinterpret_cast<const char*>(ptr);
-    data_->insert(data_->end(), p, p + size);
+    data_->insert(data_->end(), p, p + num_bytes);
   }
 
   template<typename Vector2>
@@ -164,6 +164,10 @@ class Writer {
     return data_;
   }
 
+  size_t size() const {
+    return data_->size();
+  }
+
   explicit Writer(Vector* buffer) : data_(buffer) { }
 
  private:
diff --git a/compiler/elf_builder.h b/compiler/elf_builder.h
index e977798ab1..6e8dfd60fb 100644
--- a/compiler/elf_builder.h
+++ b/compiler/elf_builder.h
@@ -21,27 +21,58 @@
 
 #include "arch/instruction_set.h"
 #include "base/bit_utils.h"
+#include "base/casts.h"
 #include "base/unix_file/fd_file.h"
 #include "buffered_output_stream.h"
 #include "elf_utils.h"
 #include "file_output_stream.h"
+#include "leb128.h"
 
 namespace art {
 
-class CodeOutput {
- public:
-  virtual bool Write(OutputStream* out) = 0;
-  virtual ~CodeOutput() {}
-};
-
 // Writes ELF file.
-// The main complication is that the sections often want to reference
-// each other.  We solve this by writing the ELF file in two stages:
-//  * Sections are asked about their size, and overall layout is calculated.
-//  * Sections do the actual writes which may use offsets of other sections.
+//
+// The basic layout of the elf file:
+//   Elf_Ehdr                    - The ELF header.
+//   Elf_Phdr[]                  - Program headers for the linker.
+//   .rodata                     - DEX files and oat metadata.
+//   .text                       - Compiled code.
+//   .bss                        - Zero-initialized writeable section.
+//   .dynstr                     - Names for .dynsym.
+//   .dynsym                     - A few oat-specific dynamic symbols.
+//   .hash                       - Hash-table for .dynsym.
+//   .dynamic                    - Tags which let the linker locate .dynsym.
+//   .strtab                     - Names for .symtab.
+//   .symtab                     - Debug symbols.
+//   .eh_frame                   - Unwind information (CFI).
+//   .eh_frame_hdr               - Index of .eh_frame.
+//   .debug_frame                - Unwind information (CFI).
+//   .debug_frame.oat_patches    - Addresses for relocation.
+//   .debug_info                 - Debug information.
+//   .debug_info.oat_patches     - Addresses for relocation.
+//   .debug_abbrev               - Decoding information for .debug_info.
+//   .debug_str                  - Strings for .debug_info.
+//   .debug_line                 - Line number tables.
+//   .debug_line.oat_patches     - Addresses for relocation.
+//   .text.oat_patches           - Addresses for relocation.
+//   .shstrtab                   - Names of ELF sections.
+//   Elf_Shdr[]                  - Section headers.
+//
+// Some sections are optional (the debug sections in particular).
+//
+// We try to write the section data directly into the file without much
+// in-memory buffering.  This means we generally write sections based on the
+// dependency order (e.g. .dynamic points to .dynsym which points to .text).
+//
+// In the cases where we need to buffer, we write the larger section first
+// and buffer the smaller one (e.g. .strtab is bigger than .symtab).
+//
+// The debug sections are written last for easier stripping.
+//
 template <typename ElfTypes>
 class ElfBuilder FINAL {
  public:
+  static constexpr size_t kMaxProgramHeaders = 16;
   using Elf_Addr = typename ElfTypes::Addr;
   using Elf_Off = typename ElfTypes::Off;
   using Elf_Word = typename ElfTypes::Word;
@@ -53,776 +84,429 @@ class ElfBuilder FINAL {
   using Elf_Dyn = typename ElfTypes::Dyn;
 
   // Base class of all sections.
-  class Section {
+  class Section : public OutputStream {
    public:
-    Section(const std::string& name, Elf_Word type, Elf_Word flags,
-            const Section* link, Elf_Word info, Elf_Word align, Elf_Word entsize)
-        : header_(), section_index_(0), name_(name), link_(link) {
+    Section(ElfBuilder<ElfTypes>* owner, const std::string& name,
+            Elf_Word type, Elf_Word flags, const Section* link,
+            Elf_Word info, Elf_Word align, Elf_Word entsize)
+        : OutputStream(name), owner_(owner), header_(),
+          section_index_(0), name_(name), link_(link),
+          started_(false), finished_(false), phdr_flags_(PF_R), phdr_type_(0) {
+      DCHECK_GE(align, 1u);
       header_.sh_type = type;
       header_.sh_flags = flags;
       header_.sh_info = info;
       header_.sh_addralign = align;
       header_.sh_entsize = entsize;
     }
-    virtual ~Section() {}
-
-    // Returns the size of the content of this section.  It is used to
-    // calculate file offsets of all sections before doing any writes.
-    virtual Elf_Word GetSize() const = 0;
-
-    // Write the content of this section to the given file.
-    // This must write exactly the number of bytes returned by GetSize().
-    // Offsets of all sections are known when this method is called.
-    virtual bool Write(File* elf_file) = 0;
-
-    Elf_Word GetLink() const {
-      return (link_ != nullptr) ? link_->GetSectionIndex() : 0;
-    }
-
-    const Elf_Shdr* GetHeader() const {
-      return &header_;
-    }
-
-    Elf_Shdr* GetHeader() {
-      return &header_;
-    }
 
-    Elf_Word GetSectionIndex() const {
-      DCHECK_NE(section_index_, 0u);
-      return section_index_;
-    }
-
-    void SetSectionIndex(Elf_Word section_index) {
-      section_index_ = section_index;
-    }
-
-    const std::string& GetName() const {
-      return name_;
-    }
-
-   private:
-    Elf_Shdr header_;
-    Elf_Word section_index_;
-    const std::string name_;
-    const Section* const link_;
-
-    DISALLOW_COPY_AND_ASSIGN(Section);
-  };
-
-  // Writer of .dynamic section.
-  class DynamicSection FINAL : public Section {
-   public:
-    void AddDynamicTag(Elf_Sword tag, Elf_Word value, const Section* section) {
-      DCHECK_NE(tag, static_cast<Elf_Sword>(DT_NULL));
-      dynamics_.push_back({tag, value, section});
-    }
-
-    DynamicSection(const std::string& name, Section* link)
-        : Section(name, SHT_DYNAMIC, SHF_ALLOC,
-                  link, 0, kPageSize, sizeof(Elf_Dyn)) {}
-
-    Elf_Word GetSize() const OVERRIDE {
-      return (dynamics_.size() + 1 /* DT_NULL */) * sizeof(Elf_Dyn);
-    }
-
-    bool Write(File* elf_file) OVERRIDE {
-      std::vector<Elf_Dyn> buffer;
-      buffer.reserve(dynamics_.size() + 1u);
-      for (const ElfDynamicState& it : dynamics_) {
-        if (it.section_ != nullptr) {
-          // We are adding an address relative to a section.
-          buffer.push_back(
-              {it.tag_, {it.value_ + it.section_->GetHeader()->sh_addr}});
-        } else {
-          buffer.push_back({it.tag_, {it.value_}});
-        }
+    virtual ~Section() {
+      if (started_) {
+        CHECK(finished_);
       }
-      buffer.push_back({DT_NULL, {0}});
-      return WriteArray(elf_file, buffer.data(), buffer.size());
     }
 
-   private:
-    struct ElfDynamicState {
-      Elf_Sword tag_;
-      Elf_Word value_;
-      const Section* section_;
-    };
-    std::vector<ElfDynamicState> dynamics_;
-  };
-
-  using PatchFn = void (*)(const std::vector<uintptr_t>& patch_locations,
-                           Elf_Addr buffer_address,
-                           Elf_Addr base_address,
-                           std::vector<uint8_t>* buffer);
-
-  // Section with content based on simple memory buffer.
-  // The buffer can be optionally patched before writing.
-  class RawSection FINAL : public Section {
-   public:
-    RawSection(const std::string& name, Elf_Word type, Elf_Word flags,
-               const Section* link, Elf_Word info, Elf_Word align, Elf_Word entsize,
-               PatchFn patch = nullptr, const Section* patch_base_section = nullptr)
-        : Section(name, type, flags, link, info, align, entsize),
-          patched_(false), patch_(patch), patch_base_section_(patch_base_section) {
+    // Start writing of this section.
+    void Start() {
+      CHECK(!started_);
+      CHECK(!finished_);
+      started_ = true;
+      auto& sections = owner_->sections_;
+      // Check that the previous section is complete.
+      CHECK(sections.empty() || sections.back()->finished_);
+      // The first ELF section index is 1. Index 0 is reserved for NULL.
+      section_index_ = sections.size() + 1;
+      // Push this section on the list of written sections.
+      sections.push_back(this);
+      // Align file position.
+      if (header_.sh_type != SHT_NOBITS) {
+        header_.sh_offset = RoundUp(owner_->Seek(0, kSeekCurrent), header_.sh_addralign);
+        owner_->Seek(header_.sh_offset, kSeekSet);
+      }
+      // Align virtual memory address.
+      if ((header_.sh_flags & SHF_ALLOC) != 0) {
+        header_.sh_addr = RoundUp(owner_->virtual_address_, header_.sh_addralign);
+        owner_->virtual_address_ = header_.sh_addr;
+      }
     }
 
-    RawSection(const std::string& name, Elf_Word type)
-        : RawSection(name, type, 0, nullptr, 0, 1, 0, nullptr, nullptr) {
+    // Finish writing of this section.
+    void End() {
+      CHECK(started_);
+      CHECK(!finished_);
+      finished_ = true;
+      if (header_.sh_type == SHT_NOBITS) {
+        CHECK_GT(header_.sh_size, 0u);
+      } else {
+        // Use the current file position to determine section size.
+        off_t file_offset = owner_->Seek(0, kSeekCurrent);
+        CHECK_GE(file_offset, (off_t)header_.sh_offset);
+        header_.sh_size = file_offset - header_.sh_offset;
+      }
+      if ((header_.sh_flags & SHF_ALLOC) != 0) {
+        owner_->virtual_address_ += header_.sh_size;
+      }
     }
 
-    Elf_Word GetSize() const OVERRIDE {
-      return buffer_.size();
+    // Get the location of this section in virtual memory.
+    Elf_Addr GetAddress() const {
+      CHECK(started_);
+      return header_.sh_addr;
     }
 
-    bool Write(File* elf_file) OVERRIDE {
-      if (!patch_locations_.empty()) {
-        DCHECK(!patched_);  // Do not patch twice.
-        DCHECK(patch_ != nullptr);
-        DCHECK(patch_base_section_ != nullptr);
-        patch_(patch_locations_,
-               this->GetHeader()->sh_addr,
-               patch_base_section_->GetHeader()->sh_addr,
-               &buffer_);
-        patched_ = true;
+    // Returns the size of the content of this section.
+    Elf_Word GetSize() const {
+      if (finished_) {
+        return header_.sh_size;
+      } else {
+        CHECK(started_);
+        CHECK_NE(header_.sh_type, (Elf_Word)SHT_NOBITS);
+        return owner_->Seek(0, kSeekCurrent) - header_.sh_offset;
       }
-      return WriteArray(elf_file, buffer_.data(), buffer_.size());
-    }
-
-    bool IsEmpty() const {
-      return buffer_.size() == 0;
     }
 
-    std::vector<uint8_t>* GetBuffer() {
-      return &buffer_;
+    // Set desired allocation size for .bss section.
+    void SetSize(Elf_Word size) {
+      CHECK_EQ(header_.sh_type, (Elf_Word)SHT_NOBITS);
+      header_.sh_size = size;
     }
 
-    void SetBuffer(const std::vector<uint8_t>& buffer) {
-      buffer_ = buffer;
+    // This function always succeeds to simplify code.
+    // Use builder's Good() to check the actual status.
+    bool WriteFully(const void* buffer, size_t byte_count) OVERRIDE {
+      CHECK(started_);
+      CHECK(!finished_);
+      owner_->WriteFully(buffer, byte_count);
+      return true;
     }
 
-    std::vector<uintptr_t>* GetPatchLocations() {
-      return &patch_locations_;
+    // This function always succeeds to simplify code.
+    // Use builder's Good() to check the actual status.
+    off_t Seek(off_t offset, Whence whence) OVERRIDE {
+      // Forward the seek as-is and trust the caller to use it reasonably.
+      return owner_->Seek(offset, whence);
     }
 
-   private:
-    std::vector<uint8_t> buffer_;
-    std::vector<uintptr_t> patch_locations_;
-    bool patched_;
-    // User-provided function to do the actual patching.
-    PatchFn patch_;
-    // The section that we patch against (usually .text).
-    const Section* patch_base_section_;
-  };
-
-  // Writer of .rodata section or .text section.
-  // The write is done lazily using the provided CodeOutput.
-  class OatSection FINAL : public Section {
-   public:
-    OatSection(const std::string& name, Elf_Word type, Elf_Word flags,
-               const Section* link, Elf_Word info, Elf_Word align,
-               Elf_Word entsize, Elf_Word size, CodeOutput* code_output)
-        : Section(name, type, flags, link, info, align, entsize),
-          size_(size), code_output_(code_output) {
-    }
-
-    Elf_Word GetSize() const OVERRIDE {
-      return size_;
-    }
-
-    bool Write(File* elf_file) OVERRIDE {
-      // The BufferedOutputStream class contains the buffer as field,
-      // therefore it is too big to allocate on the stack.
-      std::unique_ptr<BufferedOutputStream> output_stream(
-          new BufferedOutputStream(new FileOutputStream(elf_file)));
-      return code_output_->Write(output_stream.get());
+    Elf_Word GetSectionIndex() const {
+      DCHECK(started_);
+      DCHECK_NE(section_index_, 0u);
+      return section_index_;
     }
 
    private:
-    Elf_Word size_;
-    CodeOutput* code_output_;
-  };
-
-  // Writer of .bss section.
-  class NoBitsSection FINAL : public Section {
-   public:
-    NoBitsSection(const std::string& name, Elf_Word size)
-        : Section(name, SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
-          size_(size) {
-    }
+    ElfBuilder<ElfTypes>* owner_;
+    Elf_Shdr header_;
+    Elf_Word section_index_;
+    const std::string name_;
+    const Section* const link_;
+    bool started_;
+    bool finished_;
+    Elf_Word phdr_flags_;
+    Elf_Word phdr_type_;
 
-    Elf_Word GetSize() const OVERRIDE {
-      return size_;
-    }
+    friend class ElfBuilder;
 
-    bool Write(File* elf_file ATTRIBUTE_UNUSED) OVERRIDE {
-      LOG(ERROR) << "This section should not be written to the ELF file";
-      return false;
-    }
-
-   private:
-    Elf_Word size_;
+    DISALLOW_COPY_AND_ASSIGN(Section);
   };
 
   // Writer of .dynstr .strtab and .shstrtab sections.
-  class StrtabSection FINAL : public Section {
+  class StringSection FINAL : public Section {
    public:
-    StrtabSection(const std::string& name, Elf_Word flags, Elf_Word align)
-        : Section(name, SHT_STRTAB, flags, nullptr, 0, align, 0) {
-      buffer_.reserve(4 * KB);
-      // The first entry of strtab must be empty string.
-      buffer_ += '\0';
+    StringSection(ElfBuilder<ElfTypes>* owner, const std::string& name,
+                  Elf_Word flags, Elf_Word align)
+        : Section(owner, name, SHT_STRTAB, flags, nullptr, 0, align, 0),
+          current_offset_(0) {
     }
 
-    Elf_Word AddName(const std::string& name) {
-      Elf_Word offset = buffer_.size();
-      buffer_ += name;
-      buffer_ += '\0';
+    Elf_Word Write(const std::string& name) {
+      if (current_offset_ == 0) {
+        DCHECK(name.empty());
+      }
+      Elf_Word offset = current_offset_;
+      this->WriteFully(name.c_str(), name.length() + 1);
+      current_offset_ += name.length() + 1;
       return offset;
     }
 
-    Elf_Word GetSize() const OVERRIDE {
-      return buffer_.size();
-    }
-
-    bool Write(File* elf_file) OVERRIDE {
-      return WriteArray(elf_file, buffer_.data(), buffer_.size());
-    }
-
    private:
-    std::string buffer_;
+    Elf_Word current_offset_;
   };
 
-  class HashSection;
-
   // Writer of .dynsym and .symtab sections.
-  class SymtabSection FINAL : public Section {
+  class SymbolSection FINAL : public Section {
    public:
-    // Add a symbol with given name to this symtab. The symbol refers to
-    // 'relative_addr' within the given section and has the given attributes.
-    void AddSymbol(const std::string& name, const Section* section,
-                   Elf_Addr addr, bool is_relative, Elf_Word size,
-                   uint8_t binding, uint8_t type, uint8_t other = 0) {
-      CHECK(section != nullptr);
-      Elf_Word name_idx = strtab_->AddName(name);
-      symbols_.push_back({ name, section, addr, size, is_relative,
-                           MakeStInfo(binding, type), other, name_idx });
+    SymbolSection(ElfBuilder<ElfTypes>* owner, const std::string& name,
+                  Elf_Word type, Elf_Word flags, StringSection* strtab)
+        : Section(owner, name, type, flags, strtab, 0,
+                  sizeof(Elf_Off), sizeof(Elf_Sym)) {
     }
 
-    SymtabSection(const std::string& name, Elf_Word type, Elf_Word flags,
-                  StrtabSection* strtab)
-        : Section(name, type, flags, strtab, 0, sizeof(Elf_Off), sizeof(Elf_Sym)),
-          strtab_(strtab) {
-    }
-
-    bool IsEmpty() const {
-      return symbols_.empty();
+    // Buffer symbol for this section.  It will be written later.
+    void Add(Elf_Word name, const Section* section,
+             Elf_Addr addr, bool is_relative, Elf_Word size,
+             uint8_t binding, uint8_t type, uint8_t other = 0) {
+      CHECK(section != nullptr);
+      Elf_Sym sym = Elf_Sym();
+      sym.st_name = name;
+      sym.st_value = addr + (is_relative ? section->GetAddress() : 0);
+      sym.st_size = size;
+      sym.st_other = other;
+      sym.st_shndx = section->GetSectionIndex();
+      sym.st_info = (binding << 4) + (type & 0xf);
+      symbols_.push_back(sym);
     }
 
-    Elf_Word GetSize() const OVERRIDE {
-      return (1 /* NULL */ + symbols_.size()) * sizeof(Elf_Sym);
-    }
-
-    bool Write(File* elf_file) OVERRIDE {
-      std::vector<Elf_Sym> buffer;
-      buffer.reserve(1u + symbols_.size());
-      buffer.push_back(Elf_Sym());  // NULL.
-      for (const ElfSymbolState& it : symbols_) {
-        Elf_Sym sym = Elf_Sym();
-        sym.st_name = it.name_idx_;
-        if (it.is_relative_) {
-          sym.st_value = it.addr_ + it.section_->GetHeader()->sh_addr;
-        } else {
-          sym.st_value = it.addr_;
-        }
-        sym.st_size = it.size_;
-        sym.st_other = it.other_;
-        sym.st_shndx = it.section_->GetSectionIndex();
-        sym.st_info = it.info_;
-        buffer.push_back(sym);
-      }
-      return WriteArray(elf_file, buffer.data(), buffer.size());
+    void Write() {
+      // The symbol table always has to start with the NULL symbol.
+      Elf_Sym null_symbol = Elf_Sym();
+      this->WriteFully(&null_symbol, sizeof(null_symbol));
+      this->WriteFully(symbols_.data(), symbols_.size() * sizeof(symbols_[0]));
+      symbols_.clear();
+      symbols_.shrink_to_fit();
     }
 
    private:
-    struct ElfSymbolState {
-      const std::string name_;
-      const Section* section_;
-      Elf_Addr addr_;
-      Elf_Word size_;
-      bool is_relative_;
-      uint8_t info_;
-      uint8_t other_;
-      Elf_Word name_idx_;  // index in the strtab.
-    };
-
-    static inline constexpr uint8_t MakeStInfo(uint8_t binding, uint8_t type) {
-      return ((binding) << 4) + ((type) & 0xf);
-    }
-
-    // The symbols in the same order they will be in the symbol table.
-    std::vector<ElfSymbolState> symbols_;
-    StrtabSection* strtab_;
-
-    friend class HashSection;
+    std::vector<Elf_Sym> symbols_;
   };
 
-  // TODO: Consider removing.
-  // We use it only for the dynsym section which has only 5 symbols.
-  // We do not use it for symtab, and we probably do not have to
-  // since we use those symbols only to print backtraces.
-  class HashSection FINAL : public Section {
-   public:
-    HashSection(const std::string& name, Elf_Word flags, SymtabSection* symtab)
-        : Section(name, SHT_HASH, flags, symtab,
-                  0, sizeof(Elf_Word), sizeof(Elf_Word)),
-          symtab_(symtab) {
-    }
-
-    Elf_Word GetSize() const OVERRIDE {
-      Elf_Word nbuckets = GetNumBuckets();
-      Elf_Word chain_size = symtab_->symbols_.size() + 1 /* NULL */;
-      return (2 /* header */ + nbuckets + chain_size) * sizeof(Elf_Word);
-    }
-
-    bool Write(File* const elf_file) OVERRIDE {
-      // Here is how The ELF hash table works.
-      // There are 3 arrays to worry about.
-      // * The symbol table where the symbol information is.
-      // * The bucket array which is an array of indexes into the symtab and chain.
-      // * The chain array which is also an array of indexes into the symtab and chain.
-      //
-      // Lets say the state is something like this.
-      // +--------+       +--------+      +-----------+
-      // | symtab |       | bucket |      |   chain   |
-      // |  null  |       | 1      |      | STN_UNDEF |
-      // | <sym1> |       | 4      |      | 2         |
-      // | <sym2> |       |        |      | 5         |
-      // | <sym3> |       |        |      | STN_UNDEF |
-      // | <sym4> |       |        |      | 3         |
-      // | <sym5> |       |        |      | STN_UNDEF |
-      // +--------+       +--------+      +-----------+
-      //
-      // The lookup process (in python psudocode) is
-      //
-      // def GetSym(name):
-      //     # NB STN_UNDEF == 0
-      //     indx = bucket[elfhash(name) % num_buckets]
-      //     while indx != STN_UNDEF:
-      //         if GetSymbolName(symtab[indx]) == name:
-      //             return symtab[indx]
-      //         indx = chain[indx]
-      //     return SYMBOL_NOT_FOUND
-      //
-      // Between bucket and chain arrays every symtab index must be present exactly
-      // once (except for STN_UNDEF, which must be present 1 + num_bucket times).
-      const auto& symbols = symtab_->symbols_;
-      // Select number of buckets.
-      // This is essentially arbitrary.
-      Elf_Word nbuckets = GetNumBuckets();
-      // 1 is for the implicit NULL symbol.
-      Elf_Word chain_size = (symbols.size() + 1);
-      std::vector<Elf_Word> hash;
-      hash.push_back(nbuckets);
-      hash.push_back(chain_size);
-      uint32_t bucket_offset = hash.size();
-      uint32_t chain_offset = bucket_offset + nbuckets;
-      hash.resize(hash.size() + nbuckets + chain_size, 0);
-
-      Elf_Word* buckets = hash.data() + bucket_offset;
-      Elf_Word* chain   = hash.data() + chain_offset;
-
-      // Set up the actual hash table.
-      for (Elf_Word i = 0; i < symbols.size(); i++) {
-        // Add 1 since we need to have the null symbol that is not in the symbols
-        // list.
-        Elf_Word index = i + 1;
-        Elf_Word hash_val = static_cast<Elf_Word>(elfhash(symbols[i].name_.c_str())) % nbuckets;
-        if (buckets[hash_val] == 0) {
-          buckets[hash_val] = index;
-        } else {
-          hash_val = buckets[hash_val];
-          CHECK_LT(hash_val, chain_size);
-          while (chain[hash_val] != 0) {
-            hash_val = chain[hash_val];
-            CHECK_LT(hash_val, chain_size);
-          }
-          chain[hash_val] = index;
-          // Check for loops. Works because if this is non-empty then there must be
-          // another cell which already contains the same symbol index as this one,
-          // which means some symbol has more then one name, which isn't allowed.
-          CHECK_EQ(chain[index], static_cast<Elf_Word>(0));
-        }
-      }
-      return WriteArray(elf_file, hash.data(), hash.size());
-    }
-
-   private:
-    Elf_Word GetNumBuckets() const {
-      const auto& symbols = symtab_->symbols_;
-      // Have about 32 ids per bucket.
-      return 1 + symbols.size()/32;
-    }
-
-    // from bionic
-    static inline unsigned elfhash(const char *_name) {
-      const unsigned char *name = (const unsigned char *) _name;
-      unsigned h = 0, g;
+  ElfBuilder(InstructionSet isa, OutputStream* output)
+    : isa_(isa),
+      output_(output),
+      output_good_(true),
+      output_offset_(0),
+      rodata_(this, ".rodata", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
+      text_(this, ".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, nullptr, 0, kPageSize, 0),
+      bss_(this, ".bss", SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
+      dynstr_(this, ".dynstr", SHF_ALLOC, kPageSize),
+      dynsym_(this, ".dynsym", SHT_DYNSYM, SHF_ALLOC, &dynstr_),
+      hash_(this, ".hash", SHT_HASH, SHF_ALLOC, &dynsym_, 0, sizeof(Elf_Word), sizeof(Elf_Word)),
+      dynamic_(this, ".dynamic", SHT_DYNAMIC, SHF_ALLOC, &dynstr_, 0, kPageSize, sizeof(Elf_Dyn)),
+      eh_frame_(this, ".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
+      eh_frame_hdr_(this, ".eh_frame_hdr", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0),
+      strtab_(this, ".strtab", 0, kPageSize),
+      symtab_(this, ".symtab", SHT_SYMTAB, 0, &strtab_),
+      debug_frame_(this, ".debug_frame", SHT_PROGBITS, 0, nullptr, 0, sizeof(Elf_Addr), 0),
+      debug_info_(this, ".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0),
+      debug_line_(this, ".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0),
+      shstrtab_(this, ".shstrtab", 0, 1),
+      virtual_address_(0) {
+    text_.phdr_flags_ = PF_R | PF_X;
+    bss_.phdr_flags_ = PF_R | PF_W;
+    dynamic_.phdr_flags_ = PF_R | PF_W;
+    dynamic_.phdr_type_ = PT_DYNAMIC;
+    eh_frame_hdr_.phdr_type_ = PT_GNU_EH_FRAME;
+  }
+  ~ElfBuilder() {}
 
-      while (*name) {
-        h = (h << 4) + *name++;
-        g = h & 0xf0000000;
-        h ^= g;
-        h ^= g >> 24;
-      }
-      return h;
+  InstructionSet GetIsa() { return isa_; }
+  Section* GetRoData() { return &rodata_; }
+  Section* GetText() { return &text_; }
+  Section* GetBss() { return &bss_; }
+  StringSection* GetStrTab() { return &strtab_; }
+  SymbolSection* GetSymTab() { return &symtab_; }
+  Section* GetEhFrame() { return &eh_frame_; }
+  Section* GetEhFrameHdr() { return &eh_frame_hdr_; }
+  Section* GetDebugFrame() { return &debug_frame_; }
+  Section* GetDebugInfo() { return &debug_info_; }
+  Section* GetDebugLine() { return &debug_line_; }
+
+  // Encode patch locations as a LEB128 list of deltas between consecutive addresses.
+  // (exposed publicly for tests)
+  static void EncodeOatPatches(const std::vector<uintptr_t>& locations,
+                               std::vector<uint8_t>* buffer) {
+    buffer->reserve(buffer->size() + locations.size() * 2);  // guess 2 bytes per ULEB128.
+    uintptr_t address = 0;  // relative to start of section.
+    for (uintptr_t location : locations) {
+      DCHECK_GE(location, address) << "Patch locations are not in sorted order";
+      EncodeUnsignedLeb128(buffer, dchecked_integral_cast<uint32_t>(location - address));
+      address = location;
     }
+  }
 
-    SymtabSection* symtab_;
-
-    DISALLOW_COPY_AND_ASSIGN(HashSection);
-  };
+  void WritePatches(const char* name, const std::vector<uintptr_t>* patch_locations) {
+    std::vector<uint8_t> buffer;
+    EncodeOatPatches(*patch_locations, &buffer);
+    std::unique_ptr<Section> s(new Section(this, name, SHT_OAT_PATCH, 0, nullptr, 0, 1, 0));
+    s->Start();
+    s->WriteFully(buffer.data(), buffer.size());
+    s->End();
+    other_sections_.push_back(std::move(s));
+  }
 
-  ElfBuilder(InstructionSet isa,
-             Elf_Word rodata_size, CodeOutput* rodata_writer,
-             Elf_Word text_size, CodeOutput* text_writer,
-             Elf_Word bss_size)
-    : isa_(isa),
-      dynstr_(".dynstr", SHF_ALLOC, kPageSize),
-      dynsym_(".dynsym", SHT_DYNSYM, SHF_ALLOC, &dynstr_),
-      hash_(".hash", SHF_ALLOC, &dynsym_),
-      rodata_(".rodata", SHT_PROGBITS, SHF_ALLOC,
-              nullptr, 0, kPageSize, 0, rodata_size, rodata_writer),
-      text_(".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR,
-            nullptr, 0, kPageSize, 0, text_size, text_writer),
-      bss_(".bss", bss_size),
-      dynamic_(".dynamic", &dynstr_),
-      strtab_(".strtab", 0, kPageSize),
-      symtab_(".symtab", SHT_SYMTAB, 0, &strtab_),
-      shstrtab_(".shstrtab", 0, 1) {
+  void WriteSection(const char* name, const std::vector<uint8_t>* buffer) {
+    std::unique_ptr<Section> s(new Section(this, name, SHT_PROGBITS, 0, nullptr, 0, 1, 0));
+    s->Start();
+    s->WriteFully(buffer->data(), buffer->size());
+    s->End();
+    other_sections_.push_back(std::move(s));
   }
-  ~ElfBuilder() {}
 
-  OatSection* GetText() { return &text_; }
-  SymtabSection* GetSymtab() { return &symtab_; }
-
-  bool Write(File* elf_file) {
-    // Since the .text section of an oat file contains relative references to .rodata
-    // and (optionally) .bss, we keep these 2 or 3 sections together. This creates
-    // a non-traditional layout where the .bss section is mapped independently of the
-    // .dynamic section and needs its own program header with LOAD RW.
-    //
-    // The basic layout of the elf file. Order may be different in final output.
-    // +-------------------------+
-    // | Elf_Ehdr                |
-    // +-------------------------+
-    // | Elf_Phdr PHDR           |
-    // | Elf_Phdr LOAD R         | .dynsym .dynstr .hash .rodata
-    // | Elf_Phdr LOAD R X       | .text
-    // | Elf_Phdr LOAD RW        | .bss (Optional)
-    // | Elf_Phdr LOAD RW        | .dynamic
-    // | Elf_Phdr DYNAMIC        | .dynamic
-    // | Elf_Phdr LOAD R         | .eh_frame .eh_frame_hdr
-    // | Elf_Phdr EH_FRAME R     | .eh_frame_hdr
-    // +-------------------------+
-    // | .dynsym                 |
-    // | Elf_Sym  STN_UNDEF      |
-    // | Elf_Sym  oatdata        |
-    // | Elf_Sym  oatexec        |
-    // | Elf_Sym  oatlastword    |
-    // | Elf_Sym  oatbss         | (Optional)
-    // | Elf_Sym  oatbsslastword | (Optional)
-    // +-------------------------+
-    // | .dynstr                 |
-    // | names for .dynsym       |
-    // +-------------------------+
-    // | .hash                   |
-    // | hashtable for dynsym    |
-    // +-------------------------+
-    // | .rodata                 |
-    // | oatdata..oatexec-4      |
-    // +-------------------------+
-    // | .text                   |
-    // | oatexec..oatlastword    |
-    // +-------------------------+
-    // | .dynamic                |
-    // | Elf_Dyn DT_HASH         |
-    // | Elf_Dyn DT_STRTAB       |
-    // | Elf_Dyn DT_SYMTAB       |
-    // | Elf_Dyn DT_SYMENT       |
-    // | Elf_Dyn DT_STRSZ        |
-    // | Elf_Dyn DT_SONAME       |
-    // | Elf_Dyn DT_NULL         |
-    // +-------------------------+  (Optional)
-    // | .symtab                 |  (Optional)
-    // | program symbols         |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .strtab                 |  (Optional)
-    // | names for .symtab       |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .eh_frame               |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .eh_frame_hdr           |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .debug_info             |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .debug_abbrev           |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .debug_str              |  (Optional)
-    // +-------------------------+  (Optional)
-    // | .debug_line             |  (Optional)
-    // +-------------------------+
-    // | .shstrtab               |
-    // | names of sections       |
-    // +-------------------------+
-    // | Elf_Shdr null           |
-    // | Elf_Shdr .dynsym        |
-    // | Elf_Shdr .dynstr        |
-    // | Elf_Shdr .hash          |
-    // | Elf_Shdr .rodata        |
-    // | Elf_Shdr .text          |
-    // | Elf_Shdr .bss           |  (Optional)
-    // | Elf_Shdr .dynamic       |
-    // | Elf_Shdr .symtab        |  (Optional)
-    // | Elf_Shdr .strtab        |  (Optional)
-    // | Elf_Shdr .eh_frame      |  (Optional)
-    // | Elf_Shdr .eh_frame_hdr  |  (Optional)
-    // | Elf_Shdr .debug_info    |  (Optional)
-    // | Elf_Shdr .debug_abbrev  |  (Optional)
-    // | Elf_Shdr .debug_str     |  (Optional)
-    // | Elf_Shdr .debug_line    |  (Optional)
-    // | Elf_Shdr .oat_patches   |  (Optional)
-    // | Elf_Shdr .shstrtab      |
-    // +-------------------------+
-    constexpr bool debug_logging_ = false;
-
-    // Create a list of all section which we want to write.
-    // This is the order in which they will be written.
-    std::vector<Section*> sections;
-    sections.push_back(&rodata_);
-    // Need to write text to update checksum of header even if it is empty.
-    sections.push_back(&text_);
-    if (bss_.GetSize() != 0u) {
-      sections.push_back(&bss_);
-    }
-    sections.push_back(&dynstr_);
-    sections.push_back(&dynsym_);
-    sections.push_back(&hash_);
-    sections.push_back(&dynamic_);
-    if (!symtab_.IsEmpty()) {
-      sections.push_back(&strtab_);
-      sections.push_back(&symtab_);
-    }
-    for (Section* section : other_sections_) {
-      sections.push_back(section);
-    }
-    sections.push_back(&shstrtab_);
-    for (size_t i = 0; i < sections.size(); i++) {
-      // The first section index is 1.  Index 0 is reserved for NULL.
-      // Section index is used for relative symbols and for section links.
-      sections[i]->SetSectionIndex(i + 1);
-      // Add section name to .shstrtab.
-      Elf_Word name_offset = shstrtab_.AddName(sections[i]->GetName());
-      sections[i]->GetHeader()->sh_name = name_offset;
-    }
+  void Start() {
+    // Reserve space for ELF header and program headers.
+    // We do not know the number of headers until later, so
+    // it is easiest to just reserve a fixed amount of space.
+    int size = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * kMaxProgramHeaders;
+    Seek(size, kSeekSet);
+    virtual_address_ += size;
+  }
 
-    // The running program does not have access to section headers
-    // and the loader is not supposed to use them either.
-    // The dynamic sections therefore replicates some of the layout
-    // information like the address and size of .rodata and .text.
-    // It also contains other metadata like the SONAME.
-    // The .dynamic section is found using the PT_DYNAMIC program header.
-    BuildDynsymSection();
-    BuildDynamicSection(elf_file->GetPath());
-
-    // We do not know the number of headers until the final stages of write.
-    // It is easiest to just reserve a fixed amount of space for them.
-    constexpr size_t kMaxProgramHeaders = 16;
-    constexpr size_t kProgramHeadersOffset = sizeof(Elf_Ehdr);
-
-    // Layout of all sections - determine the final file offsets and addresses.
-    // This must be done after we have built all sections and know their size.
-    Elf_Off file_offset = kProgramHeadersOffset + sizeof(Elf_Phdr) * kMaxProgramHeaders;
-    Elf_Addr load_address = file_offset;
-    std::vector<Elf_Shdr> section_headers;
-    section_headers.reserve(1u + sections.size());
-    section_headers.push_back(Elf_Shdr());  // NULL at index 0.
-    for (auto* section : sections) {
-      Elf_Shdr* header = section->GetHeader();
-      Elf_Off alignment = header->sh_addralign > 0 ? header->sh_addralign : 1;
-      header->sh_size = section->GetSize();
-      header->sh_link = section->GetLink();
-      // Allocate memory for the section in the file.
-      if (header->sh_type != SHT_NOBITS) {
-        header->sh_offset = RoundUp(file_offset, alignment);
-        file_offset = header->sh_offset + header->sh_size;
-      }
-      // Allocate memory for the section during program execution.
-      if ((header->sh_flags & SHF_ALLOC) != 0) {
-        header->sh_addr = RoundUp(load_address, alignment);
-        load_address = header->sh_addr + header->sh_size;
+  void End() {
+    // Write section names and finish the section headers.
+    shstrtab_.Start();
+    shstrtab_.Write("");
+    for (auto* section : sections_) {
+      section->header_.sh_name = shstrtab_.Write(section->name_);
+      if (section->link_ != nullptr) {
+        section->header_.sh_link = section->link_->GetSectionIndex();
       }
-      if (debug_logging_) {
-        LOG(INFO) << "Section " << section->GetName() << ":" << std::hex
-                  << " offset=0x" << header->sh_offset
-                  << " addr=0x" << header->sh_addr
-                  << " size=0x" << header->sh_size;
-      }
-      // Collect section headers into continuous array for convenience.
-      section_headers.push_back(*header);
-    }
-    Elf_Off section_headers_offset = RoundUp(file_offset, sizeof(Elf_Off));
-
-    // Create program headers now that we know the layout of the whole file.
-    // Each segment contains one or more sections which are mapped together.
-    // Not all sections are mapped during the execution of the program.
-    // PT_LOAD does the mapping.  Other PT_* types allow the program to locate
-    // interesting parts of memory and their addresses overlap with PT_LOAD.
-    std::vector<Elf_Phdr> program_headers;
-    program_headers.push_back(Elf_Phdr());  // Placeholder for PT_PHDR.
-    // Create the main LOAD R segment which spans all sections up to .rodata.
-    const Elf_Shdr* rodata = rodata_.GetHeader();
-    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R,
-      0, rodata->sh_offset + rodata->sh_size, rodata->sh_addralign));
-    if (text_.GetHeader()->sh_size != 0u) {
-      program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_X, text_));
-    }
-    if (bss_.GetHeader()->sh_size != 0u) {
-      program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_W, bss_));
     }
-    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R, dynstr_));
-    int dynstr_dynsym_hash_size = hash_.GetHeader()->sh_offset +
-      hash_.GetHeader()->sh_size - dynstr_.GetHeader()->sh_offset;
-    program_headers.back().p_filesz = dynstr_dynsym_hash_size;
-    program_headers.back().p_memsz  = dynstr_dynsym_hash_size;
-    program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R | PF_W, dynamic_));
-    program_headers.push_back(MakeProgramHeader(PT_DYNAMIC, PF_R | PF_W, dynamic_));
-    const Section* eh_frame = FindSection(".eh_frame");
-    if (eh_frame != nullptr) {
-      program_headers.push_back(MakeProgramHeader(PT_LOAD, PF_R, *eh_frame));
-      const Section* eh_frame_hdr = FindSection(".eh_frame_hdr");
-      if (eh_frame_hdr != nullptr) {
-        // Check layout: eh_frame is before eh_frame_hdr and there is no gap.
-        CHECK_LE(eh_frame->GetHeader()->sh_offset, eh_frame_hdr->GetHeader()->sh_offset);
-        CHECK_EQ(eh_frame->GetHeader()->sh_offset + eh_frame->GetHeader()->sh_size,
-                 eh_frame_hdr->GetHeader()->sh_offset);
-        // Extend the PT_LOAD of .eh_frame to include the .eh_frame_hdr as well.
-        program_headers.back().p_filesz += eh_frame_hdr->GetHeader()->sh_size;
-        program_headers.back().p_memsz  += eh_frame_hdr->GetHeader()->sh_size;
-        program_headers.push_back(MakeProgramHeader(PT_GNU_EH_FRAME, PF_R, *eh_frame_hdr));
-      }
+    shstrtab_.End();
+
+    // Write section headers at the end of the ELF file.
+    std::vector<Elf_Shdr> shdrs;
+    shdrs.reserve(1u + sections_.size());
+    shdrs.push_back(Elf_Shdr());  // NULL at index 0.
+    for (auto* section : sections_) {
+      shdrs.push_back(section->header_);
     }
-    DCHECK_EQ(program_headers[0].p_type, 0u);  // Check placeholder.
-    program_headers[0] = MakeProgramHeader(PT_PHDR, PF_R,
-      kProgramHeadersOffset, program_headers.size() * sizeof(Elf_Phdr), sizeof(Elf_Off));
-    CHECK_LE(program_headers.size(), kMaxProgramHeaders);
+    Elf_Off section_headers_offset;
+    section_headers_offset = RoundUp(Seek(0, kSeekCurrent), sizeof(Elf_Off));
+    Seek(section_headers_offset, kSeekSet);
+    WriteFully(shdrs.data(), shdrs.size() * sizeof(shdrs[0]));
 
-    // Create the main ELF header.
+    // Write the initial file headers.
+    std::vector<Elf_Phdr> phdrs = MakeProgramHeaders();
     Elf_Ehdr elf_header = MakeElfHeader(isa_);
-    elf_header.e_phoff = kProgramHeadersOffset;
+    elf_header.e_phoff = sizeof(Elf_Ehdr);
     elf_header.e_shoff = section_headers_offset;
-    elf_header.e_phnum = program_headers.size();
-    elf_header.e_shnum = section_headers.size();
+    elf_header.e_phnum = phdrs.size();
+    elf_header.e_shnum = shdrs.size();
     elf_header.e_shstrndx = shstrtab_.GetSectionIndex();
+    Seek(0, kSeekSet);
+    WriteFully(&elf_header, sizeof(elf_header));
+    WriteFully(phdrs.data(), phdrs.size() * sizeof(phdrs[0]));
+  }
 
-    // Write all headers and section content to the file.
-    // Depending on the implementations of Section::Write, this
-    // might be just memory copies or some more elaborate operations.
-    if (!WriteArray(elf_file, &elf_header, 1)) {
-      LOG(INFO) << "Failed to write the ELF header";
-      return false;
-    }
-    if (!WriteArray(elf_file, program_headers.data(), program_headers.size())) {
-      LOG(INFO) << "Failed to write the program headers";
-      return false;
-    }
-    for (Section* section : sections) {
-      const Elf_Shdr* header = section->GetHeader();
-      if (header->sh_type != SHT_NOBITS) {
-        if (!SeekTo(elf_file, header->sh_offset) || !section->Write(elf_file)) {
-          LOG(INFO) << "Failed to write section " << section->GetName();
-          return false;
-        }
-        Elf_Word current_offset = lseek(elf_file->Fd(), 0, SEEK_CUR);
-        CHECK_EQ(current_offset, header->sh_offset + header->sh_size)
-          << "The number of bytes written does not match GetSize()";
-      }
-    }
-    if (!SeekTo(elf_file, section_headers_offset) ||
-        !WriteArray(elf_file, section_headers.data(), section_headers.size())) {
-      LOG(INFO) << "Failed to write the section headers";
-      return false;
+  // The running program does not have access to section headers
+  // and the loader is not supposed to use them either.
+  // The dynamic section therefore replicates some of the layout
+  // information like the address and size of .rodata and .text.
+  // It also contains other metadata like the SONAME.
+  // The .dynamic section is found using the PT_DYNAMIC program header.
+  void WriteDynamicSection(const std::string& elf_file_path) {
+    std::string soname(elf_file_path);
+    size_t directory_separator_pos = soname.rfind('/');
+    if (directory_separator_pos != std::string::npos) {
+      soname = soname.substr(directory_separator_pos + 1);
     }
-    return true;
-  }
 
-  // Adds the given section to the builder.  It does not take ownership.
-  void RegisterSection(Section* section) {
-    other_sections_.push_back(section);
+    dynstr_.Start();
+    dynstr_.Write("");  // dynstr should start with empty string.
+    dynsym_.Add(dynstr_.Write("oatdata"), &rodata_, 0, true,
+                rodata_.GetSize(), STB_GLOBAL, STT_OBJECT);
+    if (text_.GetSize() != 0u) {
+      dynsym_.Add(dynstr_.Write("oatexec"), &text_, 0, true,
+                  text_.GetSize(), STB_GLOBAL, STT_OBJECT);
+      dynsym_.Add(dynstr_.Write("oatlastword"), &text_, text_.GetSize() - 4,
+                  true, 4, STB_GLOBAL, STT_OBJECT);
+    } else if (rodata_.GetSize() != 0) {
+      // rodata_ can be size 0 for dwarf_test.
+      dynsym_.Add(dynstr_.Write("oatlastword"), &rodata_, rodata_.GetSize() - 4,
+                  true, 4, STB_GLOBAL, STT_OBJECT);
+    }
+    if (bss_.finished_) {
+      dynsym_.Add(dynstr_.Write("oatbss"), &bss_,
+                  0, true, bss_.GetSize(), STB_GLOBAL, STT_OBJECT);
+      dynsym_.Add(dynstr_.Write("oatbsslastword"), &bss_,
+                  bss_.GetSize() - 4, true, 4, STB_GLOBAL, STT_OBJECT);
+    }
+    Elf_Word soname_offset = dynstr_.Write(soname);
+    dynstr_.End();
+
+    dynsym_.Start();
+    dynsym_.Write();
+    dynsym_.End();
+
+    // We do not really need a hash-table since there are so few entries.
+    // However, the hash-table is the only way the linker can actually
+    // determine the number of symbols in .dynsym so it is required.
+    hash_.Start();
+    int count = dynsym_.GetSize() / sizeof(Elf_Sym);  // Includes NULL.
+    std::vector<Elf_Word> hash;
+    hash.push_back(1);  // Number of buckets.
+    hash.push_back(count);  // Number of chains.
+    // Buckets.  Having just one makes it a linear search.
+    hash.push_back(1);  // Point to first non-NULL symbol.
+    // Chains.  This creates a linked list of symbols.
+    hash.push_back(0);  // Dummy entry for the NULL symbol.
+    for (int i = 1; i < count - 1; i++) {
+      hash.push_back(i + 1);  // Each symbol points to the next one.
+    }
+    hash.push_back(0);  // Last symbol terminates the chain.
+    hash_.WriteFully(hash.data(), hash.size() * sizeof(hash[0]));
+    hash_.End();
+
+    dynamic_.Start();
+    Elf_Dyn dyns[] = {
+      { DT_HASH, { hash_.GetAddress() } },
+      { DT_STRTAB, { dynstr_.GetAddress() } },
+      { DT_SYMTAB, { dynsym_.GetAddress() } },
+      { DT_SYMENT, { sizeof(Elf_Sym) } },
+      { DT_STRSZ, { dynstr_.GetSize() } },
+      { DT_SONAME, { soname_offset } },
+      { DT_NULL, { 0 } },
+    };
+    dynamic_.WriteFully(&dyns, sizeof(dyns));
+    dynamic_.End();
   }
 
-  const Section* FindSection(const char* name) {
-    for (const auto* section : other_sections_) {
-      if (section->GetName() == name) {
-        return section;
-      }
-    }
-    return nullptr;
+  // Returns true if all writes and seeks on the output stream succeeded.
+  bool Good() {
+    return output_good_;
   }
 
  private:
-  static bool SeekTo(File* elf_file, Elf_Word offset) {
-    DCHECK_LE(lseek(elf_file->Fd(), 0, SEEK_CUR), static_cast<off_t>(offset))
-      << "Seeking backwards";
-    if (static_cast<off_t>(offset) != lseek(elf_file->Fd(), offset, SEEK_SET)) {
-      PLOG(ERROR) << "Failed to seek in file " << elf_file->GetPath();
-      return false;
+  // This function always succeeds to simplify code.
+  // Use Good() to check the actual status of the output stream.
+  void WriteFully(const void* buffer, size_t byte_count) {
+    if (output_good_) {
+      if (!output_->WriteFully(buffer, byte_count)) {
+        PLOG(ERROR) << "Failed to write " << byte_count
+                    << " bytes to ELF file at offset " << output_offset_;
+        output_good_ = false;
+      }
     }
-    return true;
+    output_offset_ += byte_count;
   }
 
-  template<typename T>
-  static bool WriteArray(File* elf_file, const T* data, size_t count) {
-    if (count != 0) {
-      DCHECK(data != nullptr);
-      if (!elf_file->WriteFully(data, count * sizeof(T))) {
-        PLOG(ERROR) << "Failed to write to file " << elf_file->GetPath();
-        return false;
+  // Never reports failure to the caller; the first error is latched in output_good_.
+  // Use Good() to check the actual status of the output stream.
+  off_t Seek(off_t offset, Whence whence) {
+    // We keep a shadow copy of the offset so that we return
+    // the expected value even if the output stream failed.
+    off_t new_offset;
+    switch (whence) {
+      case kSeekSet:
+        new_offset = offset;
+        break;
+      case kSeekCurrent:
+        new_offset = output_offset_ + offset;
+        break;
+      default:
+        LOG(FATAL) << "Unsupported seek type: " << whence;
+        UNREACHABLE();
+    }
+    if (output_good_) {
+      off_t actual_offset = output_->Seek(offset, whence);
+      if (actual_offset == static_cast<off_t>(-1)) {
+        PLOG(ERROR) << "Failed to seek in ELF file. Offset=" << offset
+                    << " whence=" << whence << " new_offset=" << new_offset;
+        output_good_ = false;
       }
+      DCHECK(!output_good_ || actual_offset == new_offset);  // Skip after a failed seek.
     }
-    return true;
-  }
-
-  // Helper - create segment header based on memory range.
-  static Elf_Phdr MakeProgramHeader(Elf_Word type, Elf_Word flags,
-                                    Elf_Off offset, Elf_Word size, Elf_Word align) {
-    Elf_Phdr phdr = Elf_Phdr();
-    phdr.p_type    = type;
-    phdr.p_flags   = flags;
-    phdr.p_offset  = offset;
-    phdr.p_vaddr   = offset;
-    phdr.p_paddr   = offset;
-    phdr.p_filesz  = size;
-    phdr.p_memsz   = size;
-    phdr.p_align   = align;
-    return phdr;
-  }
-
-  // Helper - create segment header based on section header.
-  static Elf_Phdr MakeProgramHeader(Elf_Word type, Elf_Word flags,
-                                    const Section& section) {
-    const Elf_Shdr* shdr = section.GetHeader();
-    // Only run-time allocated sections should be in segment headers.
-    CHECK_NE(shdr->sh_flags & SHF_ALLOC, 0u);
-    Elf_Phdr phdr = Elf_Phdr();
-    phdr.p_type   = type;
-    phdr.p_flags  = flags;
-    phdr.p_offset = shdr->sh_offset;
-    phdr.p_vaddr  = shdr->sh_addr;
-    phdr.p_paddr  = shdr->sh_addr;
-    phdr.p_filesz = shdr->sh_type != SHT_NOBITS ? shdr->sh_size : 0u;
-    phdr.p_memsz  = shdr->sh_size;
-    phdr.p_align  = shdr->sh_addralign;
-    return phdr;
+    output_offset_ = new_offset;
+    return new_offset;
   }
 
   static Elf_Ehdr MakeElfHeader(InstructionSet isa) {
@@ -869,6 +553,10 @@ class ElfBuilder FINAL {
       }
       case kNone: {
         LOG(FATAL) << "No instruction set";
+        break;
+      }
+      default: {
+        LOG(FATAL) << "Unknown instruction set " << isa;
       }
     }
 
@@ -892,56 +580,112 @@ class ElfBuilder FINAL {
     return elf_header;
   }
 
-  void BuildDynamicSection(const std::string& elf_file_path) {
-    std::string soname(elf_file_path);
-    size_t directory_separator_pos = soname.rfind('/');
-    if (directory_separator_pos != std::string::npos) {
-      soname = soname.substr(directory_separator_pos + 1);
-    }
-    // NB: We must add the name before adding DT_STRSZ.
-    Elf_Word soname_offset = dynstr_.AddName(soname);
-
-    dynamic_.AddDynamicTag(DT_HASH, 0, &hash_);
-    dynamic_.AddDynamicTag(DT_STRTAB, 0, &dynstr_);
-    dynamic_.AddDynamicTag(DT_SYMTAB, 0, &dynsym_);
-    dynamic_.AddDynamicTag(DT_SYMENT, sizeof(Elf_Sym), nullptr);
-    dynamic_.AddDynamicTag(DT_STRSZ, dynstr_.GetSize(), nullptr);
-    dynamic_.AddDynamicTag(DT_SONAME, soname_offset, nullptr);
-  }
-
-  void BuildDynsymSection() {
-    dynsym_.AddSymbol("oatdata", &rodata_, 0, true,
-                      rodata_.GetSize(), STB_GLOBAL, STT_OBJECT);
-    if (text_.GetSize() != 0u) {
-      dynsym_.AddSymbol("oatexec", &text_, 0, true,
-                        text_.GetSize(), STB_GLOBAL, STT_OBJECT);
-      dynsym_.AddSymbol("oatlastword", &text_, text_.GetSize() - 4,
-                        true, 4, STB_GLOBAL, STT_OBJECT);
-    } else if (rodata_.GetSize() != 0) {
-      // rodata_ be size 0 for dwarf_test.
-      dynsym_.AddSymbol("oatlastword", &rodata_, rodata_.GetSize() - 4,
-                        true, 4, STB_GLOBAL, STT_OBJECT);
+  // Create program headers based on written sections.
+  std::vector<Elf_Phdr> MakeProgramHeaders() {
+    CHECK(!sections_.empty());
+    std::vector<Elf_Phdr> phdrs;
+    {
+      // The program headers must start with PT_PHDR which is used in
+      // the loaded process to determine the number of program headers.
+      Elf_Phdr phdr = Elf_Phdr();
+      phdr.p_type    = PT_PHDR;
+      phdr.p_flags   = PF_R;
+      phdr.p_offset  = phdr.p_vaddr = phdr.p_paddr = sizeof(Elf_Ehdr);
+      phdr.p_filesz  = phdr.p_memsz = 0;  // We need to fill this later.
+      phdr.p_align   = sizeof(Elf_Off);
+      phdrs.push_back(phdr);
+      // Tell the linker to mmap the start of the file into memory.
+      Elf_Phdr load = Elf_Phdr();
+      load.p_type    = PT_LOAD;
+      load.p_flags   = PF_R;
+      load.p_offset  = load.p_vaddr = load.p_paddr = 0;
+      load.p_filesz  = load.p_memsz = sections_[0]->header_.sh_offset;
+      load.p_align   = kPageSize;
+      phdrs.push_back(load);
+    }
+    // Create program headers for sections.
+    for (auto* section : sections_) {
+      const Elf_Shdr& shdr = section->header_;
+      if ((shdr.sh_flags & SHF_ALLOC) != 0 && shdr.sh_size != 0) {
+        // PT_LOAD tells the linker to mmap part of the file.
+        // The linker can only mmap page-aligned sections.
+        // A single PT_LOAD may contain several ELF sections.
+        Elf_Phdr& prev = phdrs.back();
+        Elf_Phdr load = Elf_Phdr();
+        load.p_type   = PT_LOAD;
+        load.p_flags  = section->phdr_flags_;
+        load.p_offset = shdr.sh_offset;
+        load.p_vaddr  = load.p_paddr = shdr.sh_addr;
+        load.p_filesz = (shdr.sh_type != SHT_NOBITS ? shdr.sh_size : 0u);
+        load.p_memsz  = shdr.sh_size;
+        load.p_align  = shdr.sh_addralign;
+        if (prev.p_type == load.p_type &&
+            prev.p_flags == load.p_flags &&
+            prev.p_filesz == prev.p_memsz &&  // Do not merge .bss
+            load.p_filesz == load.p_memsz) {  // Do not merge .bss
+          // Merge this PT_LOAD with the previous one.
+          Elf_Word size = shdr.sh_offset + shdr.sh_size - prev.p_offset;
+          prev.p_filesz = size;
+          prev.p_memsz  = size;
+        } else {
+          // If we are adding a new load, it must be aligned.
+          CHECK_EQ(shdr.sh_addralign, (Elf_Word)kPageSize);
+          phdrs.push_back(load);
+        }
+      }
     }
-    if (bss_.GetSize() != 0u) {
-      dynsym_.AddSymbol("oatbss", &bss_, 0, true,
-                        bss_.GetSize(), STB_GLOBAL, STT_OBJECT);
-      dynsym_.AddSymbol("oatbsslastword", &bss_, bss_.GetSize() - 4,
-                        true, 4, STB_GLOBAL, STT_OBJECT);
+    for (auto* section : sections_) {
+      const Elf_Shdr& shdr = section->header_;
+      if ((shdr.sh_flags & SHF_ALLOC) != 0 && shdr.sh_size != 0) {
+        // Other PT_* types allow the program to locate interesting
+        // parts of memory at runtime. They must overlap with PT_LOAD.
+        if (section->phdr_type_ != 0) {
+          Elf_Phdr phdr = Elf_Phdr();
+          phdr.p_type   = section->phdr_type_;
+          phdr.p_flags  = section->phdr_flags_;
+          phdr.p_offset = shdr.sh_offset;
+          phdr.p_vaddr  = phdr.p_paddr = shdr.sh_addr;
+          phdr.p_filesz = phdr.p_memsz = shdr.sh_size;
+          phdr.p_align  = shdr.sh_addralign;
+          phdrs.push_back(phdr);
+        }
+      }
     }
+    // Set the size of the initial PT_PHDR.
+    CHECK_EQ(phdrs[0].p_type, (Elf_Word)PT_PHDR);
+    phdrs[0].p_filesz = phdrs[0].p_memsz = phdrs.size() * sizeof(Elf_Phdr);
+
+    return phdrs;
   }
 
   InstructionSet isa_;
-  StrtabSection dynstr_;
-  SymtabSection dynsym_;
-  HashSection hash_;
-  OatSection rodata_;
-  OatSection text_;
-  NoBitsSection bss_;
-  DynamicSection dynamic_;
-  StrtabSection strtab_;
-  SymtabSection symtab_;
-  std::vector<Section*> other_sections_;
-  StrtabSection shstrtab_;
+
+  OutputStream* output_;
+  bool output_good_;  // True if all writes to output succeeded.
+  off_t output_offset_;  // Keep track of the current position in the stream.
+
+  Section rodata_;
+  Section text_;
+  Section bss_;
+  StringSection dynstr_;
+  SymbolSection dynsym_;
+  Section hash_;
+  Section dynamic_;
+  Section eh_frame_;
+  Section eh_frame_hdr_;
+  StringSection strtab_;
+  SymbolSection symtab_;
+  Section debug_frame_;
+  Section debug_info_;
+  Section debug_line_;
+  StringSection shstrtab_;
+  std::vector<std::unique_ptr<Section>> other_sections_;
+
+  // List of used sections in the order in which they were written.
+  std::vector<Section*> sections_;
+
+  // Used for allocation of virtual address space.
+  Elf_Addr virtual_address_;
 
   DISALLOW_COPY_AND_ASSIGN(ElfBuilder);
 };
diff --git a/compiler/elf_writer_debug.cc b/compiler/elf_writer_debug.cc
index 3a9e312225..5e2a8bf50e 100644
--- a/compiler/elf_writer_debug.cc
+++ b/compiler/elf_writer_debug.cc
@@ -19,21 +19,23 @@
 #include <unordered_set>
 
 #include "base/casts.h"
+#include "base/stl_util.h"
 #include "compiled_method.h"
 #include "driver/compiler_driver.h"
 #include "dex_file-inl.h"
+#include "dwarf/dedup_vector.h"
 #include "dwarf/headers.h"
 #include "dwarf/register.h"
+#include "elf_builder.h"
 #include "oat_writer.h"
 #include "utils.h"
 
 namespace art {
 namespace dwarf {
 
-static void WriteDebugFrameCIE(InstructionSet isa,
-                               ExceptionHeaderValueApplication addr_type,
-                               CFIFormat format,
-                               std::vector<uint8_t>* eh_frame) {
+static void WriteCIE(InstructionSet isa,
+                     CFIFormat format,
+                     std::vector<uint8_t>* buffer) {
   // Scratch registers should be marked as undefined.  This tells the
   // debugger that its value in the previous frame is not recoverable.
   bool is64bit = Is64BitInstructionSet(isa);
@@ -59,8 +61,7 @@ static void WriteDebugFrameCIE(InstructionSet isa,
         }
       }
       auto return_reg = Reg::ArmCore(14);  // R14(LR).
-      WriteDebugFrameCIE(is64bit, addr_type, return_reg,
-                         opcodes, format, eh_frame);
+      WriteCIE(is64bit, return_reg, opcodes, format, buffer);
       return;
     }
     case kArm64: {
@@ -83,8 +84,7 @@ static void WriteDebugFrameCIE(InstructionSet isa,
         }
       }
       auto return_reg = Reg::Arm64Core(30);  // R30(LR).
-      WriteDebugFrameCIE(is64bit, addr_type, return_reg,
-                         opcodes, format, eh_frame);
+      WriteCIE(is64bit, return_reg, opcodes, format, buffer);
       return;
     }
     case kMips:
@@ -100,8 +100,7 @@ static void WriteDebugFrameCIE(InstructionSet isa,
         }
       }
       auto return_reg = Reg::MipsCore(31);  // R31(RA).
-      WriteDebugFrameCIE(is64bit, addr_type, return_reg,
-                         opcodes, format, eh_frame);
+      WriteCIE(is64bit, return_reg, opcodes, format, buffer);
       return;
     }
     case kX86: {
@@ -127,8 +126,7 @@ static void WriteDebugFrameCIE(InstructionSet isa,
         }
       }
       auto return_reg = Reg::X86Core(8);  // R8(EIP).
-      WriteDebugFrameCIE(is64bit, addr_type, return_reg,
-                         opcodes, format, eh_frame);
+      WriteCIE(is64bit, return_reg, opcodes, format, buffer);
       return;
     }
     case kX86_64: {
@@ -154,8 +152,7 @@ static void WriteDebugFrameCIE(InstructionSet isa,
         }
       }
       auto return_reg = Reg::X86_64Core(16);  // R16(RIP).
-      WriteDebugFrameCIE(is64bit, addr_type, return_reg,
-                         opcodes, format, eh_frame);
+      WriteCIE(is64bit, return_reg, opcodes, format, buffer);
       return;
     }
     case kNone:
@@ -165,36 +162,69 @@ static void WriteDebugFrameCIE(InstructionSet isa,
   UNREACHABLE();
 }
 
-void WriteCFISection(const CompilerDriver* compiler,
-                     const OatWriter* oat_writer,
-                     ExceptionHeaderValueApplication address_type,
-                     CFIFormat format,
-                     std::vector<uint8_t>* debug_frame,
-                     std::vector<uintptr_t>* debug_frame_patches,
-                     std::vector<uint8_t>* eh_frame_hdr,
-                     std::vector<uintptr_t>* eh_frame_hdr_patches) {
-  const auto& method_infos = oat_writer->GetMethodDebugInfo();
-  const InstructionSet isa = compiler->GetInstructionSet();
+template<typename ElfTypes>
+void WriteCFISection(ElfBuilder<ElfTypes>* builder,
+                     const std::vector<OatWriter::DebugInfo>& method_infos,
+                     CFIFormat format) {
+  CHECK(format == dwarf::DW_DEBUG_FRAME_FORMAT ||
+        format == dwarf::DW_EH_FRAME_FORMAT);
+  typedef typename ElfTypes::Addr Elf_Addr;
+
+  std::vector<uint32_t> binary_search_table;
+  std::vector<uintptr_t> patch_locations;
+  if (format == DW_EH_FRAME_FORMAT) {
+    binary_search_table.reserve(2 * method_infos.size());
+  } else {
+    patch_locations.reserve(method_infos.size());
+  }
 
   // Write .eh_frame/.debug_frame section.
-  std::map<uint32_t, size_t> address_to_fde_offset_map;
-  size_t cie_offset = debug_frame->size();
-  WriteDebugFrameCIE(isa, address_type, format, debug_frame);
-  for (const OatWriter::DebugInfo& mi : method_infos) {
-    if (!mi.deduped_) {  // Only one FDE per unique address.
-      ArrayRef<const uint8_t> opcodes = mi.compiled_method_->GetCFIInfo();
-      if (!opcodes.empty()) {
-        address_to_fde_offset_map.emplace(mi.low_pc_, debug_frame->size());
-        WriteDebugFrameFDE(Is64BitInstructionSet(isa), cie_offset,
-                           mi.low_pc_, mi.high_pc_ - mi.low_pc_,
-                           opcodes, format, debug_frame, debug_frame_patches);
+  auto* cfi_section = (format == dwarf::DW_DEBUG_FRAME_FORMAT
+                       ? builder->GetDebugFrame()
+                       : builder->GetEhFrame());
+  {
+    cfi_section->Start();
+    const bool is64bit = Is64BitInstructionSet(builder->GetIsa());
+    const Elf_Addr text_address = builder->GetText()->GetAddress();
+    const Elf_Addr cfi_address = cfi_section->GetAddress();
+    const Elf_Addr cie_address = cfi_address;
+    Elf_Addr buffer_address = cfi_address;
+    std::vector<uint8_t> buffer;  // Small temporary buffer.
+    WriteCIE(builder->GetIsa(), format, &buffer);
+    cfi_section->WriteFully(buffer.data(), buffer.size());
+    buffer_address += buffer.size();
+    buffer.clear();
+    for (const OatWriter::DebugInfo& mi : method_infos) {
+      if (!mi.deduped_) {  // Only one FDE per unique address.
+        ArrayRef<const uint8_t> opcodes = mi.compiled_method_->GetCFIInfo();
+        if (!opcodes.empty()) {
+          const Elf_Addr code_address = text_address + mi.low_pc_;
+          if (format == DW_EH_FRAME_FORMAT) {
+            binary_search_table.push_back(
+                dchecked_integral_cast<uint32_t>(code_address));
+            binary_search_table.push_back(
+                dchecked_integral_cast<uint32_t>(buffer_address));
+          }
+          WriteFDE(is64bit, cfi_address, cie_address,
+                   code_address, mi.high_pc_ - mi.low_pc_,
+                   opcodes, format, buffer_address, &buffer,
+                   &patch_locations);
+          cfi_section->WriteFully(buffer.data(), buffer.size());
+          buffer_address += buffer.size();
+          buffer.clear();
+        }
       }
     }
+    cfi_section->End();
   }
 
   if (format == DW_EH_FRAME_FORMAT) {
+    auto* header_section = builder->GetEhFrameHdr();
+    header_section->Start();
+    uint32_t header_address = dchecked_integral_cast<uint32_t>(header_section->GetAddress());
     // Write .eh_frame_hdr section.
-    Writer<> header(eh_frame_hdr);
+    std::vector<uint8_t> buffer;
+    Writer<> header(&buffer);
     header.PushUint8(1);  // Version.
     // Encoding of .eh_frame pointer - libunwind does not honor datarel here,
     // so we have to use pcrel which means relative to the pointer's location.
@@ -204,114 +234,294 @@ void WriteCFISection(const CompilerDriver* compiler,
     // Encoding of binary search table addresses - libunwind supports only this
     // specific combination, which means relative to the start of .eh_frame_hdr.
     header.PushUint8(DW_EH_PE_datarel | DW_EH_PE_sdata4);
-    // .eh_frame pointer - .eh_frame_hdr section is after .eh_frame section
-    const int32_t relative_eh_frame_begin = -static_cast<int32_t>(debug_frame->size());
-    header.PushInt32(relative_eh_frame_begin - 4U);
+    // .eh_frame pointer
+    header.PushInt32(cfi_section->GetAddress() - (header_address + 4u));
     // Binary search table size (number of entries).
-    header.PushUint32(dchecked_integral_cast<uint32_t>(address_to_fde_offset_map.size()));
+    header.PushUint32(dchecked_integral_cast<uint32_t>(binary_search_table.size()/2));
+    header_section->WriteFully(buffer.data(), buffer.size());
     // Binary search table.
-    for (const auto& address_to_fde_offset : address_to_fde_offset_map) {
-      u_int32_t code_address = address_to_fde_offset.first;
-      int32_t fde_address = dchecked_integral_cast<int32_t>(address_to_fde_offset.second);
-      eh_frame_hdr_patches->push_back(header.data()->size());
-      header.PushUint32(code_address);
-      // We know the exact layout (eh_frame is immediately before eh_frame_hdr)
-      // and the data is relative to the start of the eh_frame_hdr,
-      // so patching isn't necessary (in contrast to the code address above).
-      header.PushInt32(relative_eh_frame_begin + fde_address);
+    for (size_t i = 0; i < binary_search_table.size(); i++) {
+      binary_search_table[i] -= header_address;  // Section-relative; header address known now.
     }
+    header_section->WriteFully(binary_search_table.data(),  // Size is in bytes, not entries.
+                               binary_search_table.size() * sizeof(uint32_t));
+    header_section->End();
+  } else {
+    builder->WritePatches(".debug_frame.oat_patches", &patch_locations);
   }
 }
 
-/*
- * @brief Generate the DWARF sections.
- * @param oat_writer The Oat file Writer.
- * @param eh_frame Call Frame Information.
- * @param debug_info Compilation unit information.
- * @param debug_info_patches Address locations to be patched.
- * @param debug_abbrev Abbreviations used to generate dbg_info.
- * @param debug_str Debug strings.
- * @param debug_line Line number table.
- * @param debug_line_patches Address locations to be patched.
- */
-void WriteDebugSections(const CompilerDriver* compiler,
-                        const OatWriter* oat_writer,
-                        std::vector<uint8_t>* debug_info,
-                        std::vector<uintptr_t>* debug_info_patches,
-                        std::vector<uint8_t>* debug_abbrev,
-                        std::vector<uint8_t>* debug_str,
-                        std::vector<uint8_t>* debug_line,
-                        std::vector<uintptr_t>* debug_line_patches) {
-  const std::vector<OatWriter::DebugInfo>& method_infos = oat_writer->GetMethodDebugInfo();
-  const InstructionSet isa = compiler->GetInstructionSet();
-  const bool is64bit = Is64BitInstructionSet(isa);
-
-  // Find all addresses (low_pc) which contain deduped methods.
-  // The first instance of method is not marked deduped_, but the rest is.
-  std::unordered_set<uint32_t> deduped_addresses;
-  for (const OatWriter::DebugInfo& mi : method_infos) {
-    if (mi.deduped_) {
-      deduped_addresses.insert(mi.low_pc_);
+struct CompilationUnit {  // Methods grouped by source file, plus their combined pc range.
+  std::vector<const OatWriter::DebugInfo*> methods_;  // Methods grouped into this unit.
+  size_t debug_line_offset_ = 0;  // Set by DebugLineWriter; emitted as DW_AT_stmt_list.
+  uint32_t low_pc_ = 0xFFFFFFFFU;  // Minimum low_pc_ of methods_ (starts at max for std::min).
+  uint32_t high_pc_ = 0;  // Maximum high_pc_ of methods_.
+};
+
+// Helper class to write .debug_info and its supporting sections.
+template<typename ElfTypes>
+class DebugInfoWriter {
+  typedef typename ElfTypes::Addr Elf_Addr;
+
+  // Helper class to write one compilation unit.
+  // It holds helper methods and temporary state.
+  class CompilationUnitWriter {
+   public:
+    explicit CompilationUnitWriter(DebugInfoWriter* owner)
+      : owner_(owner),
+        info_(Is64BitInstructionSet(owner_->builder_->GetIsa()), &debug_abbrev_) {
     }
-  }
 
-  // Group the methods into compilation units based on source file.
-  std::vector<std::vector<const OatWriter::DebugInfo*>> compilation_units;
-  const char* last_source_file = nullptr;
-  for (const OatWriter::DebugInfo& mi : method_infos) {
-    // Attribute given instruction range only to single method.
-    // Otherwise the debugger might get really confused.
-    if (!mi.deduped_) {
-      auto& dex_class_def = mi.dex_file_->GetClassDef(mi.class_def_index_);
-      const char* source_file = mi.dex_file_->GetSourceFile(dex_class_def);
-      if (compilation_units.empty() || source_file != last_source_file) {
-        compilation_units.push_back(std::vector<const OatWriter::DebugInfo*>());
+    void Write(const CompilationUnit& compilation_unit) {
+      CHECK(!compilation_unit.methods_.empty());
+      const Elf_Addr text_address = owner_->builder_->GetText()->GetAddress();
+
+      info_.StartTag(DW_TAG_compile_unit);
+      info_.WriteStrp(DW_AT_producer, owner_->WriteString("Android dex2oat"));
+      info_.WriteData1(DW_AT_language, DW_LANG_Java);
+      info_.WriteAddr(DW_AT_low_pc, text_address + compilation_unit.low_pc_);
+      info_.WriteAddr(DW_AT_high_pc, text_address + compilation_unit.high_pc_);
+      info_.WriteData4(DW_AT_stmt_list, compilation_unit.debug_line_offset_);
+
+      const char* last_dex_class_desc = nullptr;
+      for (auto mi : compilation_unit.methods_) {
+        const DexFile* dex = mi->dex_file_;
+        const DexFile::MethodId& dex_method = dex->GetMethodId(mi->dex_method_index_);
+        const DexFile::ProtoId& dex_proto = dex->GetMethodPrototype(dex_method);
+        const DexFile::TypeList* dex_params = dex->GetProtoParameters(dex_proto);
+        const char* dex_class_desc = dex->GetMethodDeclaringClassDescriptor(dex_method);
+
+        // Enclose the method in correct class definition.
+        if (last_dex_class_desc != dex_class_desc) {
+          if (last_dex_class_desc != nullptr) {
+            EndClassTag(last_dex_class_desc);
+          }
+          size_t offset = StartClassTag(dex_class_desc);
+          type_cache_.emplace(dex_class_desc, offset);
+          // Check that each class is defined only once.
+          bool unique = owner_->defined_dex_classes_.insert(dex_class_desc).second;
+          CHECK(unique) << "Redefinition of " << dex_class_desc;
+          last_dex_class_desc = dex_class_desc;
+        }
+
+        std::vector<const char*> param_names;
+        if (mi->code_item_ != nullptr) {
+          const uint8_t* stream = dex->GetDebugInfoStream(mi->code_item_);
+          if (stream != nullptr) {
+            DecodeUnsignedLeb128(&stream);  // line.
+            uint32_t parameters_size = DecodeUnsignedLeb128(&stream);
+            for (uint32_t i = 0; i < parameters_size; ++i) {
+              uint32_t id = DecodeUnsignedLeb128P1(&stream);
+              param_names.push_back(mi->dex_file_->StringDataByIdx(id));
+            }
+          }
+        }
+
+        int start_depth = info_.Depth();
+        info_.StartTag(DW_TAG_subprogram);
+        WriteName(dex->GetMethodName(dex_method));
+        info_.WriteAddr(DW_AT_low_pc, text_address + mi->low_pc_);
+        info_.WriteAddr(DW_AT_high_pc, text_address + mi->high_pc_);
+        WriteLazyType(dex->GetReturnTypeDescriptor(dex_proto));
+        if (dex_params != nullptr) {
+          for (uint32_t i = 0; i < dex_params->Size(); ++i) {
+            info_.StartTag(DW_TAG_formal_parameter);
+            // Parameter names may not be always available.
+            if (i < param_names.size() && param_names[i] != nullptr) {
+              WriteName(param_names[i]);
+            }
+            WriteLazyType(dex->StringByTypeIdx(dex_params->GetTypeItem(i).type_idx_));
+            info_.EndTag();
+          }
+        }
+        info_.EndTag();
+        CHECK_EQ(info_.Depth(), start_depth);  // Balanced start/end.
+      }
+      if (last_dex_class_desc != nullptr) {
+        EndClassTag(last_dex_class_desc);
       }
-      compilation_units.back().push_back(&mi);
-      last_source_file = source_file;
+      CHECK_EQ(info_.Depth(), 1);
+      FinishLazyTypes();
+      info_.EndTag();  // DW_TAG_compile_unit
+      std::vector<uint8_t> buffer;
+      buffer.reserve(info_.data()->size() + KB);
+      const size_t offset = owner_->builder_->GetDebugInfo()->GetSize();
+      const size_t debug_abbrev_offset =
+          owner_->debug_abbrev_.Insert(debug_abbrev_.data(), debug_abbrev_.size());
+      WriteDebugInfoCU(debug_abbrev_offset, info_, offset, &buffer, &owner_->debug_info_patches_);
+      owner_->builder_->GetDebugInfo()->WriteFully(buffer.data(), buffer.size());
     }
-  }
 
-  // Write .debug_info section.
-  for (const auto& compilation_unit : compilation_units) {
-    uint32_t cunit_low_pc = 0xFFFFFFFFU;
-    uint32_t cunit_high_pc = 0;
-    for (auto method_info : compilation_unit) {
-      cunit_low_pc = std::min(cunit_low_pc, method_info->low_pc_);
-      cunit_high_pc = std::max(cunit_high_pc, method_info->high_pc_);
+    // Some types are difficult to define as we go since they need
+    // to be enclosed in the right set of namespaces. Therefore we
+    // just define all types lazily at the end of the compilation unit.
+    void WriteLazyType(const char* type_descriptor) {
+      DCHECK(type_descriptor != nullptr);
+      if (type_descriptor[0] != 'V') {  // 'V' (void) gets no DW_AT_type attribute.
+        lazy_types_.emplace(type_descriptor, info_.size());  // Offset later given to UpdateUint32.
+        info_.WriteRef4(DW_AT_type, 0);  // Placeholder value; updated in FinishLazyTypes().
+      }
+    }
+
+    void FinishLazyTypes() {  // Resolve every reference recorded by WriteLazyType().
+      for (const auto& lazy_type : lazy_types_) {  // first = descriptor, second = patch offset.
+        info_.UpdateUint32(lazy_type.second, WriteType(lazy_type.first));
+      }
+      lazy_types_.clear();
+    }
+
+   private:
+    void WriteName(const char* name) {
+      info_.WriteStrp(DW_AT_name, owner_->WriteString(name));
+    }
+
+    // Convert a dex type descriptor to a DWARF type entry, reusing cached definitions.
+    // Returns the entry's offset in the compilation unit.
+    size_t WriteType(const char* desc) {
+      const auto& it = type_cache_.find(desc);  // Already written in this unit?
+      if (it != type_cache_.end()) {
+        return it->second;
+      }
+
+      size_t offset;
+      if (*desc == 'L') {
+        // Class type. For example: Lpackage/name;
+        offset = StartClassTag(desc);
+        info_.WriteFlag(DW_AT_declaration, true);  // Mark as a declaration only.
+        EndClassTag(desc);
+      } else if (*desc == '[') {
+        // Array type.
+        size_t element_type = WriteType(desc + 1);  // Element type is resolved first (recursive).
+        offset = info_.StartTag(DW_TAG_array_type);
+        info_.WriteRef(DW_AT_type, element_type);
+        info_.EndTag();
+      } else {
+        // Primitive types.
+        const char* name;
+        switch (*desc) {
+        case 'B': name = "byte"; break;
+        case 'C': name = "char"; break;
+        case 'D': name = "double"; break;
+        case 'F': name = "float"; break;
+        case 'I': name = "int"; break;
+        case 'J': name = "long"; break;
+        case 'S': name = "short"; break;
+        case 'Z': name = "boolean"; break;
+        case 'V': name = "void"; break;
+        default:
+          LOG(FATAL) << "Unknown dex type descriptor: " << desc;
+          UNREACHABLE();
+        }
+        offset = info_.StartTag(DW_TAG_base_type);
+        WriteName(name);
+        info_.EndTag();
+      }
+
+      type_cache_.emplace(desc, offset);  // Cache so later references reuse this entry.
+      return offset;
+    }
+
+    // Start DW_TAG_class_type tag nested in DW_TAG_namespace tags (closed by EndClassTag).
+    // Returns offset of the class tag in the compilation unit.
+    size_t StartClassTag(const char* desc) {
+      DCHECK(desc != nullptr && desc[0] == 'L');
+      // Enclose the type in one namespace tag per '/'-terminated component of the descriptor.
+      const char* end;
+      for (desc = desc + 1; (end = strchr(desc, '/')) != nullptr; desc = end + 1) {
+        info_.StartTag(DW_TAG_namespace);
+        WriteName(std::string(desc, end - desc).c_str());
+      }
+      // Start the class tag.
+      size_t offset = info_.StartTag(DW_TAG_class_type);
+      end = strchr(desc, ';');  // The class name is the text between the last '/' and the ';'.
+      CHECK(end != nullptr);
+      WriteName(std::string(desc, end - desc).c_str());
+      return offset;
     }
 
-    size_t debug_abbrev_offset = debug_abbrev->size();
-    DebugInfoEntryWriter<> info(is64bit, debug_abbrev);
-    info.StartTag(DW_TAG_compile_unit, DW_CHILDREN_yes);
-    info.WriteStrp(DW_AT_producer, "Android dex2oat", debug_str);
-    info.WriteData1(DW_AT_language, DW_LANG_Java);
-    info.WriteAddr(DW_AT_low_pc, cunit_low_pc);
-    info.WriteAddr(DW_AT_high_pc, cunit_high_pc);
-    info.WriteData4(DW_AT_stmt_list, debug_line->size());
-    for (auto method_info : compilation_unit) {
-      std::string method_name = PrettyMethod(method_info->dex_method_index_,
-                                             *method_info->dex_file_, true);
-      if (deduped_addresses.find(method_info->low_pc_) != deduped_addresses.end()) {
-        method_name += " [DEDUPED]";
+    void EndClassTag(const char* desc) {
+      DCHECK(desc != nullptr && desc[0] == 'L');
+      // End the class tag.
+      info_.EndTag();
+      // Close namespace tags.
+      const char* end;
+      for (desc = desc + 1; (end = strchr(desc, '/')) != nullptr; desc = end + 1) {
+        info_.EndTag();
       }
-      info.StartTag(DW_TAG_subprogram, DW_CHILDREN_no);
-      info.WriteStrp(DW_AT_name, method_name.data(), debug_str);
-      info.WriteAddr(DW_AT_low_pc, method_info->low_pc_);
-      info.WriteAddr(DW_AT_high_pc, method_info->high_pc_);
-      info.EndTag();  // DW_TAG_subprogram
     }
-    info.EndTag();  // DW_TAG_compile_unit
-    WriteDebugInfoCU(debug_abbrev_offset, info, debug_info, debug_info_patches);
 
-    // Write .debug_line section.
+    // For access to the ELF sections.
+    DebugInfoWriter<ElfTypes>* owner_;
+    // Debug abbrevs for this compilation unit only.
+    std::vector<uint8_t> debug_abbrev_;
+    // Temporary buffer to create and store the entries.
+    DebugInfoEntryWriter<> info_;
+    // Cache of already translated type descriptors.
+    std::map<const char*, size_t, CStringLess> type_cache_;  // type_desc -> definition_offset.
+    // 32-bit references which need to be resolved to a type later.
+    std::multimap<const char*, size_t, CStringLess> lazy_types_;  // type_desc -> patch_offset.
+  };
+
+ public:
+  explicit DebugInfoWriter(ElfBuilder<ElfTypes>* builder) : builder_(builder) {
+  }
+
+  void Start() {
+    builder_->GetDebugInfo()->Start();
+  }
+
+  void WriteCompilationUnit(const CompilationUnit& compilation_unit) {
+    CompilationUnitWriter writer(this);
+    writer.Write(compilation_unit);
+  }
+
+  void End() {
+    builder_->GetDebugInfo()->End();
+    builder_->WritePatches(".debug_info.oat_patches", &debug_info_patches_);
+    builder_->WriteSection(".debug_abbrev", &debug_abbrev_.Data());
+    builder_->WriteSection(".debug_str", &debug_str_.Data());
+  }
+
+ private:
+  size_t WriteString(const char* str) {
+    return debug_str_.Insert(reinterpret_cast<const uint8_t*>(str), strlen(str) + 1);
+  }
+
+  ElfBuilder<ElfTypes>* builder_;
+  std::vector<uintptr_t> debug_info_patches_;
+  DedupVector debug_abbrev_;
+  DedupVector debug_str_;
+
+  std::unordered_set<const char*> defined_dex_classes_;  // For CHECKs only.
+};
+
+template<typename ElfTypes>
+class DebugLineWriter {
+  typedef typename ElfTypes::Addr Elf_Addr;
+
+ public:
+  explicit DebugLineWriter(ElfBuilder<ElfTypes>* builder) : builder_(builder) {
+  }
+
+  void Start() {
+    builder_->GetDebugLine()->Start();
+  }
+
+  // Write line table for given set of methods.
+  // Returns the number of bytes written.
+  size_t WriteCompilationUnit(CompilationUnit& compilation_unit) {
+    const bool is64bit = Is64BitInstructionSet(builder_->GetIsa());
+    const Elf_Addr text_address = builder_->GetText()->GetAddress();
+
+    compilation_unit.debug_line_offset_ = builder_->GetDebugLine()->GetSize();
+
     std::vector<FileEntry> files;
     std::unordered_map<std::string, size_t> files_map;
     std::vector<std::string> directories;
     std::unordered_map<std::string, size_t> directories_map;
     int code_factor_bits_ = 0;
     int dwarf_isa = -1;
-    switch (isa) {
+    switch (builder_->GetIsa()) {
       case kArm:  // arm actually means thumb2.
       case kThumb2:
         code_factor_bits_ = 1;  // 16-bit instuctions
@@ -328,11 +538,17 @@ void WriteDebugSections(const CompilerDriver* compiler,
         break;
     }
     DebugLineOpCodeWriter<> opcodes(is64bit, code_factor_bits_);
-    opcodes.SetAddress(cunit_low_pc);
+    opcodes.SetAddress(text_address + compilation_unit.low_pc_);
     if (dwarf_isa != -1) {
       opcodes.SetISA(dwarf_isa);
     }
-    for (const OatWriter::DebugInfo* mi : compilation_unit) {
+    for (const OatWriter::DebugInfo* mi : compilation_unit.methods_) {
+      // Ignore function if we have already generated line table for the same address.
+      // It would confuse the debugger and the DWARF specification forbids it.
+      if (mi->deduped_) {
+        continue;
+      }
+
       struct DebugInfoCallbacks {
         static bool NewPosition(void* ctx, uint32_t address, uint32_t line) {
           auto* context = reinterpret_cast<DebugInfoCallbacks*>(ctx);
@@ -342,6 +558,8 @@ void WriteDebugSections(const CompilerDriver* compiler,
         DefaultSrcMap dex2line_;
       } debug_info_callbacks;
 
+      Elf_Addr method_address = text_address + mi->low_pc_;
+
       const DexFile* dex = mi->dex_file_;
       if (mi->code_item_ != nullptr) {
         dex->DecodeDebugInfo(mi->code_item_,
@@ -414,26 +632,96 @@ void WriteDebugSections(const CompilerDriver* compiler,
                 int first_line = dex2line_map.front().to_;
                 // Prologue is not a sensible place for a breakpoint.
                 opcodes.NegateStmt();
-                opcodes.AddRow(mi->low_pc_, first_line);
+                opcodes.AddRow(method_address, first_line);
                 opcodes.NegateStmt();
                 opcodes.SetPrologueEnd();
               }
-              opcodes.AddRow(mi->low_pc_ + pc, line);
+              opcodes.AddRow(method_address + pc, line);
             } else if (line != opcodes.CurrentLine()) {
-              opcodes.AddRow(mi->low_pc_ + pc, line);
+              opcodes.AddRow(method_address + pc, line);
             }
           }
         }
       } else {
         // line 0 - instruction cannot be attributed to any source line.
-        opcodes.AddRow(mi->low_pc_, 0);
+        opcodes.AddRow(method_address, 0);
       }
     }
-    opcodes.AdvancePC(cunit_high_pc);
+    opcodes.AdvancePC(text_address + compilation_unit.high_pc_);
     opcodes.EndSequence();
-    WriteDebugLineTable(directories, files, opcodes, debug_line, debug_line_patches);
+    std::vector<uint8_t> buffer;
+    buffer.reserve(opcodes.data()->size() + KB);
+    size_t offset = builder_->GetDebugLine()->GetSize();
+    WriteDebugLineTable(directories, files, opcodes, offset, &buffer, &debug_line_patches);
+    builder_->GetDebugLine()->WriteFully(buffer.data(), buffer.size());
+    return buffer.size();
+  }
+
+  void End() {
+    builder_->GetDebugLine()->End();
+    builder_->WritePatches(".debug_line.oat_patches", &debug_line_patches);
+  }
+
+ private:
+  ElfBuilder<ElfTypes>* builder_;
+  std::vector<uintptr_t> debug_line_patches;
+};
+
+template<typename ElfTypes>
+void WriteDebugSections(ElfBuilder<ElfTypes>* builder,
+                        const std::vector<OatWriter::DebugInfo>& method_infos) {
+  // Group consecutive methods with the same source file into compilation units.
+  std::vector<CompilationUnit> compilation_units;
+  const char* last_source_file = nullptr;
+  for (const OatWriter::DebugInfo& mi : method_infos) {
+    auto& dex_class_def = mi.dex_file_->GetClassDef(mi.class_def_index_);
+    const char* source_file = mi.dex_file_->GetSourceFile(dex_class_def);
+    if (compilation_units.empty() || source_file != last_source_file) {  // Pointer comparison.
+      compilation_units.push_back(CompilationUnit());
+    }
+    CompilationUnit& cu = compilation_units.back();
+    cu.methods_.push_back(&mi);
+    cu.low_pc_ = std::min(cu.low_pc_, mi.low_pc_);  // Widen the unit's pc range to cover mi.
+    cu.high_pc_ = std::max(cu.high_pc_, mi.high_pc_);
+    last_source_file = source_file;
+  }
+
+  // Write .debug_line section first; it sets debug_line_offset_ read by .debug_info.
+  {
+    DebugLineWriter<ElfTypes> line_writer(builder);
+    line_writer.Start();
+    for (auto& compilation_unit : compilation_units) {  // Non-const: the writer updates the unit.
+      line_writer.WriteCompilationUnit(compilation_unit);
+    }
+    line_writer.End();
+  }
+
+  // Write .debug_info section.
+  {
+    DebugInfoWriter<ElfTypes> info_writer(builder);
+    info_writer.Start();
+    for (const auto& compilation_unit : compilation_units) {
+      info_writer.WriteCompilationUnit(compilation_unit);
+    }
+    info_writer.End();
   }
 }
 
+// Explicit instantiations
+template void WriteCFISection<ElfTypes32>(
+    ElfBuilder<ElfTypes32>* builder,
+    const std::vector<OatWriter::DebugInfo>& method_infos,
+    CFIFormat format);
+template void WriteCFISection<ElfTypes64>(
+    ElfBuilder<ElfTypes64>* builder,
+    const std::vector<OatWriter::DebugInfo>& method_infos,
+    CFIFormat format);
+template void WriteDebugSections<ElfTypes32>(
+    ElfBuilder<ElfTypes32>* builder,
+    const std::vector<OatWriter::DebugInfo>& method_infos);
+template void WriteDebugSections<ElfTypes64>(
+    ElfBuilder<ElfTypes64>* builder,
+    const std::vector<OatWriter::DebugInfo>& method_infos);
+
 }  // namespace dwarf
 }  // namespace art
diff --git a/compiler/elf_writer_debug.h b/compiler/elf_writer_debug.h
index 69f7e0d811..e58fd0a390 100644
--- a/compiler/elf_writer_debug.h
+++ b/compiler/elf_writer_debug.h
@@ -19,29 +19,21 @@
 
 #include <vector>
 
+#include "elf_builder.h"
 #include "dwarf/dwarf_constants.h"
 #include "oat_writer.h"
 
 namespace art {
 namespace dwarf {
 
-void WriteCFISection(const CompilerDriver* compiler,
-                     const OatWriter* oat_writer,
-                     ExceptionHeaderValueApplication address_type,
-                     CFIFormat format,
-                     std::vector<uint8_t>* debug_frame,
-                     std::vector<uintptr_t>* debug_frame_patches,
-                     std::vector<uint8_t>* eh_frame_hdr,
-                     std::vector<uintptr_t>* eh_frame_hdr_patches);
-
-void WriteDebugSections(const CompilerDriver* compiler,
-                        const OatWriter* oat_writer,
-                        std::vector<uint8_t>* debug_info,
-                        std::vector<uintptr_t>* debug_info_patches,
-                        std::vector<uint8_t>* debug_abbrev,
-                        std::vector<uint8_t>* debug_str,
-                        std::vector<uint8_t>* debug_line,
-                        std::vector<uintptr_t>* debug_line_patches);
+template<typename ElfTypes>
+void WriteCFISection(ElfBuilder<ElfTypes>* builder,
+                     const std::vector<OatWriter::DebugInfo>& method_infos,
+                     CFIFormat format);
+
+template<typename ElfTypes>
+void WriteDebugSections(ElfBuilder<ElfTypes>* builder,
+                        const std::vector<OatWriter::DebugInfo>& method_infos);
 
 }  // namespace dwarf
 }  // namespace art
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index dce1e861b4..5c059e1e82 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -70,190 +70,78 @@ bool ElfWriterQuick<ElfTypes>::Create(File* elf_file,
 template <typename ElfTypes>
 static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writer);
 
-// Encode patch locations as LEB128 list of deltas between consecutive addresses.
 template <typename ElfTypes>
-void ElfWriterQuick<ElfTypes>::EncodeOatPatches(const std::vector<uintptr_t>& locations,
-                                                std::vector<uint8_t>* buffer) {
-  buffer->reserve(buffer->size() + locations.size() * 2);  // guess 2 bytes per ULEB128.
-  uintptr_t address = 0;  // relative to start of section.
-  for (uintptr_t location : locations) {
-    DCHECK_GE(location, address) << "Patch locations are not in sorted order";
-    EncodeUnsignedLeb128(buffer, dchecked_integral_cast<uint32_t>(location - address));
-    address = location;
-  }
-}
-
-class RodataWriter FINAL : public CodeOutput {
- public:
-  explicit RodataWriter(OatWriter* oat_writer) : oat_writer_(oat_writer) {}
-
-  bool Write(OutputStream* out) OVERRIDE {
-    return oat_writer_->WriteRodata(out);
-  }
+bool ElfWriterQuick<ElfTypes>::Write(
+    OatWriter* oat_writer,
+    const std::vector<const DexFile*>& dex_files_unused ATTRIBUTE_UNUSED,
+    const std::string& android_root_unused ATTRIBUTE_UNUSED,
+    bool is_host_unused ATTRIBUTE_UNUSED) {
+  const InstructionSet isa = compiler_driver_->GetInstructionSet();
+  std::unique_ptr<BufferedOutputStream> output_stream(
+      new BufferedOutputStream(new FileOutputStream(elf_file_)));
+  std::unique_ptr<ElfBuilder<ElfTypes>> builder(
+      new ElfBuilder<ElfTypes>(isa, output_stream.get()));
 
- private:
-  OatWriter* oat_writer_;
-};
+  builder->Start();
 
-class TextWriter FINAL : public CodeOutput {
- public:
-  explicit TextWriter(OatWriter* oat_writer) : oat_writer_(oat_writer) {}
+  auto* rodata = builder->GetRoData();
+  auto* text = builder->GetText();
+  auto* bss = builder->GetBss();
 
-  bool Write(OutputStream* out) OVERRIDE {
-    return oat_writer_->WriteCode(out);
+  rodata->Start();
+  if (!oat_writer->WriteRodata(rodata)) {
+    return false;
   }
+  rodata->End();
 
- private:
-  OatWriter* oat_writer_;
-};
-
-enum PatchResult {
-  kAbsoluteAddress,  // Absolute memory location.
-  kPointerRelativeAddress,  // Offset relative to the location of the pointer.
-  kSectionRelativeAddress,  // Offset relative to start of containing section.
-};
-
-// Patch memory addresses within a buffer.
-// It assumes that the unpatched addresses are offsets relative to base_address.
-// (which generally means method's low_pc relative to the start of .text)
-template <typename Elf_Addr, typename Address, PatchResult kPatchResult>
-static void Patch(const std::vector<uintptr_t>& patch_locations,
-                  Elf_Addr buffer_address, Elf_Addr base_address,
-                  std::vector<uint8_t>* buffer) {
-  for (uintptr_t location : patch_locations) {
-    typedef __attribute__((__aligned__(1))) Address UnalignedAddress;
-    auto* to_patch = reinterpret_cast<UnalignedAddress*>(buffer->data() + location);
-    switch (kPatchResult) {
-      case kAbsoluteAddress:
-        *to_patch = (base_address + *to_patch);
-        break;
-      case kPointerRelativeAddress:
-        *to_patch = (base_address + *to_patch) - (buffer_address + location);
-        break;
-      case kSectionRelativeAddress:
-        *to_patch = (base_address + *to_patch) - buffer_address;
-        break;
-    }
+  text->Start();
+  if (!oat_writer->WriteCode(text)) {
+    return false;
   }
-}
+  text->End();
 
-template <typename ElfTypes>
-bool ElfWriterQuick<ElfTypes>::Write(
-    OatWriter* oat_writer,
-    const std::vector<const DexFile*>& dex_files_unused ATTRIBUTE_UNUSED,
-    const std::string& android_root_unused ATTRIBUTE_UNUSED,
-    bool is_host_unused ATTRIBUTE_UNUSED) {
-  using Elf_Addr = typename ElfTypes::Addr;
-  const InstructionSet isa = compiler_driver_->GetInstructionSet();
+  if (oat_writer->GetBssSize() != 0) {
+    bss->Start();
+    bss->SetSize(oat_writer->GetBssSize());
+    bss->End();
+  }
 
-  // Setup the builder with the main OAT sections (.rodata .text .bss).
-  const size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
-  const size_t text_size = oat_writer->GetSize() - rodata_size;
-  const size_t bss_size = oat_writer->GetBssSize();
-  RodataWriter rodata_writer(oat_writer);
-  TextWriter text_writer(oat_writer);
-  std::unique_ptr<ElfBuilder<ElfTypes>> builder(new ElfBuilder<ElfTypes>(
-      isa, rodata_size, &rodata_writer, text_size, &text_writer, bss_size));
+  builder->WriteDynamicSection(elf_file_->GetPath());
 
-  // Add debug sections.
-  // They are allocated here (in the same scope as the builder),
-  // but they are registered with the builder only if they are used.
-  using RawSection = typename ElfBuilder<ElfTypes>::RawSection;
-  const auto* text = builder->GetText();
-  const bool is64bit = Is64BitInstructionSet(isa);
-  const int pointer_size = GetInstructionSetPointerSize(isa);
-  std::unique_ptr<RawSection> eh_frame(new RawSection(
-      ".eh_frame", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0,
-      is64bit ? Patch<Elf_Addr, uint64_t, kPointerRelativeAddress> :
-                Patch<Elf_Addr, uint32_t, kPointerRelativeAddress>,
-      text));
-  std::unique_ptr<RawSection> eh_frame_hdr(new RawSection(
-      ".eh_frame_hdr", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, 4, 0,
-      Patch<Elf_Addr, uint32_t, kSectionRelativeAddress>, text));
-  std::unique_ptr<RawSection> debug_frame(new RawSection(
-      ".debug_frame", SHT_PROGBITS, 0, nullptr, 0, pointer_size, 0,
-      is64bit ? Patch<Elf_Addr, uint64_t, kAbsoluteAddress> :
-                Patch<Elf_Addr, uint32_t, kAbsoluteAddress>,
-      text));
-  std::unique_ptr<RawSection> debug_frame_oat_patches(new RawSection(
-      ".debug_frame.oat_patches", SHT_OAT_PATCH));
-  std::unique_ptr<RawSection> debug_info(new RawSection(
-      ".debug_info", SHT_PROGBITS, 0, nullptr, 0, 1, 0,
-      Patch<Elf_Addr, uint32_t, kAbsoluteAddress>, text));
-  std::unique_ptr<RawSection> debug_info_oat_patches(new RawSection(
-      ".debug_info.oat_patches", SHT_OAT_PATCH));
-  std::unique_ptr<RawSection> debug_abbrev(new RawSection(
-      ".debug_abbrev", SHT_PROGBITS));
-  std::unique_ptr<RawSection> debug_str(new RawSection(
-      ".debug_str", SHT_PROGBITS));
-  std::unique_ptr<RawSection> debug_line(new RawSection(
-      ".debug_line", SHT_PROGBITS, 0, nullptr, 0, 1, 0,
-      Patch<Elf_Addr, uint32_t, kAbsoluteAddress>, text));
-  std::unique_ptr<RawSection> debug_line_oat_patches(new RawSection(
-      ".debug_line.oat_patches", SHT_OAT_PATCH));
-  if (!oat_writer->GetMethodDebugInfo().empty()) {
-    if (compiler_driver_->GetCompilerOptions().GetGenerateDebugInfo()) {
-      // Generate CFI (stack unwinding information).
-      if (kCFIFormat == dwarf::DW_EH_FRAME_FORMAT) {
-        dwarf::WriteCFISection(
-            compiler_driver_, oat_writer,
-            dwarf::DW_EH_PE_pcrel, kCFIFormat,
-            eh_frame->GetBuffer(), eh_frame->GetPatchLocations(),
-            eh_frame_hdr->GetBuffer(), eh_frame_hdr->GetPatchLocations());
-        builder->RegisterSection(eh_frame.get());
-        builder->RegisterSection(eh_frame_hdr.get());
-      } else {
-        DCHECK(kCFIFormat == dwarf::DW_DEBUG_FRAME_FORMAT);
-        dwarf::WriteCFISection(
-            compiler_driver_, oat_writer,
-            dwarf::DW_EH_PE_absptr, kCFIFormat,
-            debug_frame->GetBuffer(), debug_frame->GetPatchLocations(),
-            nullptr, nullptr);
-        builder->RegisterSection(debug_frame.get());
-        EncodeOatPatches(*debug_frame->GetPatchLocations(),
-                         debug_frame_oat_patches->GetBuffer());
-        builder->RegisterSection(debug_frame_oat_patches.get());
-      }
+  if (compiler_driver_->GetCompilerOptions().GetGenerateDebugInfo()) {
+    const auto& method_infos = oat_writer->GetMethodDebugInfo();
+    if (!method_infos.empty()) {
       // Add methods to .symtab.
       WriteDebugSymbols(builder.get(), oat_writer);
-      // Generate DWARF .debug_* sections.
-      dwarf::WriteDebugSections(
-          compiler_driver_, oat_writer,
-          debug_info->GetBuffer(), debug_info->GetPatchLocations(),
-          debug_abbrev->GetBuffer(),
-          debug_str->GetBuffer(),
-          debug_line->GetBuffer(), debug_line->GetPatchLocations());
-      builder->RegisterSection(debug_info.get());
-      EncodeOatPatches(*debug_info->GetPatchLocations(),
-                       debug_info_oat_patches->GetBuffer());
-      builder->RegisterSection(debug_info_oat_patches.get());
-      builder->RegisterSection(debug_abbrev.get());
-      builder->RegisterSection(debug_str.get());
-      builder->RegisterSection(debug_line.get());
-      EncodeOatPatches(*debug_line->GetPatchLocations(),
-                       debug_line_oat_patches->GetBuffer());
-      builder->RegisterSection(debug_line_oat_patches.get());
+      // Generate CFI (stack unwinding information).
+      dwarf::WriteCFISection(builder.get(), method_infos, kCFIFormat);
+      // Write DWARF .debug_* sections.
+      dwarf::WriteDebugSections(builder.get(), method_infos);
     }
   }
 
   // Add relocation section for .text.
-  std::unique_ptr<RawSection> text_oat_patches(new RawSection(
-      ".text.oat_patches", SHT_OAT_PATCH));
   if (compiler_driver_->GetCompilerOptions().GetIncludePatchInformation()) {
     // Note that ElfWriter::Fixup will be called regardless and therefore
     // we need to include oat_patches for debug sections unconditionally.
-    EncodeOatPatches(oat_writer->GetAbsolutePatchLocations(),
-                     text_oat_patches->GetBuffer());
-    builder->RegisterSection(text_oat_patches.get());
+    builder->WritePatches(".text.oat_patches", &oat_writer->GetAbsolutePatchLocations());
   }
 
-  return builder->Write(elf_file_);
+  builder->End();
+
+  return builder->Good() && output_stream->Flush();
 }
 
 template <typename ElfTypes>
 static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writer) {
   const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetMethodDebugInfo();
   bool generated_mapping_symbol = false;
+  auto* strtab = builder->GetStrTab();
+  auto* symtab = builder->GetSymTab();
+
+  if (method_info.empty()) {
+    return;
+  }
 
   // Find all addresses (low_pc) which contain deduped methods.
   // The first instance of method is not marked deduped_, but the rest is.
@@ -264,7 +152,8 @@ static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writ
     }
   }
 
-  auto* symtab = builder->GetSymtab();
+  strtab->Start();
+  strtab->Write("");  // strtab should start with empty string.
   for (auto it = method_info.begin(); it != method_info.end(); ++it) {
     if (it->deduped_) {
       continue;  // Add symbol only for the first instance.
@@ -277,8 +166,8 @@ static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writ
     uint32_t low_pc = it->low_pc_;
     // Add in code delta, e.g., thumb bit 0 for Thumb2 code.
     low_pc += it->compiled_method_->CodeDelta();
-    symtab->AddSymbol(name, builder->GetText(), low_pc,
-                      true, it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
+    symtab->Add(strtab->Write(name), builder->GetText(), low_pc,
+                true, it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
 
     // Conforming to aaelf, add $t mapping symbol to indicate start of a sequence of thumb2
     // instructions, so that disassembler tools can correctly disassemble.
@@ -286,12 +175,19 @@ static void WriteDebugSymbols(ElfBuilder<ElfTypes>* builder, OatWriter* oat_writ
     // requires it to match function symbol.  Just address 0 does not work.
     if (it->compiled_method_->GetInstructionSet() == kThumb2) {
       if (!generated_mapping_symbol || !kGenerateSingleArmMappingSymbol) {
-        symtab->AddSymbol("$t", builder->GetText(), it->low_pc_ & ~1, true,
-                          0, STB_LOCAL, STT_NOTYPE);
+        symtab->Add(strtab->Write("$t"), builder->GetText(), it->low_pc_ & ~1,
+                    true, 0, STB_LOCAL, STT_NOTYPE);
         generated_mapping_symbol = true;
       }
     }
   }
+  strtab->End();
+
+  // Symbols are buffered and written after names (because they are smaller).
+  // We could also do two passes in this function to avoid the buffering.
+  symtab->Start();
+  symtab->Write();
+  symtab->End();
 }
 
 // Explicit instantiations
diff --git a/compiler/elf_writer_test.cc b/compiler/elf_writer_test.cc
index ccf34b816b..b413a9eb7b 100644
--- a/compiler/elf_writer_test.cc
+++ b/compiler/elf_writer_test.cc
@@ -21,6 +21,7 @@
 #include "common_compiler_test.h"
 #include "elf_file.h"
 #include "elf_file_impl.h"
+#include "elf_builder.h"
 #include "elf_writer_quick.h"
 #include "oat.h"
 #include "utils.h"
@@ -100,7 +101,7 @@ TEST_F(ElfWriterTest, EncodeDecodeOatPatches) {
 
     // Encode patch locations.
     std::vector<uint8_t> oat_patches;
-    ElfWriterQuick32::EncodeOatPatches(patch_locations, &oat_patches);
+    ElfBuilder<ElfTypes32>::EncodeOatPatches(patch_locations, &oat_patches);
 
     // Create buffer to be patched.
     std::vector<uint8_t> initial_data(256);
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index a38e1f54c0..6df15279a0 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -181,7 +181,7 @@ TEST_F(ImageTest, WriteRead) {
   ASSERT_TRUE(heap->HasImageSpace());
   ASSERT_TRUE(heap->GetNonMovingSpace()->IsMallocSpace());
 
-  gc::space::ImageSpace* image_space = heap->GetImageSpace();
+  gc::space::ImageSpace* image_space = heap->GetBootImageSpace();
   ASSERT_TRUE(image_space != nullptr);
   ASSERT_LE(image_space->Size(), image_file_size);
 
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 0c85323805..3d9e7e7cda 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -334,9 +334,9 @@ void ImageWriter::PrepareDexCacheArraySlots() {
   Thread* const self = Thread::Current();
   ReaderMutexLock mu(self, *class_linker->DexLock());
   uint32_t size = 0u;
-  for (jobject weak_root : class_linker->GetDexCaches()) {
+  for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
     mirror::DexCache* dex_cache =
-        down_cast<mirror::DexCache*>(self->DecodeJObject(weak_root));
+        down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
     if (dex_cache == nullptr || IsInBootImage(dex_cache)) {
       continue;
     }
@@ -372,9 +372,9 @@ void ImageWriter::AddMethodPointerArray(mirror::PointerArray* arr) {
   DCHECK(arr != nullptr);
   if (kIsDebugBuild) {
     for (size_t i = 0, len = arr->GetLength(); i < len; i++) {
-      auto* method = arr->GetElementPtrSize<ArtMethod*>(i, target_ptr_size_);
+      ArtMethod* method = arr->GetElementPtrSize<ArtMethod*>(i, target_ptr_size_);
       if (method != nullptr && !method->IsRuntimeMethod()) {
-        auto* klass = method->GetDeclaringClass();
+        mirror::Class* klass = method->GetDeclaringClass();
         CHECK(klass == nullptr || KeepClass(klass))
             << PrettyClass(klass) << " should be a kept class";
       }
@@ -514,7 +514,7 @@ bool ImageWriter::IsImageBinSlotAssigned(mirror::Object* object) const {
     size_t offset = lock_word.ForwardingAddress();
     BinSlot bin_slot(offset);
     DCHECK_LT(bin_slot.GetIndex(), bin_slot_sizes_[bin_slot.GetBin()])
-      << "bin slot offset should not exceed the size of that bin";
+        << "bin slot offset should not exceed the size of that bin";
   }
   return true;
 }
@@ -537,8 +537,13 @@ bool ImageWriter::AllocMemory() {
   const size_t length = RoundUp(image_objects_offset_begin_ + GetBinSizeSum() + intern_table_bytes_,
                                 kPageSize);
   std::string error_msg;
-  image_.reset(MemMap::MapAnonymous("image writer image", nullptr, length, PROT_READ | PROT_WRITE,
-                                    false, false, &error_msg));
+  image_.reset(MemMap::MapAnonymous("image writer image",
+                                    nullptr,
+                                    length,
+                                    PROT_READ | PROT_WRITE,
+                                    false,
+                                    false,
+                                    &error_msg));
   if (UNLIKELY(image_.get() == nullptr)) {
     LOG(ERROR) << "Failed to allocate memory for image file generation: " << error_msg;
     return false;
@@ -547,7 +552,9 @@ bool ImageWriter::AllocMemory() {
   // Create the image bitmap, only needs to cover mirror object section which is up to image_end_.
   CHECK_LE(image_end_, length);
   image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create(
-      "image bitmap", image_->Begin(), RoundUp(image_end_, kPageSize)));
+      "image bitmap",
+      image_->Begin(),
+      RoundUp(image_end_, kPageSize)));
   if (image_bitmap_.get() == nullptr) {
     LOG(ERROR) << "Failed to allocate memory for image bitmap";
     return false;
@@ -676,8 +683,8 @@ void ImageWriter::PruneNonImageClasses() {
   ScopedAssertNoThreadSuspension sa(self, __FUNCTION__);
   ReaderMutexLock mu(self, *Locks::classlinker_classes_lock_);  // For ClassInClassTable
   ReaderMutexLock mu2(self, *class_linker->DexLock());
-  for (jobject weak_root : class_linker->GetDexCaches()) {
-    mirror::DexCache* dex_cache = down_cast<mirror::DexCache*>(self->DecodeJObject(weak_root));
+  for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
+    mirror::DexCache* dex_cache = down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
     if (dex_cache == nullptr) {
       continue;
     }
@@ -799,8 +806,9 @@ ObjectArray<Object>* ImageWriter::CreateImageRoots() const {
   {
     ReaderMutexLock mu(self, *class_linker->DexLock());
     // Count number of dex caches not in the boot image.
-    for (jobject weak_root : class_linker->GetDexCaches()) {
-      mirror::DexCache* dex_cache = down_cast<mirror::DexCache*>(self->DecodeJObject(weak_root));
+    for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
+      mirror::DexCache* dex_cache =
+          down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
       dex_cache_count += IsInBootImage(dex_cache) ? 0u : 1u;
     }
   }
@@ -811,15 +819,17 @@ ObjectArray<Object>* ImageWriter::CreateImageRoots() const {
     ReaderMutexLock mu(self, *class_linker->DexLock());
     size_t non_image_dex_caches = 0;
     // Re-count number of non image dex caches.
-    for (jobject weak_root : class_linker->GetDexCaches()) {
-      mirror::DexCache* dex_cache = down_cast<mirror::DexCache*>(self->DecodeJObject(weak_root));
+    for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
+      mirror::DexCache* dex_cache =
+          down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
       non_image_dex_caches += IsInBootImage(dex_cache) ? 0u : 1u;
     }
     CHECK_EQ(dex_cache_count, non_image_dex_caches)
         << "The number of non-image dex caches changed.";
     size_t i = 0;
-    for (jobject weak_root : class_linker->GetDexCaches()) {
-      mirror::DexCache* dex_cache = down_cast<mirror::DexCache*>(self->DecodeJObject(weak_root));
+    for (const ClassLinker::DexCacheData& data : class_linker->GetDexCachesData()) {
+      mirror::DexCache* dex_cache =
+          down_cast<mirror::DexCache*>(self->DecodeJObject(data.weak_root));
       if (!IsInBootImage(dex_cache)) {
         dex_caches->Set<false>(i, dex_cache);
         ++i;
@@ -905,8 +915,8 @@ void ImageWriter::WalkFieldsInOrder(mirror::Object* obj) {
           size_t& offset = bin_slot_sizes_[kBinArtField];
           DCHECK(!IsInBootImage(cur_fields));
           native_object_relocations_.emplace(
-              cur_fields, NativeObjectRelocation {
-                  offset, kNativeObjectRelocationTypeArtFieldArray });
+              cur_fields,
+              NativeObjectRelocation {offset, kNativeObjectRelocationTypeArtFieldArray });
           offset += header_size;
           // Forward individual fields so that we can quickly find where they belong.
           for (size_t i = 0, count = cur_fields->size(); i < count; ++i) {
@@ -917,7 +927,8 @@ void ImageWriter::WalkFieldsInOrder(mirror::Object* obj) {
                 << " already assigned " << PrettyField(field) << " static=" << field->IsStatic();
             DCHECK(!IsInBootImage(field));
             native_object_relocations_.emplace(
-                field, NativeObjectRelocation {offset, kNativeObjectRelocationTypeArtField });
+                field,
+                NativeObjectRelocation {offset, kNativeObjectRelocationTypeArtField });
             offset += sizeof(ArtField);
           }
         }
@@ -940,8 +951,9 @@ void ImageWriter::WalkFieldsInOrder(mirror::Object* obj) {
           any_dirty = any_dirty || WillMethodBeDirty(&m);
           ++count;
         }
-        NativeObjectRelocationType type = any_dirty ? kNativeObjectRelocationTypeArtMethodDirty :
-            kNativeObjectRelocationTypeArtMethodClean;
+        NativeObjectRelocationType type = any_dirty
+            ? kNativeObjectRelocationTypeArtMethodDirty
+            : kNativeObjectRelocationTypeArtMethodClean;
         Bin bin_type = BinTypeForNativeRelocationType(type);
         // Forward the entire array at once, but header first.
         const size_t header_size = LengthPrefixedArray<ArtMethod>::ComputeSize(0,
@@ -1124,8 +1136,9 @@ void ImageWriter::CreateHeader(size_t oat_loaded_size, size_t oat_data_offset) {
   cur_pos = RoundUp(cur_pos, ArtMethod::Alignment(target_ptr_size_));
   // Add method section.
   auto* methods_section = &sections[ImageHeader::kSectionArtMethods];
-  *methods_section = ImageSection(cur_pos, bin_slot_sizes_[kBinArtMethodClean] +
-                                  bin_slot_sizes_[kBinArtMethodDirty]);
+  *methods_section = ImageSection(cur_pos,
+                                  bin_slot_sizes_[kBinArtMethodClean] +
+                                      bin_slot_sizes_[kBinArtMethodDirty]);
   CHECK_EQ(bin_slot_offsets_[kBinArtMethodClean], methods_section->Offset());
   cur_pos = methods_section->End();
   // Add dex cache arrays section.
@@ -1156,12 +1169,17 @@ void ImageWriter::CreateHeader(size_t oat_loaded_size, size_t oat_data_offset) {
   CHECK_EQ(AlignUp(image_begin_ + image_end, kPageSize), oat_file_begin) <<
       "Oat file should be right after the image.";
   // Create the header.
-  new (image_->Begin()) ImageHeader(
-      PointerToLowMemUInt32(image_begin_), image_end,
-      sections, image_roots_address_, oat_file_->GetOatHeader().GetChecksum(),
-      PointerToLowMemUInt32(oat_file_begin), PointerToLowMemUInt32(oat_data_begin_),
-      PointerToLowMemUInt32(oat_data_end), PointerToLowMemUInt32(oat_file_end), target_ptr_size_,
-      compile_pic_);
+  new (image_->Begin()) ImageHeader(PointerToLowMemUInt32(image_begin_),
+                                    image_end,
+                                    sections,
+                                    image_roots_address_,
+                                    oat_file_->GetOatHeader().GetChecksum(),
+                                    PointerToLowMemUInt32(oat_file_begin),
+                                    PointerToLowMemUInt32(oat_data_begin_),
+                                    PointerToLowMemUInt32(oat_data_end),
+                                    PointerToLowMemUInt32(oat_file_end),
+                                    target_ptr_size_,
+                                    compile_pic_);
 }
 
 ArtMethod* ImageWriter::GetImageMethodAddress(ArtMethod* method) {
@@ -1371,14 +1389,16 @@ class FixupVisitor {
     // Use SetFieldObjectWithoutWriteBarrier to avoid card marking since we are writing to the
     // image.
     copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>(
-        offset, image_writer_->GetImageAddress(ref));
+        offset,
+        image_writer_->GetImageAddress(ref));
   }
 
   // java.lang.ref.Reference visitor.
   void operator()(mirror::Class* klass ATTRIBUTE_UNUSED, mirror::Reference* ref) const
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) {
     copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>(
-        mirror::Reference::ReferentOffset(), image_writer_->GetImageAddress(ref->GetReferent()));
+        mirror::Reference::ReferentOffset(),
+        image_writer_->GetImageAddress(ref->GetReferent()));
   }
 
  protected:
@@ -1421,34 +1441,28 @@ T* ImageWriter::NativeLocationInImage(T* obj) {
       : reinterpret_cast<T*>(image_begin_ + NativeOffsetInImage(obj));
 }
 
-void ImageWriter::FixupClass(mirror::Class* orig, mirror::Class* copy) {
-  // Update the field arrays.
-  copy->SetSFieldsPtrUnchecked(NativeLocationInImage(orig->GetSFieldsPtr()));
-  copy->SetIFieldsPtrUnchecked(NativeLocationInImage(orig->GetIFieldsPtr()));
-  // Update direct and virtual method arrays.
-  copy->SetDirectMethodsPtrUnchecked(NativeLocationInImage(orig->GetDirectMethodsPtr()));
-  copy->SetVirtualMethodsPtr(NativeLocationInImage(orig->GetVirtualMethodsPtr()));
-  // Update dex cache strings.
-  copy->SetDexCacheStrings(NativeLocationInImage(orig->GetDexCacheStrings()));
-  // Fix up embedded tables.
-  if (!orig->IsTemp()) {
-    // TODO: Why do we have temp classes in some cases?
-    if (orig->ShouldHaveEmbeddedImtAndVTable()) {
-      for (int32_t i = 0; i < orig->GetEmbeddedVTableLength(); ++i) {
-        ArtMethod* orig_method = orig->GetEmbeddedVTableEntry(i, target_ptr_size_);
-        copy->SetEmbeddedVTableEntryUnchecked(
-            i,
-            NativeLocationInImage(orig_method),
-            target_ptr_size_);
-      }
-      for (size_t i = 0; i < mirror::Class::kImtSize; ++i) {
-        copy->SetEmbeddedImTableEntry(
-            i,
-            NativeLocationInImage(orig->GetEmbeddedImTableEntry(i, target_ptr_size_)),
-            target_ptr_size_);
-      }
-    }
+// Returns obj if null or in the boot image, else image_->Begin() + NativeOffsetInImage(obj).
+template <typename T>
+T* ImageWriter::NativeCopyLocation(T* obj) {
+  const bool keep_as_is = (obj == nullptr) || IsInBootImage(obj);
+  return keep_as_is ? obj : reinterpret_cast<T*>(image_->Begin() + NativeOffsetInImage(obj));
+}
+
+class NativeLocationVisitor {
+ public:
+  explicit NativeLocationVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {}
+  // Returns ImageWriter::NativeLocationInImage(ptr); FixupClass uses this functor.
+  template <typename T>
+  T* operator()(T* ptr) const {
+    return image_writer_->NativeLocationInImage(ptr);
   }
+
+ private:
+  ImageWriter* const image_writer_;
+};
+
+void ImageWriter::FixupClass(mirror::Class* orig, mirror::Class* copy) {
+  orig->FixupNativePointers(copy, target_ptr_size_, NativeLocationVisitor(this));
   FixupClassVisitor visitor(this, copy);
   static_cast<mirror::Object*>(orig)->VisitReferences(visitor, visitor);
 }
@@ -1508,6 +1522,21 @@ void ImageWriter::FixupObject(Object* orig, Object* copy) {
   }
 }
 
+// Returns ImageWriter::GetImageAddress(ptr); handed to the DexCache fixups in FixupDexCache.
+class ImageAddressVisitor {
+ public:
+  explicit ImageAddressVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {}
+
+  template <typename T>
+  T* operator()(T* ptr) const SHARED_REQUIRES(Locks::mutator_lock_) {
+    return image_writer_->GetImageAddress(ptr);
+  }
+
+ private:
+  ImageWriter* const image_writer_;
+};
+
+
 void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache,
                                 mirror::DexCache* copy_dex_cache) {
   // Though the DexCache array fields are usually treated as native pointers, we set the full
@@ -1516,52 +1545,39 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache,
   //     static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + offset))).
   GcRoot<mirror::String>* orig_strings = orig_dex_cache->GetStrings();
   if (orig_strings != nullptr) {
-    uintptr_t copy_strings_offset = NativeOffsetInImage(orig_strings);
-    copy_dex_cache->SetField64<false>(
-        mirror::DexCache::StringsOffset(),
-        static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + copy_strings_offset)));
-    GcRoot<mirror::String>* copy_strings =
-        reinterpret_cast<GcRoot<mirror::String>*>(image_->Begin() + copy_strings_offset);
-    for (size_t i = 0, num = orig_dex_cache->NumStrings(); i != num; ++i) {
-      copy_strings[i] = GcRoot<mirror::String>(GetImageAddress(orig_strings[i].Read()));
-    }
+    copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::StringsOffset(),
+                                               NativeLocationInImage(orig_strings),
+                                               /*pointer size*/8u);
+    orig_dex_cache->FixupStrings(NativeCopyLocation(orig_strings), ImageAddressVisitor(this));
   }
   GcRoot<mirror::Class>* orig_types = orig_dex_cache->GetResolvedTypes();
   if (orig_types != nullptr) {
-    uintptr_t copy_types_offset = NativeOffsetInImage(orig_types);
-    copy_dex_cache->SetField64<false>(
-        mirror::DexCache::ResolvedTypesOffset(),
-        static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + copy_types_offset)));
-    GcRoot<mirror::Class>* copy_types =
-        reinterpret_cast<GcRoot<mirror::Class>*>(image_->Begin() + copy_types_offset);
-    for (size_t i = 0, num = orig_dex_cache->NumResolvedTypes(); i != num; ++i) {
-      copy_types[i] = GcRoot<mirror::Class>(GetImageAddress(orig_types[i].Read()));
-    }
+    copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::ResolvedTypesOffset(),
+                                               NativeLocationInImage(orig_types),
+                                               /*pointer size*/8u);
+    orig_dex_cache->FixupResolvedTypes(NativeCopyLocation(orig_types), ImageAddressVisitor(this));
   }
   ArtMethod** orig_methods = orig_dex_cache->GetResolvedMethods();
   if (orig_methods != nullptr) {
-    uintptr_t copy_methods_offset = NativeOffsetInImage(orig_methods);
-    copy_dex_cache->SetField64<false>(
-        mirror::DexCache::ResolvedMethodsOffset(),
-        static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + copy_methods_offset)));
-    ArtMethod** copy_methods =
-        reinterpret_cast<ArtMethod**>(image_->Begin() + copy_methods_offset);
+    copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::ResolvedMethodsOffset(),
+                                               NativeLocationInImage(orig_methods),
+                                               /*pointer size*/8u);
+    ArtMethod** copy_methods = NativeCopyLocation(orig_methods);
     for (size_t i = 0, num = orig_dex_cache->NumResolvedMethods(); i != num; ++i) {
       ArtMethod* orig = mirror::DexCache::GetElementPtrSize(orig_methods, i, target_ptr_size_);
-      ArtMethod* copy = IsInBootImage(orig) ? orig : NativeLocationInImage(orig);
+      ArtMethod* copy = NativeLocationInImage(orig);
       mirror::DexCache::SetElementPtrSize(copy_methods, i, copy, target_ptr_size_);
     }
   }
   ArtField** orig_fields = orig_dex_cache->GetResolvedFields();
   if (orig_fields != nullptr) {
-    uintptr_t copy_fields_offset = NativeOffsetInImage(orig_fields);
-    copy_dex_cache->SetField64<false>(
-        mirror::DexCache::ResolvedFieldsOffset(),
-        static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + copy_fields_offset)));
-    ArtField** copy_fields = reinterpret_cast<ArtField**>(image_->Begin() + copy_fields_offset);
+    copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::ResolvedFieldsOffset(),
+                                               NativeLocationInImage(orig_fields),
+                                               /*pointer size*/8u);
+    ArtField** copy_fields = NativeCopyLocation(orig_fields);
     for (size_t i = 0, num = orig_dex_cache->NumResolvedFields(); i != num; ++i) {
       ArtField* orig = mirror::DexCache::GetElementPtrSize(orig_fields, i, target_ptr_size_);
-      ArtField* copy = IsInBootImage(orig) ? orig : NativeLocationInImage(orig);
+      ArtField* copy = NativeLocationInImage(orig);
       mirror::DexCache::SetElementPtrSize(copy_fields, i, copy, target_ptr_size_);
     }
   }
@@ -1572,7 +1588,7 @@ const uint8_t* ImageWriter::GetOatAddress(OatAddress type) const {
   // If we are compiling an app image, we need to use the stubs of the boot image.
   if (compile_app_image_) {
     // Use the current image pointers.
-    gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetImageSpace();
+    gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetBootImageSpace();
     DCHECK(image_space != nullptr);
     const OatFile* oat_file = image_space->GetOatFile();
     CHECK(oat_file != nullptr);
@@ -1604,7 +1620,7 @@ const uint8_t* ImageWriter::GetQuickCode(ArtMethod* method, bool* quick_is_inter
   DCHECK(!method->IsResolutionMethod()) << PrettyMethod(method);
   DCHECK(!method->IsImtConflictMethod()) << PrettyMethod(method);
   DCHECK(!method->IsImtUnimplementedMethod()) << PrettyMethod(method);
-  DCHECK(!method->IsAbstract()) << PrettyMethod(method);
+  DCHECK(method->IsInvokable()) << PrettyMethod(method);
   DCHECK(!IsInBootImage(method)) << PrettyMethod(method);
 
   // Use original code if it exists. Otherwise, set the code pointer to the resolution
@@ -1651,7 +1667,7 @@ const uint8_t* ImageWriter::GetQuickEntryPoint(ArtMethod* method) {
     // We assume all methods have code. If they don't currently then we set them to the use the
     // resolution trampoline. Abstract methods never have code and so we need to make sure their
     // use results in an AbstractMethodError. We use the interpreter to achieve this.
-    if (UNLIKELY(method->IsAbstract())) {
+    if (UNLIKELY(!method->IsInvokable())) {
       return GetOatAddress(kOatAddressQuickToInterpreterBridge);
     } else {
       bool quick_is_interpreted;
@@ -1697,7 +1713,7 @@ void ImageWriter::CopyAndFixupMethod(ArtMethod* orig, ArtMethod* copy) {
     // We assume all methods have code. If they don't currently then we set them to the use the
     // resolution trampoline. Abstract methods never have code and so we need to make sure their
     // use results in an AbstractMethodError. We use the interpreter to achieve this.
-    if (UNLIKELY(orig->IsAbstract())) {
+    if (UNLIKELY(!orig->IsInvokable())) {
       copy->SetEntryPointFromQuickCompiledCodePtrSize(
           GetOatAddress(kOatAddressQuickToInterpreterBridge), target_ptr_size_);
     } else {
@@ -1727,8 +1743,10 @@ static OatHeader* GetOatHeaderFromElf(ElfFile* elf) {
 
 void ImageWriter::SetOatChecksumFromElfFile(File* elf_file) {
   std::string error_msg;
-  std::unique_ptr<ElfFile> elf(ElfFile::Open(elf_file, PROT_READ|PROT_WRITE,
-                                             MAP_SHARED, &error_msg));
+  std::unique_ptr<ElfFile> elf(ElfFile::Open(elf_file,
+                                             PROT_READ | PROT_WRITE,
+                                             MAP_SHARED,
+                                             &error_msg));
   if (elf.get() == nullptr) {
     LOG(FATAL) << "Unable open oat file: " << error_msg;
     return;
@@ -1771,10 +1789,11 @@ uint32_t ImageWriter::BinSlot::GetIndex() const {
 
 uint8_t* ImageWriter::GetOatFileBegin() const {
   DCHECK_GT(intern_table_bytes_, 0u);
-  size_t native_sections_size =
-      bin_slot_sizes_[kBinArtField] + bin_slot_sizes_[kBinArtMethodDirty] +
-      bin_slot_sizes_[kBinArtMethodClean] + bin_slot_sizes_[kBinDexCacheArray] +
-      intern_table_bytes_;
+  size_t native_sections_size = bin_slot_sizes_[kBinArtField] +
+                                bin_slot_sizes_[kBinArtMethodDirty] +
+                                bin_slot_sizes_[kBinArtMethodClean] +
+                                bin_slot_sizes_[kBinDexCacheArray] +
+                                intern_table_bytes_;
   return image_begin_ + RoundUp(image_end_ + native_sections_size, kPageSize);
 }
 
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 120de97620..22cb91a56d 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -308,8 +308,11 @@ class ImageWriter FINAL {
       SHARED_REQUIRES(Locks::mutator_lock_);
   void FixupDexCache(mirror::DexCache* orig_dex_cache, mirror::DexCache* copy_dex_cache)
       SHARED_REQUIRES(Locks::mutator_lock_);
-  void FixupPointerArray(mirror::Object* dst, mirror::PointerArray* arr, mirror::Class* klass,
-                         Bin array_type) SHARED_REQUIRES(Locks::mutator_lock_);
+  void FixupPointerArray(mirror::Object* dst,
+                         mirror::PointerArray* arr,
+                         mirror::Class* klass,
+                         Bin array_type)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Get quick code for non-resolution/imt_conflict/abstract method.
   const uint8_t* GetQuickCode(ArtMethod* method, bool* quick_is_interpreted)
@@ -331,8 +334,12 @@ class ImageWriter FINAL {
   void AssignMethodOffset(ArtMethod* method, NativeObjectRelocationType type)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Return true if klass is loaded by the boot class loader but not in the boot image.
   bool IsBootClassLoaderNonImageClass(mirror::Class* klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Return true if klass depends on a boot class loader non image class. We want to prune
+  // these classes since we do not want any boot class loader classes in the image. This means that
+  // we also cannot have any classes which refer to these boot class loader non image classes.
   bool ContainsBootClassLoaderNonImageClass(mirror::Class* klass)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
@@ -340,9 +347,14 @@ class ImageWriter FINAL {
 
   uintptr_t NativeOffsetInImage(void* obj);
 
+  // Location of where the object will be when the image is loaded at runtime.
   template <typename T>
   T* NativeLocationInImage(T* obj);
 
+  // Location of where the temporary copy of the object currently is.
+  template <typename T>
+  T* NativeCopyLocation(T* obj);
+
   // Return true of obj is inside of the boot image space. This may only return true if we are
   // compiling an app image.
   bool IsInBootImage(const void* obj) const;
@@ -394,7 +406,7 @@ class ImageWriter FINAL {
   const bool compile_pic_;
   const bool compile_app_image_;
 
-  // Boot image space for fast lookups.
+  // Cache the boot image space in this class for faster lookups.
   gc::space::ImageSpace* boot_image_space_;
 
   // Size of pointers on the target architecture.
@@ -432,13 +444,14 @@ class ImageWriter FINAL {
   uint64_t dirty_methods_;
   uint64_t clean_methods_;
 
-  // Prune class memoization table.
+  // Prune class memoization table to speed up ContainsBootClassLoaderNonImageClass.
   std::unordered_map<mirror::Class*, bool> prune_class_memo_;
 
   friend class ContainsBootClassLoaderNonImageClassVisitor;
   friend class FixupClassVisitor;
   friend class FixupRootVisitor;
   friend class FixupVisitor;
+  friend class NativeLocationVisitor;
   friend class NonImageClassesVisitor;
   DISALLOW_COPY_AND_ASSIGN(ImageWriter);
 };
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index 5f4f47292b..2125c9a26a 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -177,7 +177,8 @@ bool JitCompiler::CompileMethod(Thread* self, ArtMethod* method) {
   }
 
   // Don't compile the method if we are supposed to be deoptimized.
-  if (runtime->GetInstrumentation()->AreAllMethodsDeoptimized()) {
+  instrumentation::Instrumentation* instrumentation = runtime->GetInstrumentation();
+  if (instrumentation->AreAllMethodsDeoptimized() || instrumentation->IsDeoptimized(method)) {
     return false;
   }
 
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 16f641ab56..030451c1cb 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -232,7 +232,7 @@ TEST_F(OatTest, OatHeaderSizeCheck) {
   EXPECT_EQ(72U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(28U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(113 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(114 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 3f2271ef11..40a3f14f93 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -899,7 +899,7 @@ class OatWriter::WriteCodeMethodVisitor : public OatDexMethodVisitor {
       // NOTE: We're using linker patches for app->boot references when the image can
       // be relocated and therefore we need to emit .oat_patches. We're not using this
       // for app->app references, so check that the method is an image method.
-      gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetImageSpace();
+      gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetBootImageSpace();
       size_t method_offset = reinterpret_cast<const uint8_t*>(method) - image_space->Begin();
       CHECK(image_space->GetImageHeader().GetMethodsSection().Contains(method_offset));
     }
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index ed193c7b61..3257de1858 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -359,18 +359,10 @@ void HGraphBuilder::InsertTryBoundaryBlocks(const DexFile::CodeItem& code_item)
           // need a strategy for splitting exceptional edges. We split the block
           // after the move-exception (if present) and mark the first part not
           // throwing. The normal-flow edge between them will be split later.
-          HInstruction* first_insn = block->GetFirstInstruction();
-          if (first_insn->IsLoadException()) {
-            // Catch block starts with a LoadException. Split the block after
-            // the StoreLocal and ClearException which must come after the load.
-            DCHECK(first_insn->GetNext()->IsStoreLocal());
-            DCHECK(first_insn->GetNext()->GetNext()->IsClearException());
-            throwing_block = block->SplitBefore(first_insn->GetNext()->GetNext()->GetNext());
-          } else {
-            // Catch block does not load the exception. Split at the beginning
-            // to create an empty catch block.
-            throwing_block = block->SplitBefore(first_insn);
-          }
+          throwing_block = block->SplitCatchBlockAfterMoveException();
+          // Move-exception does not throw and the block has throwing instructions
+          // so it must have been possible to split it.
+          DCHECK(throwing_block != nullptr);
         }
 
         try_block_info.Put(throwing_block->GetBlockId(),
@@ -1006,7 +998,9 @@ bool HGraphBuilder::SetupInvokeArguments(HInvoke* invoke,
     return false;
   }
 
-  if (invoke->IsInvokeStaticOrDirect()) {
+  if (invoke->IsInvokeStaticOrDirect() &&
+      HInvokeStaticOrDirect::NeedsCurrentMethodInput(
+          invoke->AsInvokeStaticOrDirect()->GetMethodLoadKind())) {
     invoke->SetArgumentAt(*argument_index, graph_->GetCurrentMethod());
     (*argument_index)++;
   }
@@ -1455,7 +1449,8 @@ void HGraphBuilder::BuildFilledNewArray(uint32_t dex_pc,
                                         uint32_t* args,
                                         uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
-  QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+  bool finalizable;
+  QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index, &finalizable)
       ? kQuickAllocArrayWithAccessCheck
       : kQuickAllocArray;
   HInstruction* object = new (arena_) HNewArray(length,
@@ -1635,9 +1630,9 @@ void HGraphBuilder::BuildTypeCheck(const Instruction& instruction,
   }
 }
 
-bool HGraphBuilder::NeedsAccessCheck(uint32_t type_index) const {
+bool HGraphBuilder::NeedsAccessCheck(uint32_t type_index, bool* finalizable) const {
   return !compiler_driver_->CanAccessInstantiableTypeWithoutChecks(
-      dex_compilation_unit_->GetDexMethodIndex(), *dex_file_, type_index);
+      dex_compilation_unit_->GetDexMethodIndex(), *dex_file_, type_index, finalizable);
 }
 
 void HGraphBuilder::BuildSwitchJumpTable(const SwitchTable& table,
@@ -2514,7 +2509,9 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, uint32
         current_block_->AddInstruction(fake_string);
         UpdateLocal(register_index, fake_string, dex_pc);
       } else {
-        QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+        bool finalizable;
+        bool can_throw = NeedsAccessCheck(type_index, &finalizable);
+        QuickEntrypointEnum entrypoint = can_throw
             ? kQuickAllocObjectWithAccessCheck
             : kQuickAllocObject;
 
@@ -2523,6 +2520,8 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, uint32
             dex_pc,
             type_index,
             *dex_compilation_unit_->GetDexFile(),
+            can_throw,
+            finalizable,
             entrypoint));
         UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction(), dex_pc);
       }
@@ -2532,7 +2531,8 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, uint32
     case Instruction::NEW_ARRAY: {
       uint16_t type_index = instruction.VRegC_22c();
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt, dex_pc);
-      QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+      bool finalizable;
+      QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index, &finalizable)
           ? kQuickAllocArrayWithAccessCheck
           : kQuickAllocArray;
       current_block_->AddInstruction(new (arena_) HNewArray(length,
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 9eaa4b62c5..f857ef0e12 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -138,7 +138,10 @@ class HGraphBuilder : public ValueObject {
   HInstruction* LoadLocal(uint32_t register_index, Primitive::Type type, uint32_t dex_pc) const;
   void PotentiallyAddSuspendCheck(HBasicBlock* target, uint32_t dex_pc);
   void InitializeParameters(uint16_t number_of_parameters);
-  bool NeedsAccessCheck(uint32_t type_index) const;
+
+  // Returns whether the current method needs access check for the type.
+  // Output parameter finalizable is set to whether the type is finalizable.
+  bool NeedsAccessCheck(uint32_t type_index, /*out*/bool* finalizable) const;
 
   template<typename T>
   void Unop_12x(const Instruction& instruction, Primitive::Type type, uint32_t dex_pc);
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index ce92470868..0baa0e30dc 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -208,6 +208,7 @@ class DisassemblyScope {
 void CodeGenerator::GenerateSlowPaths() {
   size_t code_start = 0;
   for (SlowPathCode* slow_path : slow_paths_) {
+    current_slow_path_ = slow_path;
     if (disasm_info_ != nullptr) {
       code_start = GetAssembler()->CodeSize();
     }
@@ -216,6 +217,7 @@ void CodeGenerator::GenerateSlowPaths() {
       disasm_info_->AddSlowPathInterval(slow_path, code_start, GetAssembler()->CodeSize());
     }
   }
+  current_slow_path_ = nullptr;
 }
 
 void CodeGenerator::CompileInternal(CodeAllocator* allocator, bool is_baseline) {
@@ -308,7 +310,7 @@ size_t CodeGenerator::FindTwoFreeConsecutiveAlignedEntries(bool* array, size_t l
 
 void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots,
                                              size_t maximum_number_of_live_core_registers,
-                                             size_t maximum_number_of_live_fp_registers,
+                                             size_t maximum_number_of_live_fpu_registers,
                                              size_t number_of_out_slots,
                                              const ArenaVector<HBasicBlock*>& block_order) {
   block_order_ = &block_order;
@@ -322,14 +324,14 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots,
       && IsLeafMethod()
       && !RequiresCurrentMethod()) {
     DCHECK_EQ(maximum_number_of_live_core_registers, 0u);
-    DCHECK_EQ(maximum_number_of_live_fp_registers, 0u);
+    DCHECK_EQ(maximum_number_of_live_fpu_registers, 0u);
     SetFrameSize(CallPushesPC() ? GetWordSize() : 0);
   } else {
     SetFrameSize(RoundUp(
         number_of_spill_slots * kVRegSize
         + number_of_out_slots * kVRegSize
         + maximum_number_of_live_core_registers * GetWordSize()
-        + maximum_number_of_live_fp_registers * GetFloatingPointSpillSlotSize()
+        + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize()
         + FrameEntrySpillSize(),
         kStackAlignment));
   }
@@ -381,11 +383,11 @@ void CodeGenerator::CreateCommonInvokeLocationSummary(
     HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
     switch (call->GetMethodLoadKind()) {
       case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-        locations->SetInAt(call->GetCurrentMethodInputIndex(), visitor->GetMethodLocation());
+        locations->SetInAt(call->GetSpecialInputIndex(), visitor->GetMethodLocation());
         break;
       case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod:
         locations->AddTemp(visitor->GetMethodLocation());
-        locations->SetInAt(call->GetCurrentMethodInputIndex(), Location::RequiresRegister());
+        locations->SetInAt(call->GetSpecialInputIndex(), Location::RequiresRegister());
         break;
       default:
         locations->AddTemp(visitor->GetMethodLocation());
@@ -545,15 +547,19 @@ void CodeGenerator::GenerateUnresolvedFieldAccess(
   }
 }
 
+// TODO: Remove argument `code_generator_supports_read_barrier` when
+// all code generators have read barrier support.
 void CodeGenerator::CreateLoadClassLocationSummary(HLoadClass* cls,
                                                    Location runtime_type_index_location,
-                                                   Location runtime_return_location) {
+                                                   Location runtime_return_location,
+                                                   bool code_generator_supports_read_barrier) {
   ArenaAllocator* allocator = cls->GetBlock()->GetGraph()->GetArena();
   LocationSummary::CallKind call_kind = cls->NeedsAccessCheck()
       ? LocationSummary::kCall
-      : (cls->CanCallRuntime()
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall);
+      : (((code_generator_supports_read_barrier && kEmitCompilerReadBarrier) ||
+          cls->CanCallRuntime())
+            ? LocationSummary::kCallOnSlowPath
+            : LocationSummary::kNoCall);
   LocationSummary* locations = new (allocator) LocationSummary(cls, call_kind);
   if (cls->NeedsAccessCheck()) {
     locations->SetInAt(0, Location::NoLocation());
@@ -1318,21 +1324,38 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod
   // coherent with the runtime call generated, and that the GC side effect is
   // set when required.
   if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall()) << instruction->DebugName();
+    DCHECK(instruction->GetLocations()->WillCall())
+        << "instruction->DebugName()=" << instruction->DebugName();
     DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()))
-        << instruction->DebugName() << instruction->GetSideEffects().ToString();
+        << "instruction->DebugName()=" << instruction->DebugName()
+        << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString();
   } else {
     DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal())
-        << instruction->DebugName() << slow_path->GetDescription();
+        << "instruction->DebugName()=" << instruction->DebugName()
+        << " slow_path->GetDescription()=" << slow_path->GetDescription();
     DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()) ||
            // Control flow would not come back into the code if a fatal slow
            // path is taken, so we do not care if it triggers GC.
            slow_path->IsFatal() ||
            // HDeoptimize is a special case: we know we are not coming back from
            // it into the code.
-           instruction->IsDeoptimize())
-        << instruction->DebugName() << instruction->GetSideEffects().ToString()
-        << slow_path->GetDescription();
+           instruction->IsDeoptimize() ||
+           // When read barriers are enabled, some instructions use a
+           // slow path to emit a read barrier, which does not trigger
+           // GC, is not fatal, nor is emitted by HDeoptimize
+           // instructions.
+           (kEmitCompilerReadBarrier &&
+            (instruction->IsInstanceFieldGet() ||
+             instruction->IsStaticFieldGet() ||
+             instruction->IsArraySet() ||
+             instruction->IsArrayGet() ||
+             instruction->IsLoadClass() ||
+             instruction->IsLoadString() ||
+             instruction->IsInstanceOf() ||
+             instruction->IsCheckCast())))
+        << "instruction->DebugName()=" << instruction->DebugName()
+        << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString()
+        << " slow_path->GetDescription()=" << slow_path->GetDescription();
   }
 
   // Check the coherency of leaf information.
@@ -1344,11 +1367,12 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod
 }
 
 void SlowPathCode::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
-  RegisterSet* register_set = locations->GetLiveRegisters();
+  RegisterSet* live_registers = locations->GetLiveRegisters();
   size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath();
+
   for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
     if (!codegen->IsCoreCalleeSaveRegister(i)) {
-      if (register_set->ContainsCoreRegister(i)) {
+      if (live_registers->ContainsCoreRegister(i)) {
         // If the register holds an object, update the stack mask.
         if (locations->RegisterContainsObject(i)) {
           locations->SetStackBit(stack_offset / kVRegSize);
@@ -1363,7 +1387,7 @@ void SlowPathCode::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* lo
 
   for (size_t i = 0, e = codegen->GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (!codegen->IsFloatingPointCalleeSaveRegister(i)) {
-      if (register_set->ContainsFloatingPointRegister(i)) {
+      if (live_registers->ContainsFloatingPointRegister(i)) {
         DCHECK_LT(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
         DCHECK_LT(i, kMaximumNumberOfExpectedRegisters);
         saved_fpu_stack_offsets_[i] = stack_offset;
@@ -1374,12 +1398,14 @@ void SlowPathCode::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* lo
 }
 
 void SlowPathCode::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
-  RegisterSet* register_set = locations->GetLiveRegisters();
+  RegisterSet* live_registers = locations->GetLiveRegisters();
   size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath();
+
   for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
     if (!codegen->IsCoreCalleeSaveRegister(i)) {
-      if (register_set->ContainsCoreRegister(i)) {
+      if (live_registers->ContainsCoreRegister(i)) {
         DCHECK_LT(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
+        DCHECK_LT(i, kMaximumNumberOfExpectedRegisters);
         stack_offset += codegen->RestoreCoreRegister(stack_offset, i);
       }
     }
@@ -1387,8 +1413,9 @@ void SlowPathCode::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary*
 
   for (size_t i = 0, e = codegen->GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (!codegen->IsFloatingPointCalleeSaveRegister(i)) {
-      if (register_set->ContainsFloatingPointRegister(i)) {
+      if (live_registers->ContainsFloatingPointRegister(i)) {
         DCHECK_LT(stack_offset, codegen->GetFrameSize() - codegen->FrameEntrySpillSize());
+        DCHECK_LT(i, kMaximumNumberOfExpectedRegisters);
         stack_offset += codegen->RestoreFloatingPointRegister(stack_offset, i);
       }
     }
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index a92014dc79..114d97be94 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -201,7 +201,7 @@ class CodeGenerator {
   virtual uintptr_t GetAddressOf(HBasicBlock* block) const = 0;
   void InitializeCodeGeneration(size_t number_of_spill_slots,
                                 size_t maximum_number_of_live_core_registers,
-                                size_t maximum_number_of_live_fp_registers,
+                                size_t maximum_number_of_live_fpu_registers,
                                 size_t number_of_out_slots,
                                 const ArenaVector<HBasicBlock*>& block_order);
   int32_t GetStackSlot(HLocal* local) const;
@@ -250,6 +250,15 @@ class CodeGenerator {
   // Returns whether we should split long moves in parallel moves.
   virtual bool ShouldSplitLongMoves() const { return false; }
 
+  size_t GetNumberOfCoreCalleeSaveRegisters() const {
+    return POPCOUNT(core_callee_save_mask_);
+  }
+
+  size_t GetNumberOfCoreCallerSaveRegisters() const {
+    DCHECK_GE(GetNumberOfCoreRegisters(), GetNumberOfCoreCalleeSaveRegisters());
+    return GetNumberOfCoreRegisters() - GetNumberOfCoreCalleeSaveRegisters();
+  }
+
   bool IsCoreCalleeSaveRegister(int reg) const {
     return (core_callee_save_mask_ & (1 << reg)) != 0;
   }
@@ -416,7 +425,8 @@ class CodeGenerator {
   // TODO: This overlaps a bit with MoveFromReturnRegister. Refactor for a better design.
   static void CreateLoadClassLocationSummary(HLoadClass* cls,
                                              Location runtime_type_index_location,
-                                             Location runtime_return_location);
+                                             Location runtime_return_location,
+                                             bool code_generator_supports_read_barrier = false);
 
   static void CreateSystemArrayCopyLocationSummary(HInvoke* invoke);
 
@@ -490,6 +500,7 @@ class CodeGenerator {
         compiler_options_(compiler_options),
         src_map_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         slow_paths_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        current_slow_path_(nullptr),
         current_block_index_(0),
         is_leaf_(true),
         requires_current_method_(false) {
@@ -557,6 +568,10 @@ class CodeGenerator {
     return raw_pointer_to_labels_array + block->GetBlockId();
   }
 
+  SlowPathCode* GetCurrentSlowPath() {
+    return current_slow_path_;
+  }
+
   // Frame size required for this method.
   uint32_t frame_size_;
   uint32_t core_spill_mask_;
@@ -605,6 +620,9 @@ class CodeGenerator {
   ArenaVector<SrcMapElem> src_map_;
   ArenaVector<SlowPathCode*> slow_paths_;
 
+  // The current slow path that we're generating code for.
+  SlowPathCode* current_slow_path_;
+
   // The current block index in `block_order_` of the block
   // we are generating code for.
   size_t current_block_index_;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 6d05293277..cb6bed08ec 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -34,6 +34,9 @@
 
 namespace art {
 
+template<class MirrorType>
+class GcRoot;
+
 namespace arm {
 
 static bool ExpectedPairLayout(Location location) {
@@ -286,15 +289,6 @@ class TypeCheckSlowPathARM : public SlowPathCode {
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
 
-    if (instruction_->IsCheckCast()) {
-      // The codegen for the instruction overwrites `temp`, so put it back in place.
-      Register obj = locations->InAt(0).AsRegister<Register>();
-      Register temp = locations->GetTemp(0).AsRegister<Register>();
-      uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-    }
-
     if (!is_fatal_) {
       SaveLiveRegisters(codegen, locations);
     }
@@ -315,6 +309,8 @@ class TypeCheckSlowPathARM : public SlowPathCode {
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
       arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
     } else {
       DCHECK(instruction_->IsCheckCast());
@@ -322,6 +318,7 @@ class TypeCheckSlowPathARM : public SlowPathCode {
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<kQuickCheckCast, void, const mirror::Class*, const mirror::Class*>();
     }
 
     if (!is_fatal_) {
@@ -408,6 +405,221 @@ class ArraySetSlowPathARM : public SlowPathCode {
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM);
 };
 
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode {
+ public:
+  ReadBarrierForHeapReferenceSlowPathARM(HInstruction* instruction,
+                                         Location out,
+                                         Location ref,
+                                         Location obj,
+                                         uint32_t offset,
+                                         Location index)
+      : instruction_(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial object
+    // has been overwritten by (or after) the heap object reference load
+    // to be instrumented, e.g.:
+    //
+    //   __ LoadFromOffset(kLoadWord, out, out, offset);
+    //   codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!instruction_->IsInvoke() ||
+           (instruction_->IsInvokeStaticOrDirect() &&
+            instruction_->GetLocations()->Intrinsified()));
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like other "inputs" of this slow path),
+    // introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        Register index_reg = index_.AsRegister<Register>();
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg));
+        if (codegen->IsCoreCalleeSaveRegister(index_reg)) {
+          // We are about to change the value of `index_reg` (see the
+          // calls to art::arm::Thumb2Assembler::Lsl and
+          // art::arm::Thumb2Assembler::AddConstant below), but it has
+          // not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          Register free_reg = FindAvailableCallerSaveRegister(codegen);
+          __ Mov(free_reg, index_reg);
+          index_reg = free_reg;
+          index = Location::RegisterLocation(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the scale
+        // factor (2) cannot overflow in practice, as the runtime is
+        // unable to allocate object arrays with a size larger than
+        // 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ Lsl(index_reg, index_reg, TIMES_4);
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ AddConstant(index_reg, index_reg, offset_);
+      } else {
+        DCHECK(instruction_->IsInvoke());
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegisterPair());
+        // UnsafeGet's offset location is a register pair, the low
+        // part contains the correct offset.
+        index = index_.ToLow();
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            Location::RegisterLocation(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ LoadImmediate(calling_convention.GetRegisterAt(2), offset_);
+    }
+    arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    arm_codegen->Move32(out_, Location::RegisterLocation(R0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ b(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathARM"; }
+
+ private:
+  Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<int>(ref_.AsRegister<Register>());
+    size_t obj = static_cast<int>(obj_.AsRegister<Register>());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return static_cast<Register>(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on ARM
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathARM);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathARM : public SlowPathCode {
+ public:
+  ReadBarrierForRootSlowPathARM(HInstruction* instruction, Location out, Location root)
+      : instruction_(instruction), out_(out), root_(root) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString());
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConvention calling_convention;
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_);
+    arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierForRootSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    arm_codegen->Move32(out_, Location::RegisterLocation(R0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ b(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathARM"; }
+
+ private:
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location root_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARM);
+};
+
 #undef __
 #define __ down_cast<ArmAssembler*>(GetAssembler())->
 
@@ -581,7 +793,7 @@ Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const {
       LOG(FATAL) << "Unreachable type " << type;
   }
 
-  return Location();
+  return Location::NoLocation();
 }
 
 void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline) const {
@@ -820,7 +1032,7 @@ Location InvokeDexCallingConventionVisitorARM::GetNextLocation(Primitive::Type t
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
   }
-  return Location();
+  return Location::NoLocation();
 }
 
 Location InvokeDexCallingConventionVisitorARM::GetReturnLocation(Primitive::Type type) const {
@@ -847,7 +1059,7 @@ Location InvokeDexCallingConventionVisitorARM::GetReturnLocation(Primitive::Type
     }
 
     case Primitive::kPrimVoid:
-      return Location();
+      return Location::NoLocation();
   }
 
   UNREACHABLE();
@@ -1240,26 +1452,19 @@ void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond,
   __ b(true_label, final_condition);
 }
 
-void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HIf* if_instr,
-                                                               HCondition* condition,
-                                                               Label* true_target,
-                                                               Label* false_target,
-                                                               Label* always_true_target) {
+void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condition,
+                                                               Label* true_target_in,
+                                                               Label* false_target_in) {
+  // Generated branching requires both targets to be explicit. If either of the
+  // targets is nullptr (fallthrough) use and bind `fallthrough_target` instead.
+  Label fallthrough_target;
+  Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in;
+  Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in;
+
   LocationSummary* locations = condition->GetLocations();
   Location left = locations->InAt(0);
   Location right = locations->InAt(1);
 
-  // We don't want true_target as a nullptr.
-  if (true_target == nullptr) {
-    true_target = always_true_target;
-  }
-  bool falls_through = (false_target == nullptr);
-
-  // FP compares don't like null false_targets.
-  if (false_target == nullptr) {
-    false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  }
-
   Primitive::Type type = condition->InputAt(0)->GetType();
   switch (type) {
     case Primitive::kPrimLong:
@@ -1278,117 +1483,125 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HIf* if_instr,
       LOG(FATAL) << "Unexpected compare type " << type;
   }
 
-  if (!falls_through) {
+  if (false_target != &fallthrough_target) {
     __ b(false_target);
   }
+
+  if (fallthrough_target.IsLinked()) {
+    __ Bind(&fallthrough_target);
+  }
 }
 
 void InstructionCodeGeneratorARM::GenerateTestAndBranch(HInstruction* instruction,
+                                                        size_t condition_input_index,
                                                         Label* true_target,
-                                                        Label* false_target,
-                                                        Label* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  if (cond->IsIntConstant()) {
+                                                        Label* false_target) {
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
     // Constant condition, statically compared against 1.
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ b(always_true_target);
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ b(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ b(false_target);
+      }
+    }
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
+    // Condition has been materialized, compare the output to 0.
+    Location cond_val = instruction->GetLocations()->InAt(condition_input_index);
+    DCHECK(cond_val.IsRegister());
+    if (true_target == nullptr) {
+      __ CompareAndBranchIfZero(cond_val.AsRegister<Register>(), false_target);
+    } else {
+      __ CompareAndBranchIfNonZero(cond_val.AsRegister<Register>(), true_target);
     }
   } else {
-    // Can we optimize the jump if we know that the next block is the true case?
+    // Condition has not been materialized. Use its inputs as the comparison and
+    // its condition as the branch condition.
     HCondition* condition = cond->AsCondition();
-    bool can_jump_to_false = CanReverseCondition(always_true_target, false_target, condition);
-    if (condition == nullptr || condition->NeedsMaterialization()) {
-      // Condition has been materialized, compare the output to 0.
-      DCHECK(instruction->GetLocations()->InAt(0).IsRegister());
-      if (can_jump_to_false) {
-        __ CompareAndBranchIfZero(instruction->GetLocations()->InAt(0).AsRegister<Register>(),
-                                  false_target);
-        return;
-      }
-      __ CompareAndBranchIfNonZero(instruction->GetLocations()->InAt(0).AsRegister<Register>(),
-                                   true_target);
-    } else {
-      // Condition has not been materialized, use its inputs as the
-      // comparison and its condition as the branch condition.
-      Primitive::Type type = (condition != nullptr)
-          ? cond->InputAt(0)->GetType()
-          : Primitive::kPrimInt;
-      // Is this a long or FP comparison that has been folded into the HCondition?
-      if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
-        // Generate the comparison directly.
-        GenerateCompareTestAndBranch(instruction->AsIf(), condition,
-                                     true_target, false_target, always_true_target);
-        return;
-      }
 
-      LocationSummary* locations = cond->GetLocations();
-      DCHECK(locations->InAt(0).IsRegister()) << locations->InAt(0);
-      Register left = locations->InAt(0).AsRegister<Register>();
-      Location right = locations->InAt(1);
-      if (right.IsRegister()) {
-        __ cmp(left, ShifterOperand(right.AsRegister<Register>()));
-      } else {
-        DCHECK(right.IsConstant());
-        GenerateCompareWithImmediate(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
-      }
-      if (can_jump_to_false) {
-        __ b(false_target, ARMCondition(condition->GetOppositeCondition()));
-        return;
-      }
+    // If this is a long or FP comparison that has been folded into
+    // the HCondition, generate the comparison directly.
+    Primitive::Type type = condition->InputAt(0)->GetType();
+    if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
+      GenerateCompareTestAndBranch(condition, true_target, false_target);
+      return;
+    }
 
+    LocationSummary* locations = cond->GetLocations();
+    DCHECK(locations->InAt(0).IsRegister());
+    Register left = locations->InAt(0).AsRegister<Register>();
+    Location right = locations->InAt(1);
+    if (right.IsRegister()) {
+      __ cmp(left, ShifterOperand(right.AsRegister<Register>()));
+    } else {
+      DCHECK(right.IsConstant());
+      GenerateCompareWithImmediate(left, CodeGenerator::GetInt32ValueOf(right.GetConstant()));
+    }
+    if (true_target == nullptr) {
+      __ b(false_target, ARMCondition(condition->GetOppositeCondition()));
+    } else {
       __ b(true_target, ARMCondition(condition->GetCondition()));
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted (case 2) and we need to emit a jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ b(false_target);
   }
 }
 
 void LocationsBuilderARM::VisitIf(HIf* if_instr) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM::VisitIf(HIf* if_instr) {
-  Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  Label* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);
+  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderARM::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM::VisitDeoptimize(HDeoptimize* deoptimize) {
-  SlowPathCode* slow_path = new (GetGraph()->GetArena())
-      DeoptimizationSlowPathARM(deoptimize);
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) DeoptimizationSlowPathARM(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  Label* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderARM::VisitCondition(HCondition* cond) {
@@ -1761,29 +1974,39 @@ void LocationsBuilderARM::VisitInvokeInterface(HInvokeInterface* invoke) {
 
 void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
+  LocationSummary* locations = invoke->GetLocations();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  Register hidden_reg = locations->GetTemp(1).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset(
       invoke->GetImtIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value();
-  LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
 
-  // Set the hidden argument.
-  __ LoadImmediate(invoke->GetLocations()->GetTemp(1).AsRegister<Register>(),
-                   invoke->GetDexMethodIndex());
+  // Set the hidden argument. It is safe to do this here, as R12
+  // won't be modified thereafter, before the `blx` (call) instruction.
+  DCHECK_EQ(R12, hidden_reg);
+  __ LoadImmediate(hidden_reg, invoke->GetDexMethodIndex());
 
-  // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ LoadFromOffset(kLoadWord, temp, SP, receiver.GetStackIndex());
+    // /* HeapReference<Class> */ temp = temp->klass_
     __ LoadFromOffset(kLoadWord, temp, temp, class_offset);
   } else {
+    // /* HeapReference<Class> */ temp = receiver->klass_
     __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
   codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetImtEntryAt(method_offset);
-  uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kArmWordSize).Int32Value();
+  uint32_t entry_point =
+      ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value();
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   // LR = temp->GetEntryPoint();
   __ LoadFromOffset(kLoadWord, LR, temp, entry_point);
@@ -2694,7 +2917,7 @@ void LocationsBuilderARM::VisitDiv(HDiv* div) {
     case Primitive::kPrimInt: {
       if (div->InputAt(1)->IsConstant()) {
         locations->SetInAt(0, Location::RequiresRegister());
-        locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
+        locations->SetInAt(1, Location::ConstantLocation(div->InputAt(1)->AsConstant()));
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
         int32_t abs_imm = std::abs(div->InputAt(1)->AsIntConstant()->GetValue());
         if (abs_imm <= 1) {
@@ -2818,7 +3041,7 @@ void LocationsBuilderARM::VisitRem(HRem* rem) {
     case Primitive::kPrimInt: {
       if (rem->InputAt(1)->IsConstant()) {
         locations->SetInAt(0, Location::RequiresRegister());
-        locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
+        locations->SetInAt(1, Location::ConstantLocation(rem->InputAt(1)->AsConstant()));
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
         int32_t abs_imm = std::abs(rem->InputAt(1)->AsIntConstant()->GetValue());
         if (abs_imm <= 1) {
@@ -2989,17 +3212,29 @@ void LocationsBuilderARM::HandleShift(HBinaryOperation* op) {
   switch (op->GetResultType()) {
     case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RegisterOrConstant(op->InputAt(1)));
-      // Make the output overlap, as it will be used to hold the masked
-      // second input.
-      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      if (op->InputAt(1)->IsConstant()) {
+        locations->SetInAt(1, Location::ConstantLocation(op->InputAt(1)->AsConstant()));
+        locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      } else {
+        locations->SetInAt(1, Location::RequiresRegister());
+        // Make the output overlap, as it will be used to hold the masked
+        // second input.
+        locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      }
       break;
     }
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
-      locations->AddTemp(Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister());
+      if (op->InputAt(1)->IsConstant()) {
+        locations->SetInAt(1, Location::ConstantLocation(op->InputAt(1)->AsConstant()));
+        // For simplicity, use kOutputOverlap even though we only require that low registers
+        // don't clash with high registers, which the register allocator currently guarantees.
+        locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      } else {
+        locations->SetInAt(1, Location::RequiresRegister());
+        locations->AddTemp(Location::RequiresRegister());
+        locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      }
       break;
     }
     default:
@@ -3020,9 +3255,9 @@ void InstructionCodeGeneratorARM::HandleShift(HBinaryOperation* op) {
     case Primitive::kPrimInt: {
       Register out_reg = out.AsRegister<Register>();
       Register first_reg = first.AsRegister<Register>();
-      // Arm doesn't mask the shift count so we need to do it ourselves.
       if (second.IsRegister()) {
         Register second_reg = second.AsRegister<Register>();
+        // Arm doesn't mask the shift count so we need to do it ourselves.
         __ and_(out_reg, second_reg, ShifterOperand(kMaxIntShiftValue));
         if (op->IsShl()) {
           __ Lsl(out_reg, first_reg, out_reg);
@@ -3050,57 +3285,103 @@ void InstructionCodeGeneratorARM::HandleShift(HBinaryOperation* op) {
       Register o_h = out.AsRegisterPairHigh<Register>();
       Register o_l = out.AsRegisterPairLow<Register>();
 
-      Register temp = locations->GetTemp(0).AsRegister<Register>();
-
       Register high = first.AsRegisterPairHigh<Register>();
       Register low = first.AsRegisterPairLow<Register>();
 
-      Register second_reg = second.AsRegister<Register>();
-
-      if (op->IsShl()) {
-        __ and_(o_l, second_reg, ShifterOperand(kMaxLongShiftValue));
-        // Shift the high part
-        __ Lsl(o_h, high, o_l);
-        // Shift the low part and `or` what overflew on the high part
-        __ rsb(temp, o_l, ShifterOperand(kArmBitsPerWord));
-        __ Lsr(temp, low, temp);
-        __ orr(o_h, o_h, ShifterOperand(temp));
-        // If the shift is > 32 bits, override the high part
-        __ subs(temp, o_l, ShifterOperand(kArmBitsPerWord));
-        __ it(PL);
-        __ Lsl(o_h, low, temp, PL);
-        // Shift the low part
-        __ Lsl(o_l, low, o_l);
-      } else if (op->IsShr()) {
-        __ and_(o_h, second_reg, ShifterOperand(kMaxLongShiftValue));
-        // Shift the low part
-        __ Lsr(o_l, low, o_h);
-        // Shift the high part and `or` what underflew on the low part
-        __ rsb(temp, o_h, ShifterOperand(kArmBitsPerWord));
-        __ Lsl(temp, high, temp);
-        __ orr(o_l, o_l, ShifterOperand(temp));
-        // If the shift is > 32 bits, override the low part
-        __ subs(temp, o_h, ShifterOperand(kArmBitsPerWord));
-        __ it(PL);
-        __ Asr(o_l, high, temp, PL);
-        // Shift the high part
-        __ Asr(o_h, high, o_h);
+      if (second.IsRegister()) {
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+
+        Register second_reg = second.AsRegister<Register>();
+
+        if (op->IsShl()) {
+          __ and_(o_l, second_reg, ShifterOperand(kMaxLongShiftValue));
+          // Shift the high part
+          __ Lsl(o_h, high, o_l);
+          // Shift the low part and `or` what overflowed on the high part
+          __ rsb(temp, o_l, ShifterOperand(kArmBitsPerWord));
+          __ Lsr(temp, low, temp);
+          __ orr(o_h, o_h, ShifterOperand(temp));
+          // If the shift is >= 32 bits, override the high part
+          __ subs(temp, o_l, ShifterOperand(kArmBitsPerWord));
+          __ it(PL);
+          __ Lsl(o_h, low, temp, PL);
+          // Shift the low part
+          __ Lsl(o_l, low, o_l);
+        } else if (op->IsShr()) {
+          __ and_(o_h, second_reg, ShifterOperand(kMaxLongShiftValue));
+          // Shift the low part
+          __ Lsr(o_l, low, o_h);
+          // Shift the high part and `or` what underflowed on the low part
+          __ rsb(temp, o_h, ShifterOperand(kArmBitsPerWord));
+          __ Lsl(temp, high, temp);
+          __ orr(o_l, o_l, ShifterOperand(temp));
+          // If the shift is >= 32 bits, override the low part
+          __ subs(temp, o_h, ShifterOperand(kArmBitsPerWord));
+          __ it(PL);
+          __ Asr(o_l, high, temp, PL);
+          // Shift the high part
+          __ Asr(o_h, high, o_h);
+        } else {
+          __ and_(o_h, second_reg, ShifterOperand(kMaxLongShiftValue));
+          // same as Shr except we use `Lsr`s and not `Asr`s
+          __ Lsr(o_l, low, o_h);
+          __ rsb(temp, o_h, ShifterOperand(kArmBitsPerWord));
+          __ Lsl(temp, high, temp);
+          __ orr(o_l, o_l, ShifterOperand(temp));
+          __ subs(temp, o_h, ShifterOperand(kArmBitsPerWord));
+          __ it(PL);
+          __ Lsr(o_l, high, temp, PL);
+          __ Lsr(o_h, high, o_h);
+        }
       } else {
-        __ and_(o_h, second_reg, ShifterOperand(kMaxLongShiftValue));
-        // same as Shr except we use `Lsr`s and not `Asr`s
-        __ Lsr(o_l, low, o_h);
-        __ rsb(temp, o_h, ShifterOperand(kArmBitsPerWord));
-        __ Lsl(temp, high, temp);
-        __ orr(o_l, o_l, ShifterOperand(temp));
-        __ subs(temp, o_h, ShifterOperand(kArmBitsPerWord));
-        __ it(PL);
-        __ Lsr(o_l, high, temp, PL);
-        __ Lsr(o_h, high, o_h);
+        // Register allocator doesn't create partial overlap.
+        DCHECK_NE(o_l, high);
+        DCHECK_NE(o_h, low);
+        int32_t cst = second.GetConstant()->AsIntConstant()->GetValue();
+        uint32_t shift_value = static_cast<uint32_t>(cst & kMaxLongShiftValue);
+        if (shift_value > 32) {
+          if (op->IsShl()) {
+            __ Lsl(o_h, low, shift_value - 32);
+            __ LoadImmediate(o_l, 0);
+          } else if (op->IsShr()) {
+            __ Asr(o_l, high, shift_value - 32);
+            __ Asr(o_h, high, 31);
+          } else {
+            __ Lsr(o_l, high, shift_value - 32);
+            __ LoadImmediate(o_h, 0);
+          }
+        } else if (shift_value == 32) {
+          if (op->IsShl()) {
+            __ mov(o_h, ShifterOperand(low));
+            __ LoadImmediate(o_l, 0);
+          } else if (op->IsShr()) {
+            __ mov(o_l, ShifterOperand(high));
+            __ Asr(o_h, high, 31);
+          } else {
+            __ mov(o_l, ShifterOperand(high));
+            __ LoadImmediate(o_h, 0);
+          }
+        } else {  // shift_value < 32
+          if (op->IsShl()) {
+            __ Lsl(o_h, high, shift_value);
+            __ orr(o_h, o_h, ShifterOperand(low, LSR, 32 - shift_value));
+            __ Lsl(o_l, low, shift_value);
+          } else if (op->IsShr()) {
+            __ Lsr(o_l, low, shift_value);
+            __ orr(o_l, o_l, ShifterOperand(high, LSL, 32 - shift_value));
+            __ Asr(o_h, high, shift_value);
+          } else {
+            __ Lsr(o_l, low, shift_value);
+            __ orr(o_l, o_l, ShifterOperand(high, LSL, 32 - shift_value));
+            __ Lsr(o_h, high, shift_value);
+          }
+        }
       }
       break;
     }
     default:
       LOG(FATAL) << "Unexpected operation type " << type;
+      UNREACHABLE();
   }
 }
 
@@ -3348,6 +3629,9 @@ void InstructionCodeGeneratorARM::GenerateWideAtomicLoad(Register addr,
                                                          Register out_lo,
                                                          Register out_hi) {
   if (offset != 0) {
+    // Ensure `out_lo` is different from `addr`, so that loading
+    // `offset` into `out_lo` does not clobber `addr`.
+    DCHECK_NE(out_lo, addr);
     __ LoadImmediate(out_lo, offset);
     __ add(IP, addr, ShifterOperand(out_lo));
     addr = IP;
@@ -3535,14 +3819,26 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction,
 
 void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) {
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+
+  bool object_field_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (field_info.GetFieldType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_field_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
 
   bool volatile_for_double = field_info.IsVolatile()
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
       && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
-  bool overlap = field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong);
+  // The output overlaps in case of volatile long: we don't want the
+  // code generated by GenerateWideAtomicLoad to overwrite the
+  // object's location.  Likewise, in the case of an object field get
+  // with read barriers enabled, we do not want the load to overwrite
+  // the object's location, as we need it to emit the read barrier.
+  bool overlap = (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) ||
+      object_field_get_with_read_barrier;
 
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
@@ -3608,7 +3904,8 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
 
   LocationSummary* locations = instruction->GetLocations();
-  Register base = locations->InAt(0).AsRegister<Register>();
+  Location base_loc = locations->InAt(0);
+  Register base = base_loc.AsRegister<Register>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
   bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
@@ -3688,7 +3985,7 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
   }
 
   if (field_type == Primitive::kPrimNot) {
-    __ MaybeUnpoisonHeapReference(out.AsRegister<Register>());
+    codegen_->MaybeGenerateReadBarrier(instruction, out, out, base_loc, offset);
   }
 }
 
@@ -3832,20 +4129,31 @@ void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) {
 }
 
 void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) {
+  bool object_array_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_array_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps in the case of an object array get with
+    // read barriers enabled: we do not want the load to overwrite the
+    // array's location, as we need it to emit the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
 void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
   Primitive::Type type = instruction->GetType();
 
@@ -3908,8 +4216,9 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      static_assert(sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes.");
+      static_assert(
+          sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+          "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes.");
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
@@ -3972,8 +4281,17 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 
   if (type == Primitive::kPrimNot) {
-    Register out = locations->Out().AsRegister<Register>();
-    __ MaybeUnpoisonHeapReference(out);
+    static_assert(
+        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+    uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
+    Location out = locations->Out();
+    if (index.IsConstant()) {
+      uint32_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset);
+    } else {
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, data_offset, index);
+    }
   }
 }
 
@@ -3982,11 +4300,16 @@ void LocationsBuilderARM::VisitArraySet(HArraySet* instruction) {
 
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
-  bool may_need_runtime_call = instruction->NeedsTypeCheck();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
+  bool object_array_set_with_read_barrier =
+      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      may_need_runtime_call ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall);
+      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall);
+
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(value_type)) {
@@ -3994,20 +4317,20 @@ void LocationsBuilderARM::VisitArraySet(HArraySet* instruction) {
   } else {
     locations->SetInAt(2, Location::RequiresRegister());
   }
-
   if (needs_write_barrier) {
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
-    locations->AddTemp(Location::RequiresRegister());
+    locations->AddTemp(Location::RequiresRegister());  // Possibly used for read barrier too.
   }
 }
 
 void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register array = locations->InAt(0).AsRegister<Register>();
+  Location array_loc = locations->InAt(0);
+  Register array = array_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
   Primitive::Type value_type = instruction->GetComponentType();
-  bool may_need_runtime_call = locations->CanCall();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
@@ -4044,7 +4367,8 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
 
     case Primitive::kPrimNot: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      Register value = locations->InAt(2).AsRegister<Register>();
+      Location value_loc = locations->InAt(2);
+      Register value = value_loc.AsRegister<Register>();
       Register source = value;
 
       if (instruction->InputAt(2)->IsNullConstant()) {
@@ -4058,6 +4382,8 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
           __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
           __ StoreToOffset(kStoreWord, source, IP, data_offset);
         }
+        DCHECK(!needs_write_barrier);
+        DCHECK(!may_need_runtime_call_for_type_check);
         break;
       }
 
@@ -4070,7 +4396,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
       Label done;
       SlowPathCode* slow_path = nullptr;
 
-      if (may_need_runtime_call) {
+      if (may_need_runtime_call_for_type_check) {
         slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathARM(instruction);
         codegen_->AddSlowPath(slow_path);
         if (instruction->GetValueCanBeNull()) {
@@ -4090,23 +4416,63 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
           __ Bind(&non_zero);
         }
 
-        __ LoadFromOffset(kLoadWord, temp1, array, class_offset);
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ MaybeUnpoisonHeapReference(temp1);
-        __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
-        __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
-        // No need to poison/unpoison, we're comparing two poisoined references.
-        __ cmp(temp1, ShifterOperand(temp2));
-        if (instruction->StaticTypeOfArrayIsObjectArray()) {
-          Label do_put;
-          __ b(&do_put, EQ);
-          __ MaybeUnpoisonHeapReference(temp1);
-          __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
-          // No need to poison/unpoison, we're comparing against null.
-          __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel());
-          __ Bind(&do_put);
+        if (kEmitCompilerReadBarrier) {
+          // When read barriers are enabled, the type checking
+          // instrumentation requires two read barriers:
+          //
+          //   __ Mov(temp2, temp1);
+          //   // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          //   __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp1_loc, temp1_loc, temp2_loc, component_offset);
+          //
+          //   // /* HeapReference<Class> */ temp2 = value->klass_
+          //   __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp2_loc, temp2_loc, value_loc, class_offset, temp1_loc);
+          //
+          //   __ cmp(temp1, ShifterOperand(temp2));
+          //
+          // However, the second read barrier may trash `temp1`, as it
+          // is a temporary register, and as such would not be saved
+          // along with live registers before calling the runtime (nor
+          // restored afterwards).  So in this case, we bail out and
+          // delegate the work to the array set slow path.
+          //
+          // TODO: Extend the register allocator to support a new
+          // "(locally) live temp" location so as to avoid always
+          // going into the slow path when read barriers are enabled.
+          __ b(slow_path->GetEntryLabel());
         } else {
-          __ b(slow_path->GetEntryLabel(), NE);
+          // /* HeapReference<Class> */ temp1 = array->klass_
+          __ LoadFromOffset(kLoadWord, temp1, array, class_offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ MaybeUnpoisonHeapReference(temp1);
+
+          // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
+          // /* HeapReference<Class> */ temp2 = value->klass_
+          __ LoadFromOffset(kLoadWord, temp2, value, class_offset);
+          // If heap poisoning is enabled, no need to unpoison `temp1`
+          // nor `temp2`, as we are comparing two poisoned references.
+          __ cmp(temp1, ShifterOperand(temp2));
+
+          if (instruction->StaticTypeOfArrayIsObjectArray()) {
+            Label do_put;
+            __ b(&do_put, EQ);
+            // If heap poisoning is enabled, the `temp1` reference has
+            // not been unpoisoned yet; unpoison it now.
+            __ MaybeUnpoisonHeapReference(temp1);
+
+            // /* HeapReference<Class> */ temp1 = temp1->super_class_
+            __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
+            // If heap poisoning is enabled, no need to unpoison
+            // `temp1`, as we are comparing against null below.
+            __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel());
+            __ Bind(&do_put);
+          } else {
+            __ b(slow_path->GetEntryLabel(), NE);
+          }
         }
       }
 
@@ -4130,7 +4496,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
         __ StoreToOffset(kStoreWord, source, IP, data_offset);
       }
 
-      if (!may_need_runtime_call) {
+      if (!may_need_runtime_call_for_type_check) {
         codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
 
@@ -4559,7 +4925,8 @@ void LocationsBuilderARM::VisitLoadClass(HLoadClass* cls) {
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-      Location::RegisterLocation(R0));
+      Location::RegisterLocation(R0),
+      /* code_generator_supports_read_barrier */ true);
 }
 
 void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) {
@@ -4573,21 +4940,42 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) {
     return;
   }
 
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
+
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
-    __ LoadFromOffset(
-        kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+    uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+      __ AddConstant(out, current_method, declaring_class_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset);
+    }
   } else {
     DCHECK(cls->CanCallRuntime());
+    // /* GcRoot<mirror::Class>[] */ out =
+    //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
     __ LoadFromOffset(kLoadWord,
                       out,
                       current_method,
                       ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value());
-    __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex()));
-    // TODO: We will need a read barrier here.
+
+    size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &out[type_index]
+      __ AddConstant(out, out, cache_offset);
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = out[type_index]
+      __ LoadFromOffset(kLoadWord, out, out, cache_offset);
+    }
 
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
         cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
@@ -4642,13 +5030,35 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) {
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = load->GetLocations();
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
-  __ LoadFromOffset(
-      kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
+
+  uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+    __ AddConstant(out, current_method, declaring_class_offset);
+    // /* mirror::Class* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+    __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset);
+  }
+
+  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
   __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-  __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-  // TODO: We will need a read barrier here.
+
+  size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex());
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::String>* */ out = &out[string_index]
+    __ AddConstant(out, out, cache_offset);
+    // /* mirror::String* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::String> */ out = out[string_index]
+    __ LoadFromOffset(kLoadWord, out, out, cache_offset);
+  }
+
   __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -4691,41 +5101,45 @@ void InstructionCodeGeneratorARM::VisitThrow(HThrow* instruction) {
 
 void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = LocationSummary::kNoCall;
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
       break;
+    case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
-      break;
-    case TypeCheckKind::kArrayCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // The out register is used as a temporary, so it overlaps with the inputs.
-    // Note that TypeCheckSlowPathARM uses this register too.
-    locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-    locations->SetOut(Location::RegisterLocation(R0));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // The "out" register is used as a temporary, so it overlaps with the inputs.
+  // Note that TypeCheckSlowPathARM uses this register too.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+  // When read barriers are enabled, we need a temporary register for
+  // some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -4739,15 +5153,9 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
     __ CompareAndBranchIfZero(obj, &zero);
   }
 
-  // In case of an interface/unresolved check, we put the object class into the object register.
-  // This is safe, as the register is caller-save, and the object must be in another
-  // register if it survives the runtime call.
-  Register target = (instruction->GetTypeCheckKind() == TypeCheckKind::kInterfaceCheck) ||
-      (instruction->GetTypeCheckKind() == TypeCheckKind::kUnresolvedCheck)
-      ? obj
-      : out;
-  __ LoadFromOffset(kLoadWord, target, obj, class_offset);
-  __ MaybeUnpoisonHeapReference(target);
+  // /* HeapReference<Class> */ out = obj->klass_
+  __ LoadFromOffset(kLoadWord, out, obj, class_offset);
+  codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset);
 
   switch (instruction->GetTypeCheckKind()) {
     case TypeCheckKind::kExactCheck: {
@@ -4758,13 +5166,23 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
       __ b(&done);
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       Label loop;
       __ Bind(&loop);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following load operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ LoadFromOffset(kLoadWord, out, out, super_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ cmp(out, ShifterOperand(cls));
@@ -4775,14 +5193,24 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       Label loop, success;
       __ Bind(&loop);
       __ cmp(out, ShifterOperand(cls));
       __ b(&success, EQ);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following load operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ LoadFromOffset(kLoadWord, out, out, super_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ b(&done);
@@ -4793,14 +5221,24 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
       Label exact_check;
       __ cmp(out, ShifterOperand(cls));
       __ b(&exact_check, EQ);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following load operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ Mov(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->component_type_
       __ LoadFromOffset(kLoadWord, out, out, component_offset);
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
@@ -4811,11 +5249,12 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
       __ b(&done);
       break;
     }
+
     case TypeCheckKind::kArrayCheck: {
       __ cmp(out, ShifterOperand(cls));
       DCHECK(locations->OnlyCallsOnSlowPath());
-      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(
-          instruction, /* is_fatal */ false);
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                                    /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
       __ b(slow_path->GetEntryLabel(), NE);
       __ LoadImmediate(out, 1);
@@ -4824,13 +5263,25 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-    default: {
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on slow path, but we always go
+      // into the slow path for the unresolved & interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require assigning fixed registers to the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be clobbered by the potential first
+      // read barrier emission at the beginning of this method.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                                    /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ b(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
         __ b(&done);
       }
@@ -4856,57 +5307,61 @@ void LocationsBuilderARM::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
 
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = throws_into_catch
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall;
+      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
       break;
+    case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
-      break;
-    case TypeCheckKind::kArrayCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
 
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::RequiresRegister());
-    // Note that TypeCheckSlowPathARM uses this register too.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // Note that TypeCheckSlowPathARM uses this "temp" register too.
+  locations->AddTemp(Location::RequiresRegister());
+  // When read barriers are enabled, we need an additional temporary
+  // register for some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
     locations->AddTemp(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   }
 }
 
 void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Register cls = locations->InAt(1).AsRegister<Register>();
-  Register temp = locations->WillCall()
-      ? Register(kNoRegister)
-      : locations->GetTemp(0).AsRegister<Register>();
-
+  Location temp_loc = locations->GetTemp(0);
+  Register temp = temp_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
-  SlowPathCode* slow_path = nullptr;
 
-  if (!locations->WillCall()) {
-    slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(
-        instruction, !locations->CanCall());
-    codegen_->AddSlowPath(slow_path);
-  }
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool is_type_check_slow_path_fatal =
+      (type_check_kind == TypeCheckKind::kExactCheck ||
+       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+      !instruction->CanThrowIntoCatchBlock();
+  SlowPathCode* type_check_slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
+                                                        is_type_check_slow_path_fatal);
+  codegen_->AddSlowPath(type_check_slow_path);
 
   Label done;
   // Avoid null check if we know obj is not null.
@@ -4914,76 +5369,159 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) {
     __ CompareAndBranchIfZero(obj, &done);
   }
 
-  if (locations->WillCall()) {
-    __ LoadFromOffset(kLoadWord, obj, obj, class_offset);
-    __ MaybeUnpoisonHeapReference(obj);
-  } else {
-    __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
-    __ MaybeUnpoisonHeapReference(temp);
-  }
+  // /* HeapReference<Class> */ temp = obj->klass_
+  __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+  codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
 
-  switch (instruction->GetTypeCheckKind()) {
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
       __ cmp(temp, ShifterOperand(cls));
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ b(slow_path->GetEntryLabel(), NE);
+      __ b(type_check_slow_path->GetEntryLabel(), NE);
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
-      Label loop;
+      Label loop, compare_classes;
       __ Bind(&loop);
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-      // Jump to the slow path to throw the exception.
-      __ CompareAndBranchIfZero(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // to the `compare_classes` label to compare it with the checked
+      // class.
+      __ CompareAndBranchIfNonZero(temp, &compare_classes);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But before, move back the object's class into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&compare_classes);
       __ cmp(temp, ShifterOperand(cls));
       __ b(&loop, NE);
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       Label loop;
       __ Bind(&loop);
       __ cmp(temp, ShifterOperand(cls));
       __ b(&done, EQ);
+
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ LoadFromOffset(kLoadWord, temp, temp, super_offset);
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // back to the beginning of the loop.
       __ CompareAndBranchIfNonZero(temp, &loop);
-      // Jump to the slow path to throw the exception.
-      __ b(slow_path->GetEntryLabel());
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But before, move back the object's class into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
+      Label check_non_primitive_component_type;
       __ cmp(temp, ShifterOperand(cls));
       __ b(&done, EQ);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ Mov(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->component_type_
       __ LoadFromOffset(kLoadWord, temp, temp, component_offset);
-      __ MaybeUnpoisonHeapReference(temp);
-      __ CompareAndBranchIfZero(temp, slow_path->GetEntryLabel());
+      codegen_->MaybeGenerateReadBarrier(
+          instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+
+      // If the component type is not null (i.e. the object is indeed
+      // an array), jump to label `check_non_primitive_component_type`
+      // to further check that this component type is not a primitive
+      // type.
+      __ CompareAndBranchIfNonZero(temp, &check_non_primitive_component_type);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But before, move back the object's class into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&check_non_primitive_component_type);
       __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset);
-      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ CompareAndBranchIfNonZero(temp, slow_path->GetEntryLabel());
+      static_assert(Primitive::kPrimNot == 0, "Expected 0 for art::Primitive::kPrimNot");
+      __ CompareAndBranchIfZero(temp, &done);
+      // Same comment as above regarding `temp` and the slow path.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ LoadFromOffset(kLoadWord, temp, obj, class_offset);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-    default:
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+      // We always go into the type check slow path for the unresolved &
+      // interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require to
+      // assign fixed registers for the inputs of this HCheckCast
+      // instruction (following the runtime calling convention), which
+      // might be cluttered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ b(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
 
-  if (slow_path != nullptr) {
-    __ Bind(slow_path->GetExitLabel());
-  }
+  __ Bind(type_check_slow_path->GetExitLabel());
 }
 
 void LocationsBuilderARM::VisitMonitorOperation(HMonitorOperation* instruction) {
@@ -5157,6 +5695,82 @@ void InstructionCodeGeneratorARM::HandleBitwiseOperation(HBinaryOperation* instr
   }
 }
 
+void CodeGeneratorARM::GenerateReadBarrier(HInstruction* instruction,
+                                           Location out,
+                                           Location ref,
+                                           Location obj,
+                                           uint32_t offset,
+                                           Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathARM(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  // TODO: When read barrier has a fast path, add it here.
+  /* Currently the read barrier call is inserted after the original load.
+   * However, if we have a fast path, we need to perform the load of obj.LockWord *before* the
+   * original load. This load-load ordering is required by the read barrier.
+   * The fast path/slow path (for Baker's algorithm) should look like:
+   *
+   * bool isGray = obj.LockWord & kReadBarrierMask;
+   * lfence;  // load fence or artificial data dependence to prevent load-load reordering
+   * ref = obj.field;    // this is the original load
+   * if (isGray) {
+   *   ref = Mark(ref);  // ideally the slow path just does Mark(ref)
+   * }
+   */
+
+  __ b(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorARM::MaybeGenerateReadBarrier(HInstruction* instruction,
+                                                Location out,
+                                                Location ref,
+                                                Location obj,
+                                                uint32_t offset,
+                                                Location index) {
+  if (kEmitCompilerReadBarrier) {
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrier(instruction, out, ref, obj, offset, index);
+  } else if (kPoisonHeapReferences) {
+    __ UnpoisonHeapReference(out.AsRegister<Register>());
+  }
+}
+
+void CodeGeneratorARM::GenerateReadBarrierForRoot(HInstruction* instruction,
+                                                  Location out,
+                                                  Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special for this here.
+  SlowPathCode* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARM(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  // TODO: Implement a fast path for ReadBarrierForRoot, performing
+  // the following operation (for Baker's algorithm):
+  //
+  //   if (thread.tls32_.is_gc_marking) {
+  //     root = Mark(root);
+  //   }
+
+  __ b(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARM::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       MethodReference target_method) {
@@ -5214,7 +5828,7 @@ void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
       __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadImmediate(temp.AsRegister<Register>(), invoke->GetMethodAddress());
@@ -5229,7 +5843,7 @@ void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       Register reg = temp.AsRegister<Register>();
       if (current_method.IsRegister()) {
@@ -5240,10 +5854,11 @@ void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
         method_reg = reg;
         __ LoadFromOffset(kLoadWord, reg, SP, kCurrentMethodStackOffset);
       }
-      // temp = current_method->dex_cache_resolved_methods_;
-      __ LoadFromOffset(
-          kLoadWord, reg, method_reg, ArtMethod::DexCacheResolvedMethodsOffset(
-              kArmPointerSize).Int32Value());
+      // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_;
+      __ LoadFromOffset(kLoadWord,
+                        reg,
+                        method_reg,
+                        ArtMethod::DexCacheResolvedMethodsOffset(kArmPointerSize).Int32Value());
       // temp = temp[index_in_cache]
       uint32_t index_in_cache = invoke->GetTargetMethod().dex_method_index;
       __ LoadFromOffset(kLoadWord, reg, reg, CodeGenerator::GetCachePointerOffset(index_in_cache));
@@ -5287,10 +5902,17 @@ void CodeGeneratorARM::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  // temp = object->GetClass();
   DCHECK(receiver.IsRegister());
+  // /* HeapReference<Class> */ temp = receiver->klass_
   __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index cef1095c5d..89de4f801d 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -228,15 +228,13 @@ class InstructionCodeGeneratorARM : public HGraphVisitor {
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              Label* true_target,
-                             Label* false_target,
-                             Label* always_true_target);
+                             Label* false_target);
   void GenerateCompareWithImmediate(Register left, int32_t right);
-  void GenerateCompareTestAndBranch(HIf* if_instr,
-                                    HCondition* condition,
+  void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
-                                    Label* false_target,
-                                    Label* always_true_target);
+                                    Label* false_target);
   void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label);
   void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
@@ -375,6 +373,51 @@ class CodeGeneratorARM : public CodeGenerator {
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
+  // Generate a read barrier for a heap reference within `instruction`.
+  //
+  // A read barrier for an object reference read from the heap is
+  // implemented as a call to the artReadBarrierSlow runtime entry
+  // point, which is passed the values in locations `ref`, `obj`, and
+  // `offset`:
+  //
+  //   mirror::Object* artReadBarrierSlow(mirror::Object* ref,
+  //                                      mirror::Object* obj,
+  //                                      uint32_t offset);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierSlow.
+  //
+  // When `index` is provided (i.e. for array accesses), the offset
+  // value passed to artReadBarrierSlow is adjusted to take `index`
+  // into account.
+  void GenerateReadBarrier(HInstruction* instruction,
+                           Location out,
+                           Location ref,
+                           Location obj,
+                           uint32_t offset,
+                           Location index = Location::NoLocation());
+
+  // If read barriers are enabled, generate a read barrier for a heap reference.
+  // Otherwise, if heap poisoning is enabled, unpoison the reference in `out`.
+  void MaybeGenerateReadBarrier(HInstruction* instruction,
+                                Location out,
+                                Location ref,
+                                Location obj,
+                                uint32_t offset,
+                                Location index = Location::NoLocation());
+
+  // Generate a read barrier for a GC root within `instruction`.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root);
+
  private:
   using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b0be446174..2776b7d6c9 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -68,6 +68,10 @@ using helpers::ARM64EncodableConstantOrRegister;
 using helpers::ArtVixlRegCodeCoherentForRegSet;
 
 static constexpr int kCurrentMethodStackOffset = 0;
+// The compare/jump sequence generates about (2 * num_entries + 1) instructions, while the jump
+// table version generates 7 instructions and num_entries literals. The compare/jump sequence
+// therefore generates less code/data when num_entries is small.
+static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;
 
 inline Condition ARM64Condition(IfCondition cond) {
   switch (cond) {
@@ -545,6 +549,28 @@ class ArraySetSlowPathARM64 : public SlowPathCodeARM64 {
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM64);
 };
 
+void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
+  uint32_t num_entries = switch_instr_->GetNumEntries();
+  DCHECK_GE(num_entries, kPackedSwitchJumpTableThreshold);
+
+  // We are about to use the assembler to place literals directly. Make sure we have enough
+  // underlying code buffer and we have generated the jump table with right size.
+  CodeBufferCheckScope scope(codegen->GetVIXLAssembler(), num_entries * sizeof(int32_t),
+                             CodeBufferCheckScope::kCheck, CodeBufferCheckScope::kExactSize);
+
+  __ Bind(&table_start_);
+  const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors();
+  for (uint32_t i = 0; i < num_entries; i++) {
+    vixl::Label* target_label = codegen->GetLabelOf(successors[i]);
+    DCHECK(target_label->IsBound());
+    ptrdiff_t jump_offset = target_label->location() - table_start_.location();
+    DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min());
+    DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max());
+    Literal<int32_t> literal(jump_offset);
+    __ place(&literal);
+  }
+}
+
 #undef __
 
 Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(Primitive::Type type) {
@@ -587,6 +613,7 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
                     compiler_options,
                     stats),
       block_labels_(nullptr),
+      jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
@@ -598,15 +625,21 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
       call_patches_(MethodReferenceComparator(),
                     graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+      pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Save the link register (containing the return address) to mimic Quick.
   AddAllocatedRegister(LocationFrom(lr));
 }
 
-#undef __
 #define __ GetVIXLAssembler()->
 
+void CodeGeneratorARM64::EmitJumpTables() {
+  for (auto jump_table : jump_tables_) {
+    jump_table->EmitTable(this);
+  }
+}
+
 void CodeGeneratorARM64::Finalize(CodeAllocator* allocator) {
+  EmitJumpTables();
   // Ensure we emit the literal pool.
   __ FinalizeCode();
 
@@ -2283,38 +2316,56 @@ void InstructionCodeGeneratorARM64::VisitTryBoundary(HTryBoundary* try_boundary)
 }
 
 void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruction,
+                                                          size_t condition_input_index,
                                                           vixl::Label* true_target,
-                                                          vixl::Label* false_target,
-                                                          vixl::Label* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  HCondition* condition = cond->AsCondition();
-
-  if (cond->IsIntConstant()) {
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ B(always_true_target);
+                                                          vixl::Label* false_target) {
+  // FP branching requires both targets to be explicit. If either of the targets
+  // is nullptr (fallthrough) use and bind `fallthrough_target` instead.
+  vixl::Label fallthrough_target;
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
+    // Constant condition, statically compared against 1.
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ B(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ B(false_target);
+      }
     }
-  } else if (!cond->IsCondition() || condition->NeedsMaterialization()) {
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
     // The condition instruction has been materialized, compare the output to 0.
-    Location cond_val = instruction->GetLocations()->InAt(0);
+    Location cond_val = instruction->GetLocations()->InAt(condition_input_index);
     DCHECK(cond_val.IsRegister());
-    __ Cbnz(InputRegisterAt(instruction, 0), true_target);
+      if (true_target == nullptr) {
+      __ Cbz(InputRegisterAt(instruction, condition_input_index), false_target);
+    } else {
+      __ Cbnz(InputRegisterAt(instruction, condition_input_index), true_target);
+    }
   } else {
     // The condition instruction has not been materialized, use its inputs as
     // the comparison and its condition as the branch condition.
-    Primitive::Type type =
-        cond->IsCondition() ? cond->InputAt(0)->GetType() : Primitive::kPrimInt;
+    HCondition* condition = cond->AsCondition();
 
+    Primitive::Type type = condition->InputAt(0)->GetType();
     if (Primitive::IsFloatingPointType(type)) {
-      // FP compares don't like null false_targets.
-      if (false_target == nullptr) {
-        false_target = codegen_->GetLabelOf(instruction->AsIf()->IfFalseSuccessor());
-      }
       FPRegister lhs = InputFPRegisterAt(condition, 0);
       if (condition->GetLocations()->InAt(1).IsConstant()) {
         DCHECK(IsFloatingPointZeroConstant(condition->GetLocations()->InAt(1).GetConstant()));
@@ -2324,31 +2375,45 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct
         __ Fcmp(lhs, InputFPRegisterAt(condition, 1));
       }
       if (condition->IsFPConditionTrueIfNaN()) {
-        __ B(vs, true_target);  // VS for unordered.
+        __ B(vs, true_target == nullptr ? &fallthrough_target : true_target);
       } else if (condition->IsFPConditionFalseIfNaN()) {
-        __ B(vs, false_target);  // VS for unordered.
+        __ B(vs, false_target == nullptr ? &fallthrough_target : false_target);
+      }
+      if (true_target == nullptr) {
+        __ B(ARM64Condition(condition->GetOppositeCondition()), false_target);
+      } else {
+        __ B(ARM64Condition(condition->GetCondition()), true_target);
       }
-      __ B(ARM64Condition(condition->GetCondition()), true_target);
     } else {
       // Integer cases.
       Register lhs = InputRegisterAt(condition, 0);
       Operand rhs = InputOperandAt(condition, 1);
-      Condition arm64_cond = ARM64Condition(condition->GetCondition());
+
+      Condition arm64_cond;
+      vixl::Label* non_fallthrough_target;
+      if (true_target == nullptr) {
+        arm64_cond = ARM64Condition(condition->GetOppositeCondition());
+        non_fallthrough_target = false_target;
+      } else {
+        arm64_cond = ARM64Condition(condition->GetCondition());
+        non_fallthrough_target = true_target;
+      }
+
       if ((arm64_cond != gt && arm64_cond != le) && rhs.IsImmediate() && (rhs.immediate() == 0)) {
         switch (arm64_cond) {
           case eq:
-            __ Cbz(lhs, true_target);
+            __ Cbz(lhs, non_fallthrough_target);
             break;
           case ne:
-            __ Cbnz(lhs, true_target);
+            __ Cbnz(lhs, non_fallthrough_target);
             break;
           case lt:
             // Test the sign bit and branch accordingly.
-            __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
+            __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, non_fallthrough_target);
             break;
           case ge:
             // Test the sign bit and branch accordingly.
-            __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target);
+            __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, non_fallthrough_target);
             break;
           default:
             // Without the `static_cast` the compiler throws an error for
@@ -2357,43 +2422,43 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct
         }
       } else {
         __ Cmp(lhs, rhs);
-        __ B(arm64_cond, true_target);
+        __ B(arm64_cond, non_fallthrough_target);
       }
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted (case 2) and we need to emit a jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ B(false_target);
   }
+
+  if (fallthrough_target.IsLinked()) {
+    __ Bind(&fallthrough_target);
+  }
 }
 
 void LocationsBuilderARM64::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitIf(HIf* if_instr) {
-  vixl::Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  vixl::Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  vixl::Label* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  vixl::Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);
+  vixl::Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderARM64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
@@ -2402,8 +2467,10 @@ void InstructionCodeGeneratorARM64::VisitDeoptimize(HDeoptimize* deoptimize) {
   SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena())
       DeoptimizationSlowPathARM64(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  vixl::Label* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
@@ -2856,41 +2923,44 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok
   switch (invoke->GetMethodLoadKind()) {
     case HInvokeStaticOrDirect::MethodLoadKind::kStringInit:
       // temp = thread->string_init_entrypoint
-      __ Ldr(XRegisterFrom(temp).X(), MemOperand(tr, invoke->GetStringInitOffset()));
+      __ Ldr(XRegisterFrom(temp), MemOperand(tr, invoke->GetStringInitOffset()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       // Load method address from literal pool.
-      __ Ldr(XRegisterFrom(temp).X(), DeduplicateUint64Literal(invoke->GetMethodAddress()));
+      __ Ldr(XRegisterFrom(temp), DeduplicateUint64Literal(invoke->GetMethodAddress()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddressWithFixup:
       // Load method address from literal pool with a link-time patch.
-      __ Ldr(XRegisterFrom(temp).X(),
+      __ Ldr(XRegisterFrom(temp),
              DeduplicateMethodAddressLiteral(invoke->GetTargetMethod()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
       // Add ADRP with its PC-relative DexCache access patch.
-      pc_rel_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                             invoke->GetDexCacheArrayOffset());
-      vixl::Label* pc_insn_label = &pc_rel_dex_cache_patches_.back().label;
+      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
+                                                  invoke->GetDexCacheArrayOffset());
+      vixl::Label* pc_insn_label = &pc_relative_dex_cache_patches_.back().label;
       {
         vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
-        __ adrp(XRegisterFrom(temp).X(), 0);
+        __ Bind(pc_insn_label);
+        __ adrp(XRegisterFrom(temp), 0);
       }
-      __ Bind(pc_insn_label);  // Bind after ADRP.
-      pc_rel_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
+      pc_relative_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
       // Add LDR with its PC-relative DexCache access patch.
-      pc_rel_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                             invoke->GetDexCacheArrayOffset());
-      __ Ldr(XRegisterFrom(temp).X(), MemOperand(XRegisterFrom(temp).X(), 0));
-      __ Bind(&pc_rel_dex_cache_patches_.back().label);  // Bind after LDR.
-      pc_rel_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
+      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
+                                                  invoke->GetDexCacheArrayOffset());
+      {
+        vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+        __ Bind(&pc_relative_dex_cache_patches_.back().label);
+        __ ldr(XRegisterFrom(temp), MemOperand(XRegisterFrom(temp), 0));
+        pc_relative_dex_cache_patches_.back().pc_insn_label = pc_insn_label;
+      }
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register reg = XRegisterFrom(temp);
       Register method_reg;
       if (current_method.IsRegister()) {
@@ -2920,8 +2990,9 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok
     case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative: {
       relative_call_patches_.emplace_back(invoke->GetTargetMethod());
       vixl::Label* label = &relative_call_patches_.back().label;
-      __ Bl(label);  // Arbitrarily branch to the instruction after BL, override at link time.
-      __ Bind(label);  // Bind after BL.
+      vixl::SingleEmissionCheckScope guard(GetVIXLAssembler());
+      __ Bind(label);
+      __ bl(0);  // Branch and link to itself. This will be overridden at link time.
       break;
     }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirectWithFixup:
@@ -2934,7 +3005,7 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // LR = callee_method->entry_point_from_quick_compiled_code_;
       __ Ldr(lr, MemOperand(
-          XRegisterFrom(callee_method).X(),
+          XRegisterFrom(callee_method),
           ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize).Int32Value()));
       // lr()
       __ Blr(lr);
@@ -2973,7 +3044,7 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc
       method_patches_.size() +
       call_patches_.size() +
       relative_call_patches_.size() +
-      pc_rel_dex_cache_patches_.size();
+      pc_relative_dex_cache_patches_.size();
   linker_patches->reserve(size);
   for (const auto& entry : method_patches_) {
     const MethodReference& target_method = entry.first;
@@ -2990,14 +3061,14 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc
                                                      target_method.dex_method_index));
   }
   for (const MethodPatchInfo<vixl::Label>& info : relative_call_patches_) {
-    linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.location() - 4u,
+    linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.location(),
                                                              info.target_method.dex_file,
                                                              info.target_method.dex_method_index));
   }
-  for (const PcRelativeDexCacheAccessInfo& info : pc_rel_dex_cache_patches_) {
-    linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.location() - 4u,
+  for (const PcRelativeDexCacheAccessInfo& info : pc_relative_dex_cache_patches_) {
+    linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.location(),
                                                               &info.target_dex_file,
-                                                              info.pc_insn_label->location() - 4u,
+                                                              info.pc_insn_label->location(),
                                                               info.element_offset));
   }
 }
@@ -3810,26 +3881,73 @@ void LocationsBuilderARM64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
 
 void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
   int32_t lower_bound = switch_instr->GetStartValue();
-  int32_t num_entries = switch_instr->GetNumEntries();
+  uint32_t num_entries = switch_instr->GetNumEntries();
   Register value_reg = InputRegisterAt(switch_instr, 0);
   HBasicBlock* default_block = switch_instr->GetDefaultBlock();
 
-  // Create a series of compare/jumps.
-  const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
-  for (int32_t i = 0; i < num_entries; i++) {
-    int32_t case_value = lower_bound + i;
-    vixl::Label* succ = codegen_->GetLabelOf(successors[i]);
-    if (case_value == 0) {
-      __ Cbz(value_reg, succ);
-    } else {
-      __ Cmp(value_reg, vixl::Operand(case_value));
-      __ B(eq, succ);
+  // Roughly assume an upper bound of 16 instructions generated per HIR, on average, in a graph.
+  static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * vixl::kInstructionSize;
+  // ADR has a limited range (+/-1MB), so we set a threshold for the number of HIRs in the graph
+  // to make sure we don't emit it if the target may run out of range.
+  // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR
+  // ranges and emit the tables only as required.
+  static constexpr int32_t kJumpTableInstructionThreshold = 1* MB / kMaxExpectedSizePerHInstruction;
+
+  if (num_entries < kPackedSwitchJumpTableThreshold ||
+      // Current instruction id is an upper bound of the number of HIRs in the graph.
+      GetGraph()->GetCurrentInstructionId() > kJumpTableInstructionThreshold) {
+    // Create a series of compare/jumps.
+    const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
+    for (uint32_t i = 0; i < num_entries; i++) {
+      int32_t case_value = lower_bound + i;
+      vixl::Label* succ = codegen_->GetLabelOf(successors[i]);
+      if (case_value == 0) {
+        __ Cbz(value_reg, succ);
+      } else {
+        __ Cmp(value_reg, Operand(case_value));
+        __ B(eq, succ);
+      }
     }
-  }
 
-  // And the default for any other value.
-  if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
-    __ B(codegen_->GetLabelOf(default_block));
+    // And the default for any other value.
+    if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
+      __ B(codegen_->GetLabelOf(default_block));
+    }
+  } else {
+    JumpTableARM64* jump_table = new (GetGraph()->GetArena()) JumpTableARM64(switch_instr);
+    codegen_->AddJumpTable(jump_table);
+
+    UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
+
+    // The instructions below should use at most one blocked register. Since there are two blocked
+    // registers, we are free to block one.
+    Register temp_w = temps.AcquireW();
+    Register index;
+    // Remove the bias.
+    if (lower_bound != 0) {
+      index = temp_w;
+      __ Sub(index, value_reg, Operand(lower_bound));
+    } else {
+      index = value_reg;
+    }
+
+    // Jump to default block if index is out of the range.
+    __ Cmp(index, Operand(num_entries));
+    __ B(hs, codegen_->GetLabelOf(default_block));
+
+    // In current VIXL implementation, it won't require any blocked registers to encode the
+    // immediate value for Adr. So we are free to use both VIXL blocked registers to reduce the
+    // register pressure.
+    Register table_base = temps.AcquireX();
+    // Load jump offset from the table.
+    __ Adr(table_base, jump_table->GetTableStartLabel());
+    Register jump_offset = temp_w;
+    __ Ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
+
+    // Jump to target block by branching to table_base (PC-relative) + offset.
+    Register target_address = table_base;
+    __ Add(target_address, table_base, Operand(jump_offset, SXTW));
+    __ Br(target_address);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index ab684ea538..881afcc123 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -81,6 +81,22 @@ class SlowPathCodeARM64 : public SlowPathCode {
   DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64);
 };
 
+class JumpTableARM64 : public ArenaObject<kArenaAllocSwitchTable> {
+ public:
+  explicit JumpTableARM64(HPackedSwitch* switch_instr)
+    : switch_instr_(switch_instr), table_start_() {}
+
+  vixl::Label* GetTableStartLabel() { return &table_start_; }
+
+  void EmitTable(CodeGeneratorARM64* codegen);
+
+ private:
+  HPackedSwitch* const switch_instr_;
+  vixl::Label table_start_;
+
+  DISALLOW_COPY_AND_ASSIGN(JumpTableARM64);
+};
+
 static const vixl::Register kRuntimeParameterCoreRegisters[] =
     { vixl::x0, vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
@@ -203,9 +219,9 @@ class InstructionCodeGeneratorARM64 : public HGraphVisitor {
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              vixl::Label* true_target,
-                             vixl::Label* false_target,
-                             vixl::Label* always_true_target);
+                             vixl::Label* false_target);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
   void DivRemByPowerOfTwo(HBinaryOperation* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
@@ -358,6 +374,10 @@ class CodeGeneratorARM64 : public CodeGenerator {
     block_labels_ = CommonInitializeLabels<vixl::Label>();
   }
 
+  void AddJumpTable(JumpTableARM64* jump_table) {
+    jump_tables_.push_back(jump_table);
+  }
+
   void Finalize(CodeAllocator* allocator) OVERRIDE;
 
   // Code generation helpers.
@@ -422,15 +442,16 @@ class CodeGeneratorARM64 : public CodeGenerator {
 
     const DexFile& target_dex_file;
     uint32_t element_offset;
-    // NOTE: Labels are bound to the end of the patched instruction because
-    // we don't know if there will be a veneer or how big it will be.
     vixl::Label label;
     vixl::Label* pc_insn_label;
   };
 
+  void EmitJumpTables();
+
   // Labels for each block that will be compiled.
   vixl::Label* block_labels_;  // Indexed by block id.
   vixl::Label frame_entry_label_;
+  ArenaVector<JumpTableARM64*> jump_tables_;
 
   LocationsBuilderARM64 location_builder_;
   InstructionCodeGeneratorARM64 instruction_visitor_;
@@ -447,7 +468,7 @@ class CodeGeneratorARM64 : public CodeGenerator {
   // Using ArenaDeque<> which retains element addresses on push/emplace_back().
   ArenaDeque<MethodPatchInfo<vixl::Label>> relative_call_patches_;
   // PC-relative DexCache access info.
-  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_rel_dex_cache_patches_;
+  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM64);
 };
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 959adb4238..801e203de5 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -2420,30 +2420,51 @@ void InstructionCodeGeneratorMIPS::VisitTryBoundary(HTryBoundary* try_boundary)
 }
 
 void InstructionCodeGeneratorMIPS::GenerateTestAndBranch(HInstruction* instruction,
+                                                         size_t condition_input_index,
                                                          MipsLabel* true_target,
-                                                         MipsLabel* false_target,
-                                                         MipsLabel* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  HCondition* condition = cond->AsCondition();
-
-  if (cond->IsIntConstant()) {
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ B(always_true_target);
+                                                         MipsLabel* false_target) {
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
+    // Constant condition, statically compared against 1.
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ B(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ B(false_target);
+      }
     }
-  } else if (!cond->IsCondition() || condition->NeedsMaterialization()) {
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
     // The condition instruction has been materialized, compare the output to 0.
-    Location cond_val = instruction->GetLocations()->InAt(0);
+    Location cond_val = instruction->GetLocations()->InAt(condition_input_index);
     DCHECK(cond_val.IsRegister());
-    __ Bnez(cond_val.AsRegister<Register>(), true_target);
+      if (true_target == nullptr) {
+      __ Beqz(cond_val.AsRegister<Register>(), false_target);
+    } else {
+      __ Bnez(cond_val.AsRegister<Register>(), true_target);
+    }
   } else {
     // The condition instruction has not been materialized, use its inputs as
     // the comparison and its condition as the branch condition.
+    HCondition* condition = cond->AsCondition();
+
     Register lhs = condition->GetLocations()->InAt(0).AsRegister<Register>();
     Location rhs_location = condition->GetLocations()->InAt(1);
     Register rhs_reg = ZERO;
@@ -2455,37 +2476,46 @@ void InstructionCodeGeneratorMIPS::GenerateTestAndBranch(HInstruction* instructi
       rhs_reg = rhs_location.AsRegister<Register>();
     }
 
-    IfCondition if_cond = condition->GetCondition();
+    IfCondition if_cond;
+    MipsLabel* non_fallthrough_target;
+    if (true_target == nullptr) {
+      if_cond = condition->GetOppositeCondition();
+      non_fallthrough_target = false_target;
+    } else {
+      if_cond = condition->GetCondition();
+      non_fallthrough_target = true_target;
+    }
+
     if (use_imm && rhs_imm == 0) {
       switch (if_cond) {
         case kCondEQ:
-          __ Beqz(lhs, true_target);
+          __ Beqz(lhs, non_fallthrough_target);
           break;
         case kCondNE:
-          __ Bnez(lhs, true_target);
+          __ Bnez(lhs, non_fallthrough_target);
           break;
         case kCondLT:
-          __ Bltz(lhs, true_target);
+          __ Bltz(lhs, non_fallthrough_target);
           break;
         case kCondGE:
-          __ Bgez(lhs, true_target);
+          __ Bgez(lhs, non_fallthrough_target);
           break;
         case kCondLE:
-          __ Blez(lhs, true_target);
+          __ Blez(lhs, non_fallthrough_target);
           break;
         case kCondGT:
-          __ Bgtz(lhs, true_target);
+          __ Bgtz(lhs, non_fallthrough_target);
           break;
         case kCondB:
           break;  // always false
         case kCondBE:
-          __ Beqz(lhs, true_target);  // <= 0 if zero
+          __ Beqz(lhs, non_fallthrough_target);  // <= 0 if zero
           break;
         case kCondA:
-          __ Bnez(lhs, true_target);  // > 0 if non-zero
+          __ Bnez(lhs, non_fallthrough_target);  // > 0 if non-zero
           break;
         case kCondAE:
-          __ B(true_target);  // always true
+          __ B(non_fallthrough_target);  // always true
           break;
       }
     } else {
@@ -2496,81 +2526,78 @@ void InstructionCodeGeneratorMIPS::GenerateTestAndBranch(HInstruction* instructi
       }
       switch (if_cond) {
         case kCondEQ:
-          __ Beq(lhs, rhs_reg, true_target);
+          __ Beq(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondNE:
-          __ Bne(lhs, rhs_reg, true_target);
+          __ Bne(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondLT:
-          __ Blt(lhs, rhs_reg, true_target);
+          __ Blt(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondGE:
-          __ Bge(lhs, rhs_reg, true_target);
+          __ Bge(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondLE:
-          __ Bge(rhs_reg, lhs, true_target);
+          __ Bge(rhs_reg, lhs, non_fallthrough_target);
           break;
         case kCondGT:
-          __ Blt(rhs_reg, lhs, true_target);
+          __ Blt(rhs_reg, lhs, non_fallthrough_target);
           break;
         case kCondB:
-          __ Bltu(lhs, rhs_reg, true_target);
+          __ Bltu(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondAE:
-          __ Bgeu(lhs, rhs_reg, true_target);
+          __ Bgeu(lhs, rhs_reg, non_fallthrough_target);
           break;
         case kCondBE:
-          __ Bgeu(rhs_reg, lhs, true_target);
+          __ Bgeu(rhs_reg, lhs, non_fallthrough_target);
           break;
         case kCondA:
-          __ Bltu(rhs_reg, lhs, true_target);
+          __ Bltu(rhs_reg, lhs, non_fallthrough_target);
           break;
       }
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted (case 2) and we need to emit a jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ B(false_target);
   }
 }
 
 void LocationsBuilderMIPS::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorMIPS::VisitIf(HIf* if_instr) {
-  MipsLabel* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  MipsLabel* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  MipsLabel* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  MipsLabel* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);
+  MipsLabel* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderMIPS::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorMIPS::VisitDeoptimize(HDeoptimize* deoptimize) {
-  SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena())
-      DeoptimizationSlowPathMIPS(deoptimize);
+  SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) DeoptimizationSlowPathMIPS(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  MipsLabel* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) {
@@ -3004,7 +3031,7 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke
                         invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress());
@@ -3016,7 +3043,7 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register reg = temp.AsRegister<Register>();
       Register method_reg;
       if (current_method.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 059131dcfc..e3a2cb40ef 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -226,9 +226,9 @@ class InstructionCodeGeneratorMIPS : public HGraphVisitor {
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              MipsLabel* true_target,
-                             MipsLabel* false_target,
-                             MipsLabel* always_true_target);
+                             MipsLabel* false_target);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
   MipsAssembler* const assembler_;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index d4fcaf9321..7b33075358 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -16,13 +16,13 @@
 
 #include "code_generator_mips64.h"
 
+#include "art_method.h"
+#include "code_generator_utils.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
 #include "intrinsics_mips64.h"
-#include "art_method.h"
-#include "code_generator_utils.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "offsets.h"
@@ -420,7 +420,7 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph,
     : CodeGenerator(graph,
                     kNumberOfGpuRegisters,
                     kNumberOfFpuRegisters,
-                    0,  // kNumberOfRegisterPairs
+                    /* number_of_register_pairs */ 0,
                     ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
                                         arraysize(kCoreCalleeSaves)),
                     ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
@@ -666,9 +666,19 @@ void CodeGeneratorMIPS64::MoveLocation(Location destination,
         gpr = destination.AsRegister<GpuRegister>();
       }
       if (dst_type == Primitive::kPrimInt || dst_type == Primitive::kPrimFloat) {
-        __ LoadConst32(gpr, GetInt32ValueOf(source.GetConstant()->AsConstant()));
+        int32_t value = GetInt32ValueOf(source.GetConstant()->AsConstant());
+        if (Primitive::IsFloatingPointType(dst_type) && value == 0) {
+          gpr = ZERO;
+        } else {
+          __ LoadConst32(gpr, value);
+        }
       } else {
-        __ LoadConst64(gpr, GetInt64ValueOf(source.GetConstant()->AsConstant()));
+        int64_t value = GetInt64ValueOf(source.GetConstant()->AsConstant());
+        if (Primitive::IsFloatingPointType(dst_type) && value == 0) {
+          gpr = ZERO;
+        } else {
+          __ LoadConst64(gpr, value);
+        }
       }
       if (dst_type == Primitive::kPrimFloat) {
         __ Mtc1(gpr, destination.AsFpuRegister<FpuRegister>());
@@ -734,12 +744,22 @@ void CodeGeneratorMIPS64::MoveLocation(Location destination,
       // Move to stack from constant
       HConstant* src_cst = source.GetConstant();
       StoreOperandType store_type = destination.IsStackSlot() ? kStoreWord : kStoreDoubleword;
+      GpuRegister gpr = ZERO;
       if (destination.IsStackSlot()) {
-        __ LoadConst32(TMP, GetInt32ValueOf(src_cst->AsConstant()));
+        int32_t value = GetInt32ValueOf(src_cst->AsConstant());
+        if (value != 0) {
+          gpr = TMP;
+          __ LoadConst32(gpr, value);
+        }
       } else {
-        __ LoadConst64(TMP, GetInt64ValueOf(src_cst->AsConstant()));
+        DCHECK(destination.IsDoubleStackSlot());
+        int64_t value = GetInt64ValueOf(src_cst->AsConstant());
+        if (value != 0) {
+          gpr = TMP;
+          __ LoadConst64(gpr, value);
+        }
       }
-      __ StoreToOffset(store_type, TMP, SP, destination.GetStackIndex());
+      __ StoreToOffset(store_type, gpr, SP, destination.GetStackIndex());
     } else {
       DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot());
       DCHECK_EQ(source.IsDoubleStackSlot(), destination.IsDoubleStackSlot());
@@ -755,9 +775,7 @@ void CodeGeneratorMIPS64::MoveLocation(Location destination,
   }
 }
 
-void CodeGeneratorMIPS64::SwapLocations(Location loc1,
-                                        Location loc2,
-                                        Primitive::Type type ATTRIBUTE_UNUSED) {
+void CodeGeneratorMIPS64::SwapLocations(Location loc1, Location loc2, Primitive::Type type) {
   DCHECK(!loc1.IsConstant());
   DCHECK(!loc2.IsConstant());
 
@@ -781,12 +799,16 @@ void CodeGeneratorMIPS64::SwapLocations(Location loc1,
     // Swap 2 FPRs
     FpuRegister r1 = loc1.AsFpuRegister<FpuRegister>();
     FpuRegister r2 = loc2.AsFpuRegister<FpuRegister>();
-    // TODO: Can MOV.S/MOV.D be used here to save one instruction?
-    // Need to distinguish float from double, right?
-    __ Dmfc1(TMP, r2);
-    __ Dmfc1(AT, r1);
-    __ Dmtc1(TMP, r1);
-    __ Dmtc1(AT, r2);
+    if (type == Primitive::kPrimFloat) {
+      __ MovS(FTMP, r1);
+      __ MovS(r1, r2);
+      __ MovS(r2, FTMP);
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ MovD(FTMP, r1);
+      __ MovD(r1, r2);
+      __ MovD(r2, FTMP);
+    }
   } else if (is_slot1 != is_slot2) {
     // Swap GPR/FPR and stack slot
     Location reg_loc = is_slot1 ? loc2 : loc1;
@@ -800,7 +822,6 @@ void CodeGeneratorMIPS64::SwapLocations(Location loc1,
                           reg_loc.AsFpuRegister<FpuRegister>(),
                           SP,
                           mem_loc.GetStackIndex());
-      // TODO: review this MTC1/DMTC1 move
       if (mem_loc.IsStackSlot()) {
         __ Mtc1(TMP, reg_loc.AsFpuRegister<FpuRegister>());
       } else {
@@ -845,12 +866,22 @@ void CodeGeneratorMIPS64::Move(HInstruction* instruction,
     } else {
       DCHECK(location.IsStackSlot() || location.IsDoubleStackSlot());
       // Move to stack from constant
+      GpuRegister gpr = ZERO;
       if (location.IsStackSlot()) {
-        __ LoadConst32(TMP, GetInt32ValueOf(instruction->AsConstant()));
-        __ StoreToOffset(kStoreWord, TMP, SP, location.GetStackIndex());
+        int32_t value = GetInt32ValueOf(instruction->AsConstant());
+        if (value != 0) {
+          gpr = TMP;
+          __ LoadConst32(gpr, value);
+        }
+        __ StoreToOffset(kStoreWord, gpr, SP, location.GetStackIndex());
       } else {
-        __ LoadConst64(TMP, instruction->AsLongConstant()->GetValue());
-        __ StoreToOffset(kStoreDoubleword, TMP, SP, location.GetStackIndex());
+        DCHECK(location.IsDoubleStackSlot());
+        int64_t value = instruction->AsLongConstant()->GetValue();
+        if (value != 0) {
+          gpr = TMP;
+          __ LoadConst64(gpr, value);
+        }
+        __ StoreToOffset(kStoreDoubleword, gpr, SP, location.GetStackIndex());
       }
     }
   } else if (instruction->IsTemporary()) {
@@ -1198,7 +1229,7 @@ void LocationsBuilderMIPS64::HandleShift(HBinaryOperation* instr) {
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(instr->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
     default:
@@ -1707,7 +1738,7 @@ void LocationsBuilderMIPS64::VisitCompare(HCompare* compare) {
   switch (in_type) {
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(compare->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -1736,8 +1767,18 @@ void InstructionCodeGeneratorMIPS64::VisitCompare(HCompare* instruction) {
     case Primitive::kPrimLong: {
       GpuRegister dst = locations->Out().AsRegister<GpuRegister>();
       GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>();
-      GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>();
-      // TODO: more efficient (direct) comparison with a constant
+      Location rhs_location = locations->InAt(1);
+      bool use_imm = rhs_location.IsConstant();
+      GpuRegister rhs = ZERO;
+      if (use_imm) {
+        int64_t value = CodeGenerator::GetInt64ValueOf(rhs_location.GetConstant()->AsConstant());
+        if (value != 0) {
+          rhs = AT;
+          __ LoadConst64(rhs, value);
+        }
+      } else {
+        rhs = rhs_location.AsRegister<GpuRegister>();
+      }
       __ Slt(TMP, lhs, rhs);
       __ Slt(dst, rhs, lhs);
       __ Subu(dst, dst, TMP);
@@ -1902,6 +1943,252 @@ void InstructionCodeGeneratorMIPS64::VisitCondition(HCondition* instruction) {
   }
 }
 
+void InstructionCodeGeneratorMIPS64::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());  // Emits div/rem by a constant +/-1.
+  Primitive::Type type = instruction->GetResultType();
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);  // The divisor; must be a constant (checked below).
+  DCHECK(second.IsConstant());
+
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  GpuRegister dividend = locations->InAt(0).AsRegister<GpuRegister>();
+  int64_t imm = Int64FromConstant(second.GetConstant());
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ Move(out, ZERO);  // x % 1 == x % -1 == 0.
+  } else {
+    if (imm == -1) {
+      if (type == Primitive::kPrimInt) {
+        __ Subu(out, ZERO, dividend);  // x / -1 == 0 - x (int).
+      } else {
+        DCHECK_EQ(type, Primitive::kPrimLong);
+        __ Dsubu(out, ZERO, dividend);  // x / -1 == 0 - x (long).
+      }
+    } else if (out != dividend) {  // x / 1 == x: a move is only needed if the registers differ.
+      __ Move(out, dividend);
+    }
+  }
+}
+
+void InstructionCodeGeneratorMIPS64::DivRemByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());  // Emits div/rem by +/-2^ctz_imm.
+  Primitive::Type type = instruction->GetResultType();
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);  // The divisor; must be a constant (checked below).
+  DCHECK(second.IsConstant());
+
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  GpuRegister dividend = locations->InAt(0).AsRegister<GpuRegister>();
+  int64_t imm = Int64FromConstant(second.GetConstant());  // May be INT64_MIN: std::abs is UB.
+  uint64_t abs_imm = (imm < 0) ? -static_cast<uint64_t>(imm) : static_cast<uint64_t>(imm);
+  DCHECK(IsPowerOfTwo(abs_imm));
+  int ctz_imm = CTZ(abs_imm);  // Exponent k of the divisor magnitude 2^k.
+
+  if (instruction->IsDiv()) {
+    if (type == Primitive::kPrimInt) {
+      if (ctz_imm == 1) {
+        // Fast path for division by +/-2, which is very common.
+        __ Srl(TMP, dividend, 31);
+      } else {
+        __ Sra(TMP, dividend, 31);
+        __ Srl(TMP, TMP, 32 - ctz_imm);
+      }
+      __ Addu(out, dividend, TMP);
+      __ Sra(out, out, ctz_imm);
+      if (imm < 0) {  // Negative divisor: negate the quotient.
+        __ Subu(out, ZERO, out);
+      }
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimLong);
+      if (ctz_imm == 1) {
+        // Fast path for division by +/-2, which is very common.
+        __ Dsrl32(TMP, dividend, 31);
+      } else {
+        __ Dsra32(TMP, dividend, 31);
+        if (ctz_imm > 32) {
+          __ Dsrl(TMP, TMP, 64 - ctz_imm);
+        } else {
+          __ Dsrl32(TMP, TMP, 32 - ctz_imm);
+        }
+      }
+      __ Daddu(out, dividend, TMP);
+      if (ctz_imm < 32) {
+        __ Dsra(out, out, ctz_imm);
+      } else {
+        __ Dsra32(out, out, ctz_imm - 32);
+      }
+      if (imm < 0) {  // Negative divisor: negate the quotient.
+        __ Dsubu(out, ZERO, out);
+      }
+    }
+  } else {
+    if (type == Primitive::kPrimInt) {
+      if (ctz_imm == 1) {
+        // Fast path for modulo +/-2, which is very common.
+        __ Sra(TMP, dividend, 31);
+        __ Subu(out, dividend, TMP);
+        __ Andi(out, out, 1);
+        __ Addu(out, out, TMP);
+      } else {
+        __ Sra(TMP, dividend, 31);
+        __ Srl(TMP, TMP, 32 - ctz_imm);
+        __ Addu(out, dividend, TMP);
+        if (IsUint<16>(abs_imm - 1)) {  // Mask fits the 16-bit Andi immediate.
+          __ Andi(out, out, abs_imm - 1);
+        } else {
+          __ Sll(out, out, 32 - ctz_imm);
+          __ Srl(out, out, 32 - ctz_imm);
+        }
+        __ Subu(out, out, TMP);
+      }
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimLong);
+      if (ctz_imm == 1) {
+        // Fast path for modulo +/-2, which is very common.
+        __ Dsra32(TMP, dividend, 31);
+        __ Dsubu(out, dividend, TMP);
+        __ Andi(out, out, 1);
+        __ Daddu(out, out, TMP);
+      } else {
+        __ Dsra32(TMP, dividend, 31);
+        if (ctz_imm > 32) {
+          __ Dsrl(TMP, TMP, 64 - ctz_imm);
+        } else {
+          __ Dsrl32(TMP, TMP, 32 - ctz_imm);
+        }
+        __ Daddu(out, dividend, TMP);
+        if (IsUint<16>(abs_imm - 1)) {  // Mask fits the 16-bit Andi immediate.
+          __ Andi(out, out, abs_imm - 1);
+        } else {
+          if (ctz_imm > 32) {
+            __ Dsll(out, out, 64 - ctz_imm);
+            __ Dsrl(out, out, 64 - ctz_imm);
+          } else {
+            __ Dsll32(out, out, 32 - ctz_imm);
+            __ Dsrl32(out, out, 32 - ctz_imm);
+          }
+        }
+        __ Dsubu(out, out, TMP);
+      }
+    }
+  }
+}
+
+void InstructionCodeGeneratorMIPS64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());  // Emits div/rem by a constant divisor.
+
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);  // The divisor; must be a constant (checked below).
+  DCHECK(second.IsConstant());
+
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  GpuRegister dividend = locations->InAt(0).AsRegister<GpuRegister>();
+  int64_t imm = Int64FromConstant(second.GetConstant());
+
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << type;
+
+  int64_t magic;  // Filled in by CalculateMagicAndShiftForDivRem() below.
+  int shift;  // Filled in by CalculateMagicAndShiftForDivRem() below.
+  CalculateMagicAndShiftForDivRem(imm,
+                                  (type == Primitive::kPrimLong),
+                                  &magic,
+                                  &shift);
+
+  if (type == Primitive::kPrimInt) {
+    __ LoadConst32(TMP, magic);
+    __ MuhR6(TMP, dividend, TMP);  // NOTE(review): presumably the high word of dividend * magic.
+
+    if (imm > 0 && magic < 0) {  // Correction when divisor and magic have opposite signs.
+      __ Addu(TMP, TMP, dividend);
+    } else if (imm < 0 && magic > 0) {
+      __ Subu(TMP, TMP, dividend);
+    }
+
+    if (shift != 0) {
+      __ Sra(TMP, TMP, shift);
+    }
+
+    if (instruction->IsDiv()) {
+      __ Sra(out, TMP, 31);
+      __ Subu(out, TMP, out);  // out = TMP - (TMP >> 31).
+    } else {
+      __ Sra(AT, TMP, 31);
+      __ Subu(AT, TMP, AT);  // AT = TMP - (TMP >> 31), the same value the div path produces.
+      __ LoadConst32(TMP, imm);
+      __ MulR6(TMP, AT, TMP);
+      __ Subu(out, dividend, TMP);  // out = dividend - AT * imm.
+    }
+  } else {
+    __ LoadConst64(TMP, magic);
+    __ Dmuh(TMP, dividend, TMP);  // NOTE(review): presumably the high half of dividend * magic.
+
+    if (imm > 0 && magic < 0) {  // Correction when divisor and magic have opposite signs.
+      __ Daddu(TMP, TMP, dividend);
+    } else if (imm < 0 && magic > 0) {
+      __ Dsubu(TMP, TMP, dividend);
+    }
+
+    if (shift >= 32) {  // Shifts of 32 or more use the Dsra32 form, which is passed shift - 32.
+      __ Dsra32(TMP, TMP, shift - 32);
+    } else if (shift > 0) {
+      __ Dsra(TMP, TMP, shift);
+    }
+
+    if (instruction->IsDiv()) {
+      __ Dsra32(out, TMP, 31);
+      __ Dsubu(out, TMP, out);  // out = TMP - (TMP >> 63).
+    } else {
+      __ Dsra32(AT, TMP, 31);
+      __ Dsubu(AT, TMP, AT);  // AT = TMP - (TMP >> 63), the same value the div path produces.
+      __ LoadConst64(TMP, imm);
+      __ Dmul(TMP, AT, TMP);
+      __ Dsubu(out, dividend, TMP);  // out = dividend - AT * imm.
+    }
+  }
+}
+
+void InstructionCodeGeneratorMIPS64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());  // Dispatches int/long div and rem.
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << type;
+
+  LocationSummary* locations = instruction->GetLocations();
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  Location second = locations->InAt(1);  // The divisor: either a constant or a register.
+
+  if (second.IsConstant()) {
+    int64_t imm = Int64FromConstant(second.GetConstant());
+    if (imm == 0) {
+      // Do not generate anything. DivZeroCheck would prevent any code from being executed.
+    } else if (imm == 1 || imm == -1) {
+      DivRemOneOrMinusOne(instruction);
+    } else if (IsPowerOfTwo(imm < 0 ? -static_cast<uint64_t>(imm) : static_cast<uint64_t>(imm))) {
+      DivRemByPowerOfTwo(instruction);  // |imm| is taken unsigned: std::abs(INT64_MIN) is UB.
+    } else {
+      DCHECK(imm <= -2 || imm >= 2);
+      GenerateDivRemWithAnyConstant(instruction);
+    }
+  } else {  // Register divisor: emit a single div/mod instruction of the matching width.
+    GpuRegister dividend = locations->InAt(0).AsRegister<GpuRegister>();
+    GpuRegister divisor = second.AsRegister<GpuRegister>();
+    if (instruction->IsDiv()) {
+      if (type == Primitive::kPrimInt)
+        __ DivR6(out, dividend, divisor);
+      else
+        __ Ddiv(out, dividend, divisor);
+    } else {
+      if (type == Primitive::kPrimInt)
+        __ ModR6(out, dividend, divisor);
+      else
+        __ Dmod(out, dividend, divisor);
+    }
+  }
+}
+
 void LocationsBuilderMIPS64::VisitDiv(HDiv* div) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(div, LocationSummary::kNoCall);
@@ -1909,7 +2196,7 @@ void LocationsBuilderMIPS64::VisitDiv(HDiv* div) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -1931,16 +2218,9 @@ void InstructionCodeGeneratorMIPS64::VisitDiv(HDiv* instruction) {
 
   switch (type) {
     case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      GpuRegister dst = locations->Out().AsRegister<GpuRegister>();
-      GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>();
-      GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>();
-      if (type == Primitive::kPrimInt)
-        __ DivR6(dst, lhs, rhs);
-      else
-        __ Ddiv(dst, lhs, rhs);
+    case Primitive::kPrimLong:
+      GenerateDivRemIntegral(instruction);
       break;
-    }
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>();
@@ -2060,30 +2340,51 @@ void InstructionCodeGeneratorMIPS64::VisitTryBoundary(HTryBoundary* try_boundary
 }
 
 void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruction,
+                                                           size_t condition_input_index,
                                                            Label* true_target,
-                                                           Label* false_target,
-                                                           Label* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  HCondition* condition = cond->AsCondition();
-
-  if (cond->IsIntConstant()) {
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ B(always_true_target);
+                                                           Label* false_target) {
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
+    // Constant condition, statically compared against 1.
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ B(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ B(false_target);
+      }
     }
-  } else if (!cond->IsCondition() || condition->NeedsMaterialization()) {
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
     // The condition instruction has been materialized, compare the output to 0.
-    Location cond_val = instruction->GetLocations()->InAt(0);
+    Location cond_val = instruction->GetLocations()->InAt(condition_input_index);
     DCHECK(cond_val.IsRegister());
-    __ Bnezc(cond_val.AsRegister<GpuRegister>(), true_target);
+    if (true_target == nullptr) {
+      __ Beqzc(cond_val.AsRegister<GpuRegister>(), false_target);
+    } else {
+      __ Bnezc(cond_val.AsRegister<GpuRegister>(), true_target);
+    }
   } else {
     // The condition instruction has not been materialized, use its inputs as
     // the comparison and its condition as the branch condition.
+    HCondition* condition = cond->AsCondition();
+
     GpuRegister lhs = condition->GetLocations()->InAt(0).AsRegister<GpuRegister>();
     Location rhs_location = condition->GetLocations()->InAt(1);
     GpuRegister rhs_reg = ZERO;
@@ -2095,37 +2396,46 @@ void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruc
       rhs_reg = rhs_location.AsRegister<GpuRegister>();
     }
 
-    IfCondition if_cond = condition->GetCondition();
+    IfCondition if_cond;
+    Label* non_fallthrough_target;
+    if (true_target == nullptr) {
+      if_cond = condition->GetOppositeCondition();
+      non_fallthrough_target = false_target;
+    } else {
+      if_cond = condition->GetCondition();
+      non_fallthrough_target = true_target;
+    }
+
     if (use_imm && rhs_imm == 0) {
       switch (if_cond) {
         case kCondEQ:
-          __ Beqzc(lhs, true_target);
+          __ Beqzc(lhs, non_fallthrough_target);
           break;
         case kCondNE:
-          __ Bnezc(lhs, true_target);
+          __ Bnezc(lhs, non_fallthrough_target);
           break;
         case kCondLT:
-          __ Bltzc(lhs, true_target);
+          __ Bltzc(lhs, non_fallthrough_target);
           break;
         case kCondGE:
-          __ Bgezc(lhs, true_target);
+          __ Bgezc(lhs, non_fallthrough_target);
           break;
         case kCondLE:
-          __ Blezc(lhs, true_target);
+          __ Blezc(lhs, non_fallthrough_target);
           break;
         case kCondGT:
-          __ Bgtzc(lhs, true_target);
+          __ Bgtzc(lhs, non_fallthrough_target);
           break;
         case kCondB:
           break;  // always false
         case kCondBE:
-          __ Beqzc(lhs, true_target);  // <= 0 if zero
+          __ Beqzc(lhs, non_fallthrough_target);  // <= 0 if zero
           break;
         case kCondA:
-          __ Bnezc(lhs, true_target);  // > 0 if non-zero
+          __ Bnezc(lhs, non_fallthrough_target);  // > 0 if non-zero
           break;
         case kCondAE:
-          __ B(true_target);  // always true
+          __ B(non_fallthrough_target);  // always true
           break;
       }
     } else {
@@ -2144,7 +2454,7 @@ void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruc
           case kCondBE:
           case kCondAE:
             // if lhs == rhs for a positive condition, then it is a branch
-            __ B(true_target);
+            __ B(non_fallthrough_target);
             break;
           case kCondNE:
           case kCondLT:
@@ -2157,72 +2467,68 @@ void InstructionCodeGeneratorMIPS64::GenerateTestAndBranch(HInstruction* instruc
       } else {
         switch (if_cond) {
           case kCondEQ:
-            __ Beqc(lhs, rhs_reg, true_target);
+            __ Beqc(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondNE:
-            __ Bnec(lhs, rhs_reg, true_target);
+            __ Bnec(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondLT:
-            __ Bltc(lhs, rhs_reg, true_target);
+            __ Bltc(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondGE:
-            __ Bgec(lhs, rhs_reg, true_target);
+            __ Bgec(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondLE:
-            __ Bgec(rhs_reg, lhs, true_target);
+            __ Bgec(rhs_reg, lhs, non_fallthrough_target);
             break;
           case kCondGT:
-            __ Bltc(rhs_reg, lhs, true_target);
+            __ Bltc(rhs_reg, lhs, non_fallthrough_target);
             break;
           case kCondB:
-            __ Bltuc(lhs, rhs_reg, true_target);
+            __ Bltuc(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondAE:
-            __ Bgeuc(lhs, rhs_reg, true_target);
+            __ Bgeuc(lhs, rhs_reg, non_fallthrough_target);
             break;
           case kCondBE:
-            __ Bgeuc(rhs_reg, lhs, true_target);
+            __ Bgeuc(rhs_reg, lhs, non_fallthrough_target);
             break;
           case kCondA:
-            __ Bltuc(rhs_reg, lhs, true_target);
+            __ Bltuc(rhs_reg, lhs, non_fallthrough_target);
             break;
         }
       }
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted (case 2) and we need to emit a jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ B(false_target);
   }
 }
 
 void LocationsBuilderMIPS64::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorMIPS64::VisitIf(HIf* if_instr) {
-  Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  Label* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);
+  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderMIPS64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
@@ -2231,8 +2537,10 @@ void InstructionCodeGeneratorMIPS64::VisitDeoptimize(HDeoptimize* deoptimize) {
   SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena())
       DeoptimizationSlowPathMIPS64(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  Label* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderMIPS64::HandleFieldGet(HInstruction* instruction,
@@ -2512,10 +2820,12 @@ void LocationsBuilderMIPS64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* in
   // allocation of a register for the current method pointer like on x86 baseline.
   // TODO: remove this once all the issues with register saving/restoring are
   // sorted out.
-  LocationSummary* locations = invoke->GetLocations();
-  Location location = locations->InAt(invoke->GetCurrentMethodInputIndex());
-  if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
-    locations->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::NoLocation());
+  if (invoke->HasCurrentMethodInput()) {
+    LocationSummary* locations = invoke->GetLocations();
+    Location location = locations->InAt(invoke->GetSpecialInputIndex());
+    if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
+      locations->SetInAt(invoke->GetSpecialInputIndex(), Location::NoLocation());
+    }
   }
 }
 
@@ -2572,7 +2882,7 @@ void CodeGeneratorMIPS64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo
                         invoke->GetStringInitOffset());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst64(temp.AsRegister<GpuRegister>(), invoke->GetMethodAddress());
@@ -2584,7 +2894,7 @@ void CodeGeneratorMIPS64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo
       LOG(FATAL) << "Unsupported";
       UNREACHABLE();
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       GpuRegister reg = temp.AsRegister<GpuRegister>();
       GpuRegister method_reg;
       if (current_method.IsRegister()) {
@@ -2695,7 +3005,7 @@ void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) {
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-      Location::RegisterLocation(A0));
+      calling_convention.GetReturnLocation(cls->GetType()));
 }
 
 void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) {
@@ -3112,7 +3422,7 @@ void LocationsBuilderMIPS64::VisitRem(HRem* rem) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -3132,20 +3442,12 @@ void LocationsBuilderMIPS64::VisitRem(HRem* rem) {
 
 void InstructionCodeGeneratorMIPS64::VisitRem(HRem* instruction) {
   Primitive::Type type = instruction->GetType();
-  LocationSummary* locations = instruction->GetLocations();
 
   switch (type) {
     case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      GpuRegister dst = locations->Out().AsRegister<GpuRegister>();
-      GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>();
-      GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>();
-      if (type == Primitive::kPrimInt)
-        __ ModR6(dst, lhs, rhs);
-      else
-        __ Dmod(dst, lhs, rhs);
+    case Primitive::kPrimLong:
+      GenerateDivRemIntegral(instruction);
       break;
-    }
 
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 4f91c7179f..a078dd1819 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -119,9 +119,12 @@ class FieldAccessCallingConventionMIPS64 : public FieldAccessCallingConvention {
   Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::RegisterLocation(V0);
   }
-  Location GetSetValueLocation(
-      Primitive::Type type ATTRIBUTE_UNUSED, bool is_instance) const OVERRIDE {
-    return is_instance ? Location::RegisterLocation(A2) : Location::RegisterLocation(A1);
+  Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
+    return Primitive::Is64BitType(type)
+        ? Location::RegisterLocation(A2)
+        : (is_instance
+            ? Location::RegisterLocation(A2)
+            : Location::RegisterLocation(A1));
   }
   Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::FpuRegisterLocation(F0);
@@ -227,9 +230,13 @@ class InstructionCodeGeneratorMIPS64 : public HGraphVisitor {
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              Label* true_target,
-                             Label* false_target,
-                             Label* always_true_target);
+                             Label* false_target);
+  void DivRemOneOrMinusOne(HBinaryOperation* instruction);
+  void DivRemByPowerOfTwo(HBinaryOperation* instruction);
+  void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
+  void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
   Mips64Assembler* const assembler_;
diff --git a/compiler/optimizing/code_generator_utils.cc b/compiler/optimizing/code_generator_utils.cc
index bf354e7ee2..644a3fb75e 100644
--- a/compiler/optimizing/code_generator_utils.cc
+++ b/compiler/optimizing/code_generator_utils.cc
@@ -95,19 +95,8 @@ void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long,
   *shift = is_long ? p - 64 : p - 32;
 }
 
-// Is it valid to reverse the condition? Uses the values supplied to
-// GenerateTestAndBranch() in instruction generators.
-bool CanReverseCondition(Label* always_true_target,
-                         Label* false_target,
-                         HCondition* condition) {
-  // 'always_true_target' is null when the 'true' path is to the next
-  // block to be generated.  Check the type of the condition to ensure that
-  // FP conditions are not swapped.  This is for future fusing of HCompare and
-  // HCondition.
-  // Note:  If the condition is nullptr, then it is always okay to reverse.
-  return always_true_target == nullptr && false_target != nullptr &&
-         (condition == nullptr ||
-          !Primitive::IsFloatingPointType(condition->InputAt(0)->GetType()));
+bool IsBooleanValueOrMaterializedCondition(HInstruction* cond_input) {
+  return !cond_input->IsCondition() || cond_input->AsCondition()->NeedsMaterialization();
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_utils.h b/compiler/optimizing/code_generator_utils.h
index 628eee8885..7efed8c9ec 100644
--- a/compiler/optimizing/code_generator_utils.h
+++ b/compiler/optimizing/code_generator_utils.h
@@ -21,18 +21,16 @@
 
 namespace art {
 
-class Label;
-class HCondition;
+class HInstruction;
 
 // Computes the magic number and the shift needed in the div/rem by constant algorithm, as out
 // arguments `magic` and `shift`
 void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, int64_t* magic, int* shift);
 
-// Is it valid to reverse the condition? Uses the values supplied to
-// GenerateTestAndBranch() in instruction generators.
-bool CanReverseCondition(Label* always_true_target,
-                         Label* false_target,
-                         HCondition* condition);
+// Returns true if `cond_input` is expected to have a location. Assumes that
+// `cond_input` is a conditional input of the currently emitted instruction and
+// that it has been previously visited by the InstructionCodeGenerator.
+bool IsBooleanValueOrMaterializedCondition(HInstruction* cond_input);
 
 }  // namespace art
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 8308d9ee20..a87e8ede04 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -19,7 +19,6 @@
 #include "art_method.h"
 #include "code_generator_utils.h"
 #include "compiled_method.h"
-#include "constant_area_fixups_x86.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "gc/accounting/card_table.h"
@@ -27,6 +26,7 @@
 #include "intrinsics_x86.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "pc_relative_fixups_x86.h"
 #include "thread.h"
 #include "utils/assembler.h"
 #include "utils/stack_checks.h"
@@ -35,6 +35,9 @@
 
 namespace art {
 
+template<class MirrorType>
+class GcRoot;
+
 namespace x86 {
 
 static constexpr int kCurrentMethodStackOffset = 0;
@@ -300,15 +303,6 @@ class TypeCheckSlowPathX86 : public SlowPathCode {
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     __ Bind(GetEntryLabel());
 
-    if (instruction_->IsCheckCast()) {
-      // The codegen for the instruction overwrites `temp`, so put it back in place.
-      Register obj = locations->InAt(0).AsRegister<Register>();
-      Register temp = locations->GetTemp(0).AsRegister<Register>();
-      uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-      __ movl(temp, Address(obj, class_offset));
-      __ MaybeUnpoisonHeapReference(temp);
-    }
-
     if (!is_fatal_) {
       SaveLiveRegisters(codegen, locations);
     }
@@ -329,12 +323,15 @@ class TypeCheckSlowPathX86 : public SlowPathCode {
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
       x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
+      CheckEntrypointTypes<kQuickCheckCast, void, const mirror::Class*, const mirror::Class*>();
     }
 
     if (!is_fatal_) {
@@ -425,6 +422,221 @@ class ArraySetSlowPathX86 : public SlowPathCode {
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathX86);
 };
 
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathX86 : public SlowPathCode {
+ public:
+  ReadBarrierForHeapReferenceSlowPathX86(HInstruction* instruction,
+                                         Location out,
+                                         Location ref,
+                                         Location obj,
+                                         uint32_t offset,
+                                         Location index)
+      : instruction_(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial object
+    // has been overwritten by (or after) the heap object reference load
+    // to be instrumented, e.g.:
+    //
+    //   __ movl(out, Address(out, offset));
+    //   codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!instruction_->IsInvoke() ||
+           (instruction_->IsInvokeStaticOrDirect() &&
+            instruction_->GetLocations()->Intrinsified()));
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like other "inputs" of this slow path),
+    // introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        Register index_reg = index_.AsRegister<Register>();
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg));
+        if (codegen->IsCoreCalleeSaveRegister(index_reg)) {
+          // We are about to change the value of `index_reg` (see the
+          // calls to art::x86::X86Assembler::shll and
+          // art::x86::X86Assembler::AddImmediate below), but it has
+          // not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          Register free_reg = FindAvailableCallerSaveRegister(codegen);
+          __ movl(free_reg, index_reg);
+          index_reg = free_reg;
+          index = Location::RegisterLocation(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the scale
+        // factor (2) cannot overflow in practice, as the runtime is
+        // unable to allocate object arrays with a size larger than
+        // 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ shll(index_reg, Immediate(TIMES_4));
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ AddImmediate(index_reg, Immediate(offset_));
+      } else {
+        DCHECK(instruction_->IsInvoke());
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegisterPair());
+        // UnsafeGet's offset location is a register pair, the low
+        // part contains the correct offset.
+        index = index_.ToLow();
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            Location::RegisterLocation(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ movl(calling_convention.GetRegisterAt(2), Immediate(offset_));
+    }
+    x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    x86_codegen->Move32(out_, Location::RegisterLocation(EAX));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathX86"; }
+
+ private:
+  Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<int>(ref_.AsRegister<Register>());
+    size_t obj = static_cast<int>(obj_.AsRegister<Register>());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return static_cast<Register>(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on x86
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathX86);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathX86 : public SlowPathCode {
+ public:
+  ReadBarrierForRootSlowPathX86(HInstruction* instruction, Location out, Location root)
+      : instruction_(instruction), out_(out), root_(root) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    Register reg_out = out_.AsRegister<Register>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString());
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConvention calling_convention;
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    x86_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_);
+    x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierForRootSlow),
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    x86_codegen->Move32(out_, Location::RegisterLocation(EAX));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathX86"; }
+
+ private:
+  HInstruction* const instruction_;
+  const Location out_;
+  const Location root_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathX86);
+};
+
 #undef __
 #define __ down_cast<X86Assembler*>(GetAssembler())->
 
@@ -513,9 +725,9 @@ void CodeGeneratorX86::InvokeRuntime(int32_t entry_point_offset,
 }
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph,
-                   const X86InstructionSetFeatures& isa_features,
-                   const CompilerOptions& compiler_options,
-                   OptimizingCompilerStats* stats)
+                                   const X86InstructionSetFeatures& isa_features,
+                                   const CompilerOptions& compiler_options,
+                                   OptimizingCompilerStats* stats)
     : CodeGenerator(graph,
                     kNumberOfCpuRegisters,
                     kNumberOfXmmRegisters,
@@ -533,6 +745,7 @@ CodeGeneratorX86::CodeGeneratorX86(HGraph* graph,
       isa_features_(isa_features),
       method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
@@ -581,7 +794,7 @@ Location CodeGeneratorX86::AllocateFreeRegister(Primitive::Type type) const {
       LOG(FATAL) << "Unreachable type " << type;
   }
 
-  return Location();
+  return Location::NoLocation();
 }
 
 void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline) const {
@@ -782,7 +995,7 @@ Location InvokeDexCallingConventionVisitorX86::GetNextLocation(Primitive::Type t
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
   }
-  return Location();
+  return Location::NoLocation();
 }
 
 void CodeGeneratorX86::Move32(Location destination, Location source) {
@@ -1157,26 +1370,19 @@ void InstructionCodeGeneratorX86::GenerateLongComparesAndJumps(HCondition* cond,
   __ j(final_condition, true_label);
 }
 
-void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HIf* if_instr,
-                                                               HCondition* condition,
-                                                               Label* true_target,
-                                                               Label* false_target,
-                                                               Label* always_true_target) {
+void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HCondition* condition,
+                                                               Label* true_target_in,
+                                                               Label* false_target_in) {
+  // Generated branching requires both targets to be explicit. If either of the
+  // targets is nullptr (fallthrough) use and bind `fallthrough_target` instead.
+  Label fallthrough_target;
+  Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in;
+  Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in;
+
   LocationSummary* locations = condition->GetLocations();
   Location left = locations->InAt(0);
   Location right = locations->InAt(1);
 
-  // We don't want true_target as a nullptr.
-  if (true_target == nullptr) {
-    true_target = always_true_target;
-  }
-  bool falls_through = (false_target == nullptr);
-
-  // FP compares don't like null false_targets.
-  if (false_target == nullptr) {
-    false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  }
-
   Primitive::Type type = condition->InputAt(0)->GetType();
   switch (type) {
     case Primitive::kPrimLong:
@@ -1194,138 +1400,141 @@ void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HIf* if_instr,
       LOG(FATAL) << "Unexpected compare type " << type;
   }
 
-  if (!falls_through) {
+  if (false_target != &fallthrough_target) {
     __ jmp(false_target);
   }
+
+  if (fallthrough_target.IsLinked()) {
+    __ Bind(&fallthrough_target);
+  }
+}
+
+static bool AreEflagsSetFrom(HInstruction* cond, HInstruction* branch) {
+  // Moves may affect the eflags register (move zero uses xorl), so the EFLAGS
+  // are usable only if `cond` immediately precedes `branch`. We can't use the
+  // eflags on materialized long/FP conditions due to their complex branching.
+  return cond->IsCondition() &&
+         cond->GetNext() == branch &&
+         cond->InputAt(0)->GetType() != Primitive::kPrimLong &&
+         !Primitive::IsFloatingPointType(cond->InputAt(0)->GetType());
 }
 
 void InstructionCodeGeneratorX86::GenerateTestAndBranch(HInstruction* instruction,
+                                                        size_t condition_input_index,
                                                         Label* true_target,
-                                                        Label* false_target,
-                                                        Label* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  if (cond->IsIntConstant()) {
+                                                        Label* false_target) {
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
     // Constant condition, statically compared against 1.
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ jmp(always_true_target);
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ jmp(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ jmp(false_target);
+      }
     }
-  } else {
-    HCondition* condition = cond->AsCondition();
-    bool is_materialized =
-        condition == nullptr || condition->NeedsMaterialization();
-    // Moves do not affect the eflags register, so if the condition is
-    // evaluated just before the if, we don't need to evaluate it
-    // again.  We can't use the eflags on long/FP conditions if they are
-    // materialized due to the complex branching.
-    Primitive::Type type = (condition != nullptr)
-        ? cond->InputAt(0)->GetType()
-        : Primitive::kPrimInt;
-    bool eflags_set = condition != nullptr
-        && condition->IsBeforeWhenDisregardMoves(instruction)
-        && (type != Primitive::kPrimLong && !Primitive::IsFloatingPointType(type));
-    // Can we optimize the jump if we know that the next block is the true case?
-    bool can_jump_to_false = CanReverseCondition(always_true_target, false_target, condition);
-    if (is_materialized) {
-      if (!eflags_set) {
-        // Materialized condition, compare against 0.
-        Location lhs = instruction->GetLocations()->InAt(0);
-        if (lhs.IsRegister()) {
-          __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
-        } else {
-          __ cmpl(Address(ESP, lhs.GetStackIndex()), Immediate(0));
-        }
-        if (can_jump_to_false) {
-          __ j(kEqual, false_target);
-          return;
-        }
-        __ j(kNotEqual, true_target);
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
+    if (AreEflagsSetFrom(cond, instruction)) {
+      if (true_target == nullptr) {
+        __ j(X86Condition(cond->AsCondition()->GetOppositeCondition()), false_target);
       } else {
-        if (can_jump_to_false) {
-          __ j(X86Condition(condition->GetOppositeCondition()), false_target);
-          return;
-        }
-        __ j(X86Condition(condition->GetCondition()), true_target);
+        __ j(X86Condition(cond->AsCondition()->GetCondition()), true_target);
       }
     } else {
-      // Condition has not been materialized, use its inputs as the
-      // comparison and its condition as the branch condition.
-
-      // Is this a long or FP comparison that has been folded into the HCondition?
-      if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
-        // Generate the comparison directly.
-        GenerateCompareTestAndBranch(instruction->AsIf(),
-                                     condition,
-                                     true_target,
-                                     false_target,
-                                     always_true_target);
-        return;
+      // Materialized condition, compare against 0.
+      Location lhs = instruction->GetLocations()->InAt(condition_input_index);
+      if (lhs.IsRegister()) {
+        __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
+      } else {
+        __ cmpl(Address(ESP, lhs.GetStackIndex()), Immediate(0));
       }
-
-      Location lhs = cond->GetLocations()->InAt(0);
-      Location rhs = cond->GetLocations()->InAt(1);
-      // LHS is guaranteed to be in a register (see
-      // LocationsBuilderX86::VisitCondition).
-      if (rhs.IsRegister()) {
-        __ cmpl(lhs.AsRegister<Register>(), rhs.AsRegister<Register>());
-      } else if (rhs.IsConstant()) {
-        int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
-        if (constant == 0) {
-          __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
-        } else {
-          __ cmpl(lhs.AsRegister<Register>(), Immediate(constant));
-        }
+      if (true_target == nullptr) {
+        __ j(kEqual, false_target);
       } else {
-        __ cmpl(lhs.AsRegister<Register>(), Address(ESP, rhs.GetStackIndex()));
+        __ j(kNotEqual, true_target);
       }
+    }
+  } else {
+    // Condition has not been materialized, use its inputs as the comparison and
+    // its condition as the branch condition.
+    HCondition* condition = cond->AsCondition();
 
-      if (can_jump_to_false) {
-        __ j(X86Condition(condition->GetOppositeCondition()), false_target);
-        return;
-      }
+    // If this is a long or FP comparison that has been folded into
+    // the HCondition, generate the comparison directly.
+    Primitive::Type type = condition->InputAt(0)->GetType();
+    if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
+      GenerateCompareTestAndBranch(condition, true_target, false_target);
+      return;
+    }
 
+    Location lhs = condition->GetLocations()->InAt(0);
+    Location rhs = condition->GetLocations()->InAt(1);
+    // LHS is guaranteed to be in a register (see LocationsBuilderX86::VisitCondition).
+    if (rhs.IsRegister()) {
+      __ cmpl(lhs.AsRegister<Register>(), rhs.AsRegister<Register>());
+    } else if (rhs.IsConstant()) {
+      int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
+      if (constant == 0) {
+        __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
+      } else {
+        __ cmpl(lhs.AsRegister<Register>(), Immediate(constant));
+      }
+    } else {
+      __ cmpl(lhs.AsRegister<Register>(), Address(ESP, rhs.GetStackIndex()));
+    }
+    if (true_target == nullptr) {
+      __ j(X86Condition(condition->GetOppositeCondition()), false_target);
+    } else {
       __ j(X86Condition(condition->GetCondition()), true_target);
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted (case 2) and we need to emit a jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ jmp(false_target);
   }
 }
 
 void LocationsBuilderX86::VisitIf(HIf* if_instr) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {
     locations->SetInAt(0, Location::Any());
   }
 }
 
 void InstructionCodeGeneratorX86::VisitIf(HIf* if_instr) {
-  Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  Label* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);
+  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderX86::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::Any());
   }
 }
@@ -1334,8 +1543,10 @@ void InstructionCodeGeneratorX86::VisitDeoptimize(HDeoptimize* deoptimize) {
   SlowPathCode* slow_path = new (GetGraph()->GetArena())
       DeoptimizationSlowPathX86(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  Label* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderX86::VisitLocal(HLocal* local) {
@@ -1696,19 +1907,28 @@ void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invok
 
   IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
+    if (invoke->GetLocations()->CanCall() && invoke->HasPcRelativeDexCache()) {
+      invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::Any());
+    }
     return;
   }
 
   HandleInvoke(invoke);
 
+  // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
+  if (invoke->HasPcRelativeDexCache()) {
+    invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(),
+                                    Location::RequiresRegister());
+  }
+
   if (codegen_->IsBaseline()) {
     // Baseline does not have enough registers if the current method also
     // needs a register. We therefore do not require a register for it, and let
     // the code generation of the invoke handle it.
     LocationSummary* locations = invoke->GetLocations();
-    Location location = locations->InAt(invoke->GetCurrentMethodInputIndex());
+    Location location = locations->InAt(invoke->GetSpecialInputIndex());
     if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
-      locations->SetInAt(invoke->GetCurrentMethodInputIndex(), Location::NoLocation());
+      locations->SetInAt(invoke->GetSpecialInputIndex(), Location::NoLocation());
     }
   }
 }
@@ -1757,6 +1977,9 @@ void InstructionCodeGeneratorX86::VisitInvokeVirtual(HInvokeVirtual* invoke) {
 }
 
 void LocationsBuilderX86::VisitInvokeInterface(HInvokeInterface* invoke) {
+  // This call to HandleInvoke allocates a temporary (core) register
+  // which is also used to transfer the hidden argument from that core
+  // register into the FP register (XMM7).
   HandleInvoke(invoke);
   // Add the hidden argument.
   invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM7));
@@ -1764,31 +1987,42 @@ void LocationsBuilderX86::VisitInvokeInterface(HInvokeInterface* invoke) {
 
 void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
+  LocationSummary* locations = invoke->GetLocations();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  XmmRegister hidden_reg = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset(
       invoke->GetImtIndex() % mirror::Class::kImtSize, kX86PointerSize).Uint32Value();
-  LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
 
-  // Set the hidden argument.
+  // Set the hidden argument. It is safe to do this here, as XMM7
+  // won't be modified thereafter, before the `call` instruction.
+  DCHECK_EQ(XMM7, hidden_reg);
   __ movl(temp, Immediate(invoke->GetDexMethodIndex()));
-  __ movd(invoke->GetLocations()->GetTemp(1).AsFpuRegister<XmmRegister>(), temp);
+  __ movd(hidden_reg, temp);
 
-  // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ movl(temp, Address(ESP, receiver.GetStackIndex()));
+    // /* HeapReference<Class> */ temp = temp->klass_
     __ movl(temp, Address(temp, class_offset));
   } else {
+    // /* HeapReference<Class> */ temp = receiver->klass_
     __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   }
   codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetImtEntryAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
-  __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86WordSize).Int32Value()));
+  __ call(Address(temp,
+                  ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -3014,7 +3248,7 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr
       DCHECK_EQ(EAX, first.AsRegister<Register>());
       DCHECK_EQ(is_div ? EAX : EDX, out.AsRegister<Register>());
 
-      if (instruction->InputAt(1)->IsIntConstant()) {
+      if (second.IsConstant()) {
         int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
 
         if (imm == 0) {
@@ -3779,16 +4013,6 @@ void InstructionCodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) {
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       MethodReference target_method ATTRIBUTE_UNUSED) {
-  if (desired_dispatch_info.method_load_kind ==
-      HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative) {
-    // TODO: Implement this type. For the moment, we fall back to kDexCacheViaMethod.
-    return HInvokeStaticOrDirect::DispatchInfo {
-      HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod,
-      HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod,
-      0u,
-      0u
-    };
-  }
   switch (desired_dispatch_info.code_ptr_location) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirectWithFixup:
     case HInvokeStaticOrDirect::CodePtrLocation::kCallDirect:
@@ -3805,6 +4029,32 @@ HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86::GetSupportedInvokeStaticOr
   }
 }
 
+Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
+                                                                 Register temp) {
+  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
+  Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+  if (!invoke->GetLocations()->Intrinsified()) {
+    return location.AsRegister<Register>();
+  }
+  // For intrinsics we allow any location, so it may be on the stack.
+  if (!location.IsRegister()) {
+    __ movl(temp, Address(ESP, location.GetStackIndex()));
+    return temp;
+  }
+  // For register locations, check if the register was saved. If so, get it from the stack.
+  // Note: There is a chance that the register was saved but not overwritten, so we could
+  // save one load. However, since this is just an intrinsic slow path we prefer this
+  // simple and more robust approach rather than trying to determine if that's the case.
+  SlowPathCode* slow_path = GetCurrentSlowPath();
+  DCHECK(slow_path != nullptr);  // For intrinsified invokes the call is emitted on the slow path.
+  if (slow_path->IsCoreRegisterSaved(location.AsRegister<Register>())) {
+    int stack_offset = slow_path->GetStackOffsetOfCoreRegister(location.AsRegister<Register>());
+    __ movl(temp, Address(ESP, stack_offset));
+    return temp;
+  }
+  return location.AsRegister<Register>();
+}
+
 void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   switch (invoke->GetMethodLoadKind()) {
@@ -3813,7 +4063,7 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
       __ fs()->movl(temp.AsRegister<Register>(), Address::Absolute(invoke->GetStringInitOffset()));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress()));
@@ -3823,13 +4073,18 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
       method_patches_.emplace_back(invoke->GetTargetMethod());
       __ Bind(&method_patches_.back().label);  // Bind the label at the end of the "movl" insn.
       break;
-    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
-      // TODO: Implement this type.
-      // Currently filtered out by GetSupportedInvokeStaticOrDirectDispatch().
-      LOG(FATAL) << "Unsupported";
-      UNREACHABLE();
+    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
+      Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke,
+                                                                temp.AsRegister<Register>());
+      uint32_t offset = invoke->GetDexCacheArrayOffset();
+      __ movl(temp.AsRegister<Register>(), Address(base_reg, kDummy32BitOffset));
+      // Add the patch entry and bind its label at the end of the instruction.
+      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file, offset);
+      __ Bind(&pc_relative_dex_cache_patches_.back().label);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       Register reg = temp.AsRegister<Register>();
       if (current_method.IsRegister()) {
@@ -3840,7 +4095,7 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke,
         method_reg = reg;
         __ movl(reg, Address(ESP, kCurrentMethodStackOffset));
       }
-      // temp = temp->dex_cache_resolved_methods_;
+      // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_;
       __ movl(reg, Address(method_reg,
                            ArtMethod::DexCacheResolvedMethodsOffset(kX86PointerSize).Int32Value()));
       // temp = temp[index_in_cache]
@@ -3884,10 +4139,17 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  // temp = object->GetClass();
   DCHECK(receiver.IsRegister());
+  // /* HeapReference<Class> */ temp = receiver->klass_
   __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
@@ -3898,23 +4160,33 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp
 
 void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
-  linker_patches->reserve(method_patches_.size() + relative_call_patches_.size());
+  size_t size =
+      method_patches_.size() +
+      relative_call_patches_.size() +
+      pc_relative_dex_cache_patches_.size();
+  linker_patches->reserve(size);
+  // The label points to the end of the "movl" or "call" insn but the literal offset for the
+  // patch needs to point to the embedded constant which occupies the last 4 bytes.
+  constexpr uint32_t kLabelPositionToLiteralOffsetAdjustment = 4u;
   for (const MethodPatchInfo<Label>& info : method_patches_) {
-    // The label points to the end of the "movl" insn but the literal offset for method
-    // patch x86 needs to point to the embedded constant which occupies the last 4 bytes.
-    uint32_t literal_offset = info.label.Position() - 4;
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(LinkerPatch::MethodPatch(literal_offset,
                                                        info.target_method.dex_file,
                                                        info.target_method.dex_method_index));
   }
   for (const MethodPatchInfo<Label>& info : relative_call_patches_) {
-    // The label points to the end of the "call" insn but the literal offset for method
-    // patch x86 needs to point to the embedded constant which occupies the last 4 bytes.
-    uint32_t literal_offset = info.label.Position() - 4;
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(LinkerPatch::RelativeCodePatch(literal_offset,
                                                              info.target_method.dex_file,
                                                              info.target_method.dex_method_index));
   }
+  for (const PcRelativeDexCacheAccessInfo& info : pc_relative_dex_cache_patches_) {
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+    linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(literal_offset,
+                                                              &info.target_dex_file,
+                                                              GetMethodAddressOffset(),
+                                                              info.element_offset));
+  }
 }
 
 void CodeGeneratorX86::MarkGCCard(Register temp,
@@ -3939,18 +4211,29 @@ void CodeGeneratorX86::MarkGCCard(Register temp,
 
 void LocationsBuilderX86::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) {
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+
+  bool object_field_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   kEmitCompilerReadBarrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
 
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
   } else {
-    // The output overlaps in case of long: we don't want the low move to overwrite
-    // the object's location.
-    locations->SetOut(Location::RequiresRegister(),
-        (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
-                                                         : Location::kNoOutputOverlap);
+    // The output overlaps in case of long: we don't want the low move
+    // to overwrite the object's location.  Likewise, in the case of
+    // an object field get with read barriers enabled, we do not want
+    // the move to overwrite the object's location, as we need it to emit
+    // the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        (object_field_get_with_read_barrier || instruction->GetType() == Primitive::kPrimLong) ?
+            Location::kOutputOverlap :
+            Location::kNoOutputOverlap);
   }
 
   if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) {
@@ -3966,7 +4249,8 @@ void InstructionCodeGeneratorX86::HandleFieldGet(HInstruction* instruction,
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
 
   LocationSummary* locations = instruction->GetLocations();
-  Register base = locations->InAt(0).AsRegister<Register>();
+  Location base_loc = locations->InAt(0);
+  Register base = base_loc.AsRegister<Register>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
   Primitive::Type field_type = field_info.GetFieldType();
@@ -4041,7 +4325,7 @@ void InstructionCodeGeneratorX86::HandleFieldGet(HInstruction* instruction,
   }
 
   if (field_type == Primitive::kPrimNot) {
-    __ MaybeUnpoisonHeapReference(out.AsRegister<Register>());
+    codegen_->MaybeGenerateReadBarrier(instruction, out, out, base_loc, offset);
   }
 }
 
@@ -4369,24 +4653,35 @@ void InstructionCodeGeneratorX86::VisitNullCheck(HNullCheck* instruction) {
 }
 
 void LocationsBuilderX86::VisitArrayGet(HArrayGet* instruction) {
+  bool object_array_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_array_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
-    // The output overlaps in case of long: we don't want the low move to overwrite
-    // the array's location.
-    locations->SetOut(Location::RequiresRegister(),
-        (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
-                                                         : Location::kNoOutputOverlap);
+    // The output overlaps in case of long: we don't want the low move
+    // to overwrite the array's location.  Likewise, in the case of an
+    // object array get with read barriers enabled, we do not want the
+    // move to overwrite the array's location, as we need it to emit
+    // the read barrier.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        (instruction->GetType() == Primitive::kPrimLong || object_array_get_with_read_barrier) ?
+            Location::kOutputOverlap :
+            Location::kNoOutputOverlap);
   }
 }
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
 
   Primitive::Type type = instruction->GetType();
@@ -4441,6 +4736,9 @@ void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
+      static_assert(
+          sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+          "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
@@ -4505,8 +4803,17 @@ void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
   }
 
   if (type == Primitive::kPrimNot) {
-    Register out = locations->Out().AsRegister<Register>();
-    __ MaybeUnpoisonHeapReference(out);
+    static_assert(
+        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+    uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
+    Location out = locations->Out();
+    if (index.IsConstant()) {
+      uint32_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset);
+    } else {
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, data_offset, index);
+    }
   }
 }
 
@@ -4517,14 +4824,18 @@ void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) {
   // optimization.
 
   Primitive::Type value_type = instruction->GetComponentType();
+
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
-
-  bool may_need_runtime_call = instruction->NeedsTypeCheck();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
+  bool object_array_set_with_read_barrier =
+      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      may_need_runtime_call ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall);
+      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall);
 
   bool is_byte_type = (value_type == Primitive::kPrimBoolean)
       || (value_type == Primitive::kPrimByte);
@@ -4545,20 +4856,21 @@ void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) {
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
     // Ensure the card is in a byte register.
-    locations->AddTemp(Location::RegisterLocation(ECX));
+    locations->AddTemp(Location::RegisterLocation(ECX));  // Possibly used for read barrier too.
   }
 }
 
 void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register array = locations->InAt(0).AsRegister<Register>();
+  Location array_loc = locations->InAt(0);
+  Register array = array_loc.AsRegister<Register>();
   Location index = locations->InAt(1);
   Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
-  bool may_need_runtime_call = locations->CanCall();
+  bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
@@ -4598,6 +4910,7 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
       Address address = index.IsConstant()
           ? Address(array, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + offset)
           : Address(array, index.AsRegister<Register>(), TIMES_4, offset);
+
       if (!value.IsRegister()) {
         // Just setting null.
         DCHECK(instruction->InputAt(2)->IsNullConstant());
@@ -4605,7 +4918,7 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
         __ movl(address, Immediate(0));
         codegen_->MaybeRecordImplicitNullCheck(instruction);
         DCHECK(!needs_write_barrier);
-        DCHECK(!may_need_runtime_call);
+        DCHECK(!may_need_runtime_call_for_type_check);
         break;
       }
 
@@ -4614,7 +4927,7 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
       NearLabel done, not_null, do_put;
       SlowPathCode* slow_path = nullptr;
       Register temp = locations->GetTemp(0).AsRegister<Register>();
-      if (may_need_runtime_call) {
+      if (may_need_runtime_call_for_type_check) {
         slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathX86(instruction);
         codegen_->AddSlowPath(slow_path);
         if (instruction->GetValueCanBeNull()) {
@@ -4626,22 +4939,62 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
           __ Bind(&not_null);
         }
 
-        __ movl(temp, Address(array, class_offset));
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ MaybeUnpoisonHeapReference(temp);
-        __ movl(temp, Address(temp, component_offset));
-        // No need to poison/unpoison, we're comparing two poisoned references.
-        __ cmpl(temp, Address(register_value, class_offset));
-        if (instruction->StaticTypeOfArrayIsObjectArray()) {
-          __ j(kEqual, &do_put);
-          __ MaybeUnpoisonHeapReference(temp);
-          __ movl(temp, Address(temp, super_offset));
-          // No need to unpoison, we're comparing against null..
-          __ testl(temp, temp);
-          __ j(kNotEqual, slow_path->GetEntryLabel());
-          __ Bind(&do_put);
+        if (kEmitCompilerReadBarrier) {
+          // When read barriers are enabled, the type checking
+          // instrumentation requires two read barriers:
+          //
+          //   __ movl(temp2, temp);
+          //   // /* HeapReference<Class> */ temp = temp->component_type_
+          //   __ movl(temp, Address(temp, component_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+          //
+          //   // /* HeapReference<Class> */ temp2 = register_value->klass_
+          //   __ movl(temp2, Address(register_value, class_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp2_loc, temp2_loc, value, class_offset, temp_loc);
+          //
+          //   __ cmpl(temp, temp2);
+          //
+          // However, the second read barrier may trash `temp`, as it
+          // is a temporary register, and as such would not be saved
+          // along with live registers before calling the runtime (nor
+          // restored afterwards).  So in this case, we bail out and
+          // delegate the work to the array set slow path.
+          //
+          // TODO: Extend the register allocator to support a new
+          // "(locally) live temp" location so as to avoid always
+          // going into the slow path when read barriers are enabled.
+          __ jmp(slow_path->GetEntryLabel());
         } else {
-          __ j(kNotEqual, slow_path->GetEntryLabel());
+          // /* HeapReference<Class> */ temp = array->klass_
+          __ movl(temp, Address(array, class_offset));
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ MaybeUnpoisonHeapReference(temp);
+
+          // /* HeapReference<Class> */ temp = temp->component_type_
+          __ movl(temp, Address(temp, component_offset));
+          // If heap poisoning is enabled, no need to unpoison `temp`
+          // nor the object reference in `register_value->klass`, as
+          // we are comparing two poisoned references.
+          __ cmpl(temp, Address(register_value, class_offset));
+
+          if (instruction->StaticTypeOfArrayIsObjectArray()) {
+            __ j(kEqual, &do_put);
+            // If heap poisoning is enabled, the `temp` reference has
+            // not been unpoisoned yet; unpoison it now.
+            __ MaybeUnpoisonHeapReference(temp);
+
+            // /* HeapReference<Class> */ temp = temp->super_class_
+            __ movl(temp, Address(temp, super_offset));
+            // If heap poisoning is enabled, no need to unpoison
+            // `temp`, as we are comparing against null below.
+            __ testl(temp, temp);
+            __ j(kNotEqual, slow_path->GetEntryLabel());
+            __ Bind(&do_put);
+          } else {
+            __ j(kNotEqual, slow_path->GetEntryLabel());
+          }
         }
       }
 
@@ -4652,7 +5005,7 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
       } else {
         __ movl(address, register_value);
       }
-      if (!may_need_runtime_call) {
+      if (!may_need_runtime_call_for_type_check) {
         codegen_->MaybeRecordImplicitNullCheck(instruction);
       }
 
@@ -4667,6 +5020,7 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
 
       break;
     }
+
     case Primitive::kPrimInt: {
       uint32_t offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       Address address = index.IsConstant()
@@ -5137,7 +5491,8 @@ void LocationsBuilderX86::VisitLoadClass(HLoadClass* cls) {
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-      Location::RegisterLocation(EAX));
+      Location::RegisterLocation(EAX),
+      /* code_generator_supports_read_barrier */ true);
 }
 
 void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) {
@@ -5151,18 +5506,40 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) {
     return;
   }
 
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
+
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
-    __ movl(out, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
+    uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+      __ leal(out, Address(current_method, declaring_class_offset));
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      __ movl(out, Address(current_method, declaring_class_offset));
+    }
   } else {
     DCHECK(cls->CanCallRuntime());
-    __ movl(out, Address(
-        current_method, ArtMethod::DexCacheResolvedTypesOffset(kX86PointerSize).Int32Value()));
-    __ movl(out, Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
-    // TODO: We will need a read barrier here.
+    // /* GcRoot<mirror::Class>[] */ out =
+    //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
+    __ movl(out, Address(current_method,
+                         ArtMethod::DexCacheResolvedTypesOffset(kX86PointerSize).Int32Value()));
+
+    size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &out[type_index]
+      __ leal(out, Address(out, cache_offset));
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = out[type_index]
+      __ movl(out, Address(out, cache_offset));
+    }
 
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
         cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
@@ -5216,12 +5593,35 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) {
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = load->GetLocations();
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
-  __ movl(out, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
+
+  uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+    __ leal(out, Address(current_method, declaring_class_offset));
+    // /* mirror::Class* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+    __ movl(out, Address(current_method, declaring_class_offset));
+  }
+
+  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
   __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
-  __ movl(out, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
-  // TODO: We will need a read barrier here.
+
+  size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex());
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::String>* */ out = &out[string_index]
+    __ leal(out, Address(out, cache_offset));
+    // /* mirror::String* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::String> */ out = out[string_index]
+    __ movl(out, Address(out, cache_offset));
+  }
+
   __ testl(out, out);
   __ j(kEqual, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
@@ -5265,40 +5665,44 @@ void InstructionCodeGeneratorX86::VisitThrow(HThrow* instruction) {
 
 void LocationsBuilderX86::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = LocationSummary::kNoCall;
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
       break;
+    case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
-      break;
-    case TypeCheckKind::kArrayCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::Any());
-    // Note that TypeCheckSlowPathX86 uses this register too.
-    locations->SetOut(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-    locations->SetOut(Location::RegisterLocation(EAX));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::Any());
+  // Note that TypeCheckSlowPathX86 uses this "out" register too.
+  locations->SetOut(Location::RequiresRegister());
+  // When read barriers are enabled, we need a temporary register for
+  // some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Location cls = locations->InAt(1);
-  Register out = locations->Out().AsRegister<Register>();
+  Location out_loc = locations->Out();
+  Register out = out_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -5313,15 +5717,9 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
     __ j(kEqual, &zero);
   }
 
-  // In case of an interface/unresolved check, we put the object class into the object register.
-  // This is safe, as the register is caller-save, and the object must be in another
-  // register if it survives the runtime call.
-  Register target = (instruction->GetTypeCheckKind() == TypeCheckKind::kInterfaceCheck) ||
-      (instruction->GetTypeCheckKind() == TypeCheckKind::kUnresolvedCheck)
-      ? obj
-      : out;
-  __ movl(target, Address(obj, class_offset));
-  __ MaybeUnpoisonHeapReference(target);
+  // /* HeapReference<Class> */ out = obj->klass_
+  __ movl(out, Address(obj, class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset);
 
   switch (instruction->GetTypeCheckKind()) {
     case TypeCheckKind::kExactCheck: {
@@ -5338,13 +5736,23 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
       __ jmp(&done);
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ movl(out, Address(out, super_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5361,6 +5769,7 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       NearLabel loop, success;
@@ -5372,8 +5781,17 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(ESP, cls.GetStackIndex()));
       }
       __ j(kEqual, &success);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ movl(out, Address(out, super_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -5385,6 +5803,7 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
       NearLabel exact_check;
@@ -5395,9 +5814,18 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(ESP, cls.GetStackIndex()));
       }
       __ j(kEqual, &exact_check);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp = temp_loc.AsRegister<Register>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->component_type_
       __ movl(out, Address(out, component_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5408,6 +5836,7 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
       __ jmp(&done);
       break;
     }
+
     case TypeCheckKind::kArrayCheck: {
       if (cls.IsRegister()) {
         __ cmpl(out, cls.AsRegister<Register>());
@@ -5416,8 +5845,8 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(ESP, cls.GetStackIndex()));
       }
       DCHECK(locations->OnlyCallsOnSlowPath());
-      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86(
-          instruction, /* is_fatal */ false);
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86(instruction,
+                                                                    /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
       __ j(kNotEqual, slow_path->GetEntryLabel());
       __ movl(out, Immediate(1));
@@ -5426,13 +5855,25 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-    default: {
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on slow path, but we always go
+      // into the slow path for the unresolved & interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require assigning fixed registers for the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be cluttered by the potential first
+      // read barrier emission at the beginning of this method.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86(instruction,
+                                                                    /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ jmp(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
         __ jmp(&done);
       }
@@ -5457,75 +5898,73 @@ void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
 void LocationsBuilderX86::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
-
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = throws_into_catch
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall;
-      break;
-    case TypeCheckKind::kInterfaceCheck:
-    case TypeCheckKind::kUnresolvedCheck:
-      call_kind = LocationSummary::kCall;
+      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
       break;
     case TypeCheckKind::kArrayCheck:
+    case TypeCheckKind::kUnresolvedCheck:
+    case TypeCheckKind::kInterfaceCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
-
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::Any());
-    // Note that TypeCheckSlowPathX86 uses this register too.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::Any());
+  // Note that TypeCheckSlowPathX86 uses this "temp" register too.
+  locations->AddTemp(Location::RequiresRegister());
+  // When read barriers are enabled, we need an additional temporary
+  // register for some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
     locations->AddTemp(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   }
 }
 
 void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).AsRegister<Register>();
+  Location obj_loc = locations->InAt(0);
+  Register obj = obj_loc.AsRegister<Register>();
   Location cls = locations->InAt(1);
-  Register temp = locations->WillCall()
-      ? kNoRegister
-      : locations->GetTemp(0).AsRegister<Register>();
-
+  Location temp_loc = locations->GetTemp(0);
+  Register temp = temp_loc.AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
-  SlowPathCode* slow_path = nullptr;
 
-  if (!locations->WillCall()) {
-    slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86(
-        instruction, !locations->CanCall());
-    codegen_->AddSlowPath(slow_path);
-  }
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool is_type_check_slow_path_fatal =
+      (type_check_kind == TypeCheckKind::kExactCheck ||
+       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+      !instruction->CanThrowIntoCatchBlock();
+  SlowPathCode* type_check_slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathX86(instruction,
+                                                        is_type_check_slow_path_fatal);
+  codegen_->AddSlowPath(type_check_slow_path);
 
-  NearLabel done, abstract_entry;
+  NearLabel done;
   // Avoid null check if we know obj is not null.
   if (instruction->MustDoNullCheck()) {
     __ testl(obj, obj);
     __ j(kEqual, &done);
   }
 
-  if (locations->WillCall()) {
-    __ movl(obj, Address(obj, class_offset));
-    __ MaybeUnpoisonHeapReference(obj);
-  } else {
-    __ movl(temp, Address(obj, class_offset));
-    __ MaybeUnpoisonHeapReference(temp);
-  }
+  // /* HeapReference<Class> */ temp = obj->klass_
+  __ movl(temp, Address(obj, class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
 
-  switch (instruction->GetTypeCheckKind()) {
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
       if (cls.IsRegister()) {
@@ -5536,19 +5975,44 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
       }
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
-      NearLabel loop, success;
+      NearLabel loop, compare_classes;
       __ Bind(&loop);
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ movl(temp, Address(temp, super_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // to the `compare_classes` label to compare it with the checked
+      // class.
       __ testl(temp, temp);
-      // Jump to the slow path to throw the exception.
-      __ j(kEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, &compare_classes);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&compare_classes);
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
       } else {
@@ -5558,6 +6022,7 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
       __ j(kNotEqual, &loop);
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -5569,16 +6034,39 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
         __ cmpl(temp, Address(ESP, cls.GetStackIndex()));
       }
       __ j(kEqual, &done);
+
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ movl(temp, Address(temp, super_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // back at the beginning of the loop.
       __ testl(temp, temp);
       __ j(kNotEqual, &loop);
-      // Jump to the slow path to throw the exception.
-      __ jmp(slow_path->GetEntryLabel());
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
+      NearLabel check_non_primitive_component_type;
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
       } else {
@@ -5586,29 +6074,67 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
         __ cmpl(temp, Address(ESP, cls.GetStackIndex()));
       }
       __ j(kEqual, &done);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        Register temp2 = temp2_loc.AsRegister<Register>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->component_type_
       __ movl(temp, Address(temp, component_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(
+          instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+
+      // If the component type is not null (i.e. the object is indeed
+      // an array), jump to label `check_non_primitive_component_type`
+      // to further check that this component type is not a primitive
+      // type.
       __ testl(temp, temp);
-      __ j(kEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, &check_non_primitive_component_type);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&check_non_primitive_component_type);
       __ cmpw(Address(temp, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kEqual, &done);
+      // Same comment as above regarding `temp` and the slow path.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-    default:
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+      // We always go into the type check slow path for the unresolved &
+      // interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require
+      // assigning fixed registers for the inputs of this HCheckCast
+      // instruction (following the runtime calling convention), which
+      // might be cluttered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
 
-  if (slow_path != nullptr) {
-    __ Bind(slow_path->GetExitLabel());
-  }
+  __ Bind(type_check_slow_path->GetExitLabel());
 }
 
 void LocationsBuilderX86::VisitMonitorOperation(HMonitorOperation* instruction) {
@@ -5759,6 +6285,82 @@ void InstructionCodeGeneratorX86::HandleBitwiseOperation(HBinaryOperation* instr
   }
 }
 
+void CodeGeneratorX86::GenerateReadBarrier(HInstruction* instruction,
+                                           Location out,
+                                           Location ref,
+                                           Location obj,
+                                           uint32_t offset,
+                                           Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathX86(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  // TODO: When read barrier has a fast path, add it here.
+  /* Currently the read barrier call is inserted after the original load.
+   * However, if we have a fast path, we need to perform the load of obj.LockWord *before* the
+   * original load. This load-load ordering is required by the read barrier.
+   * The fast path/slow path (for Baker's algorithm) should look like:
+   *
+   * bool isGray = obj.LockWord & kReadBarrierMask;
+   * lfence;  // load fence or artificial data dependence to prevent load-load reordering
+   * ref = obj.field;    // this is the original load
+   * if (isGray) {
+   *   ref = Mark(ref);  // ideally the slow path just does Mark(ref)
+   * }
+   */
+
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorX86::MaybeGenerateReadBarrier(HInstruction* instruction,
+                                                Location out,
+                                                Location ref,
+                                                Location obj,
+                                                uint32_t offset,
+                                                Location index) {
+  if (kEmitCompilerReadBarrier) {
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrier(instruction, out, ref, obj, offset, index);
+  } else if (kPoisonHeapReferences) {
+    __ UnpoisonHeapReference(out.AsRegister<Register>());
+  }
+}
+
+void CodeGeneratorX86::GenerateReadBarrierForRoot(HInstruction* instruction,
+                                                  Location out,
+                                                  Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special for this here.
+  SlowPathCode* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathX86(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  // TODO: Implement a fast path for ReadBarrierForRoot, performing
+  // the following operation (for Baker's algorithm):
+  //
+  //   if (thread.tls32_.is_gc_marking) {
+  //     root = Mark(root);
+  //   }
+
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 void LocationsBuilderX86::VisitBoundType(HBoundType* instruction ATTRIBUTE_UNUSED) {
   // Nothing to do, this should be removed during prepare for register allocator.
   LOG(FATAL) << "Unreachable";
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index ac3d06c23d..064051c7f4 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -227,14 +227,12 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor {
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              Label* true_target,
-                             Label* false_target,
-                             Label* always_true_target);
-  void GenerateCompareTestAndBranch(HIf* if_inst,
-                                    HCondition* condition,
+                             Label* false_target);
+  void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
-                                    Label* false_target,
-                                    Label* always_true_target);
+                                    Label* false_target);
   void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label);
   void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
@@ -397,7 +395,64 @@ class CodeGeneratorX86 : public CodeGenerator {
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
 
+  // Generate a read barrier for a heap reference within `instruction`.
+  //
+  // A read barrier for an object reference read from the heap is
+  // implemented as a call to the artReadBarrierSlow runtime entry
+  // point, which is passed the values in locations `ref`, `obj`, and
+  // `offset`:
+  //
+  //   mirror::Object* artReadBarrierSlow(mirror::Object* ref,
+  //                                      mirror::Object* obj,
+  //                                      uint32_t offset);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierSlow.
+  //
+  // When `index` is provided (i.e. for array accesses), the offset
+  // value passed to artReadBarrierSlow is adjusted to take `index`
+  // into account.
+  void GenerateReadBarrier(HInstruction* instruction,
+                           Location out,
+                           Location ref,
+                           Location obj,
+                           uint32_t offset,
+                           Location index = Location::NoLocation());
+
+  // If read barriers are enabled, generate a read barrier for a heap reference.
+  // Otherwise, if heap poisoning is enabled, unpoison the reference in `out`.
+  void MaybeGenerateReadBarrier(HInstruction* instruction,
+                                Location out,
+                                Location ref,
+                                Location obj,
+                                uint32_t offset,
+                                Location index = Location::NoLocation());
+
+  // Generate a read barrier for a GC root within `instruction`.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root);
+
  private:
+  Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
+
+  struct PcRelativeDexCacheAccessInfo {
+    PcRelativeDexCacheAccessInfo(const DexFile& dex_file, uint32_t element_off)
+        : target_dex_file(dex_file), element_offset(element_off), label() { }
+
+    const DexFile& target_dex_file;
+    uint32_t element_offset;
+    // NOTE: Label is bound to the end of the instruction that has an embedded 32-bit offset.
+    Label label;
+  };
+
   // Labels for each block that will be compiled.
   Label* block_labels_;  // Indexed by block id.
   Label frame_entry_label_;
@@ -410,6 +465,8 @@ class CodeGeneratorX86 : public CodeGenerator {
   // Method patch info. Using ArenaDeque<> which retains element addresses on push/emplace_back().
   ArenaDeque<MethodPatchInfo<Label>> method_patches_;
   ArenaDeque<MethodPatchInfo<Label>> relative_call_patches_;
+  // PC-relative DexCache access info.
+  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
 
   // Offset to the start of the constant area in the assembled code.
   // Used for fixups to the constant area.
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ee8a299c5e..dcc180804d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -34,6 +34,9 @@
 
 namespace art {
 
+template<class MirrorType>
+class GcRoot;
+
 namespace x86_64 {
 
 static constexpr int kCurrentMethodStackOffset = 0;
@@ -52,16 +55,16 @@ class NullCheckSlowPathX86_64 : public SlowPathCode {
   explicit NullCheckSlowPathX86_64(HNullCheck* instruction) : instruction_(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     if (instruction_->CanThrowIntoCatchBlock()) {
       // Live registers will be restored in the catch block if caught.
       SaveLiveRegisters(codegen, instruction_->GetLocations());
     }
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowNullPointer),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowNullPointer),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -78,16 +81,16 @@ class DivZeroCheckSlowPathX86_64 : public SlowPathCode {
   explicit DivZeroCheckSlowPathX86_64(HDivZeroCheck* instruction) : instruction_(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     if (instruction_->CanThrowIntoCatchBlock()) {
       // Live registers will be restored in the catch block if caught.
       SaveLiveRegisters(codegen, instruction_->GetLocations());
     }
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowDivZero),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowDivZero),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -139,18 +142,18 @@ class SuspendCheckSlowPathX86_64 : public SlowPathCode {
       : instruction_(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, instruction_->GetLocations());
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
     RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
-      __ jmp(x64_codegen->GetLabelOf(successor_));
+      __ jmp(x86_64_codegen->GetLabelOf(successor_));
     }
   }
 
@@ -180,7 +183,7 @@ class BoundsCheckSlowPathX86_64 : public SlowPathCode {
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     if (instruction_->CanThrowIntoCatchBlock()) {
       // Live registers will be restored in the catch block if caught.
@@ -196,8 +199,10 @@ class BoundsCheckSlowPathX86_64 : public SlowPathCode {
         locations->InAt(1),
         Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
         Primitive::kPrimInt);
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowArrayBounds),
-                               instruction_, instruction_->GetDexPc(), this);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowArrayBounds),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
   }
 
   bool IsFatal() const OVERRIDE { return true; }
@@ -222,22 +227,25 @@ class LoadClassSlowPathX86_64 : public SlowPathCode {
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = at_->GetLocations();
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
 
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     __ movl(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(cls_->GetTypeIndex()));
-    x64_codegen->InvokeRuntime(do_clinit_ ? QUICK_ENTRY_POINT(pInitializeStaticStorage)
-                                          : QUICK_ENTRY_POINT(pInitializeType),
-                                          at_, dex_pc_, this);
+    x86_64_codegen->InvokeRuntime(do_clinit_ ?
+                                      QUICK_ENTRY_POINT(pInitializeStaticStorage) :
+                                      QUICK_ENTRY_POINT(pInitializeType),
+                                  at_,
+                                  dex_pc_,
+                                  this);
 
     Location out = locations->Out();
     // Move the class to the desired location.
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
-      x64_codegen->Move(out, Location::RegisterLocation(RAX));
+      x86_64_codegen->Move(out, Location::RegisterLocation(RAX));
     }
 
     RestoreLiveRegisters(codegen, locations);
@@ -271,18 +279,18 @@ class LoadStringSlowPathX86_64 : public SlowPathCode {
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
 
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     __ movl(CpuRegister(calling_convention.GetRegisterAt(0)),
             Immediate(instruction_->GetStringIndex()));
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pResolveString),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
-    x64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pResolveString),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
+    x86_64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -308,18 +316,9 @@ class TypeCheckSlowPathX86_64 : public SlowPathCode {
     DCHECK(instruction_->IsCheckCast()
            || !locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
 
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
 
-    if (instruction_->IsCheckCast()) {
-      // The codegen for the instruction overwrites `temp`, so put it back in place.
-      CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
-      CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
-      uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-      __ movl(temp, Address(obj, class_offset));
-      __ MaybeUnpoisonHeapReference(temp);
-    }
-
     if (!is_fatal_) {
       SaveLiveRegisters(codegen, locations);
     }
@@ -336,21 +335,24 @@ class TypeCheckSlowPathX86_64 : public SlowPathCode {
         Primitive::kPrimNot);
 
     if (instruction_->IsInstanceOf()) {
-      x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                                 instruction_,
-                                 dex_pc,
-                                 this);
+      x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
+                                    instruction_,
+                                    dex_pc,
+                                    this);
+      CheckEntrypointTypes<
+          kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>();
     } else {
       DCHECK(instruction_->IsCheckCast());
-      x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                                 instruction_,
-                                 dex_pc,
-                                 this);
+      x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
+                                    instruction_,
+                                    dex_pc,
+                                    this);
+      CheckEntrypointTypes<kQuickCheckCast, void, const mirror::Class*, const mirror::Class*>();
     }
 
     if (!is_fatal_) {
       if (instruction_->IsInstanceOf()) {
-        x64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
+        x86_64_codegen->Move(locations->Out(), Location::RegisterLocation(RAX));
       }
 
       RestoreLiveRegisters(codegen, locations);
@@ -375,15 +377,15 @@ class DeoptimizationSlowPathX86_64 : public SlowPathCode {
       : instruction_(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, instruction_->GetLocations());
     DCHECK(instruction_->IsDeoptimize());
     HDeoptimize* deoptimize = instruction_->AsDeoptimize();
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize),
-                               deoptimize,
-                               deoptimize->GetDexPc(),
-                               this);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize),
+                                  deoptimize,
+                                  deoptimize->GetDexPc(),
+                                  this);
   }
 
   const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86_64"; }
@@ -421,11 +423,11 @@ class ArraySetSlowPathX86_64 : public SlowPathCode {
         nullptr);
     codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
 
-    CodeGeneratorX86_64* x64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
-    x64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pAputObject),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pAputObject),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
@@ -438,6 +440,219 @@ class ArraySetSlowPathX86_64 : public SlowPathCode {
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathX86_64);
 };
 
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathX86_64 : public SlowPathCode {
+ public:
+  ReadBarrierForHeapReferenceSlowPathX86_64(HInstruction* instruction,
+                                            Location out,
+                                            Location ref,
+                                            Location obj,
+                                            uint32_t offset,
+                                            Location index)
+      : instruction_(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial
+    // object has been overwritten by (or after) the heap object
+    // reference load to be instrumented, e.g.:
+    //
+    //   __ movl(out, Address(out, offset));
+    //   codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    CpuRegister reg_out = out_.AsRegister<CpuRegister>();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out.AsRegister())) << out_;
+    DCHECK(!instruction_->IsInvoke() ||
+           (instruction_->IsInvokeStaticOrDirect() &&
+            instruction_->GetLocations()->Intrinsified()));
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like other "inputs" of this slow path),
+    // introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        Register index_reg = index_.AsRegister<CpuRegister>().AsRegister();
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg));
+        if (codegen->IsCoreCalleeSaveRegister(index_reg)) {
+          // We are about to change the value of `index_reg` (see the
+          // calls to art::x86_64::X86_64Assembler::shll and
+          // art::x86_64::X86_64Assembler::AddImmediate below), but it
+          // has not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          Register free_reg = FindAvailableCallerSaveRegister(codegen).AsRegister();
+          __ movl(CpuRegister(free_reg), CpuRegister(index_reg));
+          index_reg = free_reg;
+          index = Location::RegisterLocation(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the
+        // scale factor (2) cannot overflow in practice, as the
+        // runtime is unable to allocate object arrays with a size
+        // larger than 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ shll(CpuRegister(index_reg), Immediate(TIMES_4));
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ AddImmediate(CpuRegister(index_reg), Immediate(offset_));
+      } else {
+        DCHECK(instruction_->IsInvoke());
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegister());
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            Location::RegisterLocation(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ movl(CpuRegister(calling_convention.GetRegisterAt(2)), Immediate(offset_));
+    }
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierSlow),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    x86_64_codegen->Move(out_, Location::RegisterLocation(RAX));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "ReadBarrierForHeapReferenceSlowPathX86_64";
+  }
+
+ private:
+  CpuRegister FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<size_t>(ref_.AsRegister<CpuRegister>().AsRegister());
+    size_t obj = static_cast<size_t>(obj_.AsRegister<CpuRegister>().AsRegister());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return static_cast<CpuRegister>(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on x86-64
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  HInstruction* const instruction_;
+  const Location out_;  // Receives the entry point's result (moved from RAX).
+  const Location ref_;  // Passed as the first runtime argument.
+  const Location obj_;  // Passed as the second runtime argument.
+  const uint32_t offset_;  // Third argument if no index; else added to the scaled array index.
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathX86_64);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathX86_64 : public SlowPathCode {
+ public:
+  ReadBarrierForRootSlowPathX86_64(HInstruction* instruction, Location out, Location root)
+      : instruction_(instruction), out_(out), root_(root) { DCHECK(kEmitCompilerReadBarrier); }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg()));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString());
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConvention calling_convention;
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    x86_64_codegen->Move(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_);
+    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierForRootSlow),
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    x86_64_codegen->Move(out_, Location::RegisterLocation(RAX));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathX86_64"; }
+
+ private:
+  HInstruction* const instruction_;
+  const Location out_;  // Receives the entry point's result (moved from RAX).
+  const Location root_;  // Passed as the only runtime argument.
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathX86_64);
+};
+
 #undef __
 #define __ down_cast<X86_64Assembler*>(GetAssembler())->
 
@@ -503,7 +718,7 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo
                     Address::Absolute(invoke->GetStringInitOffset(), true));
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movq(temp.AsRegister<CpuRegister>(), Immediate(invoke->GetMethodAddress()));
@@ -514,15 +729,15 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo
       __ Bind(&method_patches_.back().label);  // Bind the label at the end of the "movl" insn.
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
-      pc_rel_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
-                                             invoke->GetDexCacheArrayOffset());
+      pc_relative_dex_cache_patches_.emplace_back(*invoke->GetTargetMethod().dex_file,
+                                                  invoke->GetDexCacheArrayOffset());
       __ movq(temp.AsRegister<CpuRegister>(),
               Address::Absolute(kDummy32BitOffset, false /* no_rip */));
       // Bind the label at the end of the "movl" insn.
-      __ Bind(&pc_rel_dex_cache_patches_.back().label);
+      __ Bind(&pc_relative_dex_cache_patches_.back().label);
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
-      Location current_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodInputIndex());
+      Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register method_reg;
       CpuRegister reg = temp.AsRegister<CpuRegister>();
       if (current_method.IsRegister()) {
@@ -533,7 +748,7 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo
         method_reg = reg.AsRegister();
         __ movq(reg, Address(CpuRegister(RSP), kCurrentMethodStackOffset));
       }
-      // temp = temp->dex_cache_resolved_methods_;
+      // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_;
       __ movq(reg,
               Address(CpuRegister(method_reg),
                       ArtMethod::DexCacheResolvedMethodsOffset(kX86_64PointerSize).SizeValue()));
@@ -578,10 +793,17 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
-  // temp = object->GetClass();
   DCHECK(receiver.IsRegister());
+  // /* HeapReference<Class> */ temp = receiver->klass_
   __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetMethodAt(method_offset);
   __ movq(temp, Address(temp, method_offset));
@@ -593,28 +815,27 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t
 void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
-      method_patches_.size() + relative_call_patches_.size() + pc_rel_dex_cache_patches_.size();
+      method_patches_.size() +
+      relative_call_patches_.size() +
+      pc_relative_dex_cache_patches_.size();
   linker_patches->reserve(size);
+  // The labels point to the end of the "movl"/"call"/"mov" insns but the literal offsets for the
+  // patches need to point to the embedded constants which occupy the last 4 bytes.
+  constexpr uint32_t kLabelPositionToLiteralOffsetAdjustment = 4u;
   for (const MethodPatchInfo<Label>& info : method_patches_) {
-    // The label points to the end of the "movl" instruction but the literal offset for method
-    // patch x86 needs to point to the embedded constant which occupies the last 4 bytes.
-    uint32_t literal_offset = info.label.Position() - 4;
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(LinkerPatch::MethodPatch(literal_offset,
                                                        info.target_method.dex_file,
                                                        info.target_method.dex_method_index));
   }
   for (const MethodPatchInfo<Label>& info : relative_call_patches_) {
-    // The label points to the end of the "call" instruction but the literal offset for method
-    // patch x86 needs to point to the embedded constant which occupies the last 4 bytes.
-    uint32_t literal_offset = info.label.Position() - 4;
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(LinkerPatch::RelativeCodePatch(literal_offset,
                                                              info.target_method.dex_file,
                                                              info.target_method.dex_method_index));
   }
-  for (const PcRelativeDexCacheAccessInfo& info : pc_rel_dex_cache_patches_) {
-    // The label points to the end of the "mov" instruction but the literal offset for method
-    // patch x86 needs to point to the embedded constant which occupies the last 4 bytes.
-    uint32_t literal_offset = info.label.Position() - 4;
+  for (const PcRelativeDexCacheAccessInfo& info : pc_relative_dex_cache_patches_) {
+    uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(literal_offset,
                                                               &info.target_dex_file,
                                                               info.label.Position(),
@@ -673,9 +894,9 @@ static constexpr int kNumberOfCpuRegisterPairs = 0;
 // Use a fake return address register to mimic Quick.
 static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1);
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph,
-                const X86_64InstructionSetFeatures& isa_features,
-                const CompilerOptions& compiler_options,
-                OptimizingCompilerStats* stats)
+                                         const X86_64InstructionSetFeatures& isa_features,
+                                         const CompilerOptions& compiler_options,
+                                         OptimizingCompilerStats* stats)
       : CodeGenerator(graph,
                       kNumberOfCpuRegisters,
                       kNumberOfFloatRegisters,
@@ -695,7 +916,7 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph,
         constant_area_start_(0),
         method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -729,7 +950,7 @@ Location CodeGeneratorX86_64::AllocateFreeRegister(Primitive::Type type) const {
       LOG(FATAL) << "Unreachable type " << type;
   }
 
-  return Location();
+  return Location::NoLocation();
 }
 
 void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const {
@@ -1083,26 +1304,19 @@ void InstructionCodeGeneratorX86_64::GenerateFPJumps(HCondition* cond,
   __ j(X86_64FPCondition(cond->GetCondition()), true_label);
 }
 
-void InstructionCodeGeneratorX86_64::GenerateCompareTestAndBranch(HIf* if_instr,
-                                                                  HCondition* condition,
-                                                                  Label* true_target,
-                                                                  Label* false_target,
-                                                                  Label* always_true_target) {
+void InstructionCodeGeneratorX86_64::GenerateCompareTestAndBranch(HCondition* condition,
+                                                                  Label* true_target_in,
+                                                                  Label* false_target_in) {
+  // Generated branching requires both targets to be explicit. If either of the
+  // targets is nullptr (fallthrough) use and bind `fallthrough_target` instead.
+  Label fallthrough_target;
+  Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in;
+  Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in;
+
   LocationSummary* locations = condition->GetLocations();
   Location left = locations->InAt(0);
   Location right = locations->InAt(1);
 
-  // We don't want true_target as a nullptr.
-  if (true_target == nullptr) {
-    true_target = always_true_target;
-  }
-  bool falls_through = (false_target == nullptr);
-
-  // FP compares don't like null false_targets.
-  if (false_target == nullptr) {
-    false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  }
-
   Primitive::Type type = condition->InputAt(0)->GetType();
   switch (type) {
     case Primitive::kPrimLong: {
@@ -1161,135 +1375,140 @@ void InstructionCodeGeneratorX86_64::GenerateCompareTestAndBranch(HIf* if_instr,
       LOG(FATAL) << "Unexpected condition type " << type;
   }
 
-  if (!falls_through) {
+  if (false_target != &fallthrough_target) {
     __ jmp(false_target);
   }
+
+  if (fallthrough_target.IsLinked()) {
+    __ Bind(&fallthrough_target);
+  }
+}
+
+static bool AreEflagsSetFrom(HInstruction* cond, HInstruction* branch) {
+  // Moves may affect the eflags register (move zero uses xorl), so the EFLAGS
+  // are set only strictly before `branch`. We can't use the eflags on FP
+  // conditions if they are materialized due to the complex branching.
+  return cond->IsCondition() &&
+         cond->GetNext() == branch &&  // `cond` immediately precedes `branch`.
+         !Primitive::IsFloatingPointType(cond->InputAt(0)->GetType());
 }
 
 void InstructionCodeGeneratorX86_64::GenerateTestAndBranch(HInstruction* instruction,
+                                                           size_t condition_input_index,
                                                            Label* true_target,
-                                                           Label* false_target,
-                                                           Label* always_true_target) {
-  HInstruction* cond = instruction->InputAt(0);
-  if (cond->IsIntConstant()) {
+                                                           Label* false_target) {
+  HInstruction* cond = instruction->InputAt(condition_input_index);
+
+  if (true_target == nullptr && false_target == nullptr) {
+    // Nothing to do. The code always falls through.
+    return;
+  } else if (cond->IsIntConstant()) {
     // Constant condition, statically compared against 1.
-    int32_t cond_value = cond->AsIntConstant()->GetValue();
-    if (cond_value == 1) {
-      if (always_true_target != nullptr) {
-        __ jmp(always_true_target);
+    if (cond->AsIntConstant()->IsOne()) {
+      if (true_target != nullptr) {
+        __ jmp(true_target);
       }
-      return;
     } else {
-      DCHECK_EQ(cond_value, 0);
+      DCHECK(cond->AsIntConstant()->IsZero());
+      if (false_target != nullptr) {
+        __ jmp(false_target);
+      }
     }
-  } else {
-    HCondition* condition = cond->AsCondition();
-    bool is_materialized = condition == nullptr || condition->NeedsMaterialization();
-    // Moves do not affect the eflags register, so if the condition is
-    // evaluated just before the if, we don't need to evaluate it
-    // again.  We can't use the eflags on FP conditions if they are
-    // materialized due to the complex branching.
-    Primitive::Type type = (condition != nullptr)
-        ? cond->InputAt(0)->GetType()
-        : Primitive::kPrimInt;
-    bool eflags_set = condition != nullptr
-        && condition->IsBeforeWhenDisregardMoves(instruction)
-        && !Primitive::IsFloatingPointType(type);
-    // Can we optimize the jump if we know that the next block is the true case?
-    bool can_jump_to_false = CanReverseCondition(always_true_target, false_target, condition);
-
-    if (is_materialized) {
-      if (!eflags_set) {
-        // Materialized condition, compare against 0.
-        Location lhs = instruction->GetLocations()->InAt(0);
-        if (lhs.IsRegister()) {
-          __ testl(lhs.AsRegister<CpuRegister>(), lhs.AsRegister<CpuRegister>());
-        } else {
-          __ cmpl(Address(CpuRegister(RSP), lhs.GetStackIndex()),
-                  Immediate(0));
-        }
-        if (can_jump_to_false) {
-          __ j(kEqual, false_target);
-          return;
-        }
-        __ j(kNotEqual, true_target);
+    return;
+  }
+
+  // The following code generates these patterns:
+  //  (1) true_target == nullptr && false_target != nullptr
+  //        - opposite condition true => branch to false_target
+  //  (2) true_target != nullptr && false_target == nullptr
+  //        - condition true => branch to true_target
+  //  (3) true_target != nullptr && false_target != nullptr
+  //        - condition true => branch to true_target
+  //        - branch to false_target
+  if (IsBooleanValueOrMaterializedCondition(cond)) {
+    if (AreEflagsSetFrom(cond, instruction)) {
+      if (true_target == nullptr) {
+        __ j(X86_64IntegerCondition(cond->AsCondition()->GetOppositeCondition()), false_target);
       } else {
-        if (can_jump_to_false) {
-          __ j(X86_64IntegerCondition(condition->GetOppositeCondition()), false_target);
-          return;
-        }
-        __ j(X86_64IntegerCondition(condition->GetCondition()), true_target);
+        __ j(X86_64IntegerCondition(cond->AsCondition()->GetCondition()), true_target);
       }
     } else {
-      // Condition has not been materialized, use its inputs as the
-      // comparison and its condition as the branch condition.
-
-      // Is this a long or FP comparison that has been folded into the HCondition?
-      if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
-        // Generate the comparison directly.
-        GenerateCompareTestAndBranch(instruction->AsIf(), condition,
-                                     true_target, false_target, always_true_target);
-        return;
+      // Materialized condition, compare against 0.
+      Location lhs = instruction->GetLocations()->InAt(condition_input_index);
+      if (lhs.IsRegister()) {
+        __ testl(lhs.AsRegister<CpuRegister>(), lhs.AsRegister<CpuRegister>());
+      } else {
+        __ cmpl(Address(CpuRegister(RSP), lhs.GetStackIndex()), Immediate(0));
       }
-
-      Location lhs = cond->GetLocations()->InAt(0);
-      Location rhs = cond->GetLocations()->InAt(1);
-      if (rhs.IsRegister()) {
-        __ cmpl(lhs.AsRegister<CpuRegister>(), rhs.AsRegister<CpuRegister>());
-      } else if (rhs.IsConstant()) {
-        int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
-        if (constant == 0) {
-          __ testl(lhs.AsRegister<CpuRegister>(), lhs.AsRegister<CpuRegister>());
-        } else {
-          __ cmpl(lhs.AsRegister<CpuRegister>(), Immediate(constant));
-        }
+      if (true_target == nullptr) {
+        __ j(kEqual, false_target);
       } else {
-        __ cmpl(lhs.AsRegister<CpuRegister>(),
-                Address(CpuRegister(RSP), rhs.GetStackIndex()));
+        __ j(kNotEqual, true_target);
       }
+    }
+  } else {
+    // Condition has not been materialized, use its inputs as the
+    // comparison and its condition as the branch condition.
+    HCondition* condition = cond->AsCondition();
 
-      if (can_jump_to_false) {
-        __ j(X86_64IntegerCondition(condition->GetOppositeCondition()), false_target);
-        return;
-      }
+    // If this is a long or FP comparison that has been folded into
+    // the HCondition, generate the comparison directly.
+    Primitive::Type type = condition->InputAt(0)->GetType();
+    if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) {
+      GenerateCompareTestAndBranch(condition, true_target, false_target);
+      return;
+    }
 
+    Location lhs = condition->GetLocations()->InAt(0);
+    Location rhs = condition->GetLocations()->InAt(1);
+    if (rhs.IsRegister()) {
+      __ cmpl(lhs.AsRegister<CpuRegister>(), rhs.AsRegister<CpuRegister>());
+    } else if (rhs.IsConstant()) {
+      int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
+      if (constant == 0) {
+        __ testl(lhs.AsRegister<CpuRegister>(), lhs.AsRegister<CpuRegister>());
+      } else {
+        __ cmpl(lhs.AsRegister<CpuRegister>(), Immediate(constant));
+      }
+    } else {
+      __ cmpl(lhs.AsRegister<CpuRegister>(),
+              Address(CpuRegister(RSP), rhs.GetStackIndex()));
+    }
+    if (true_target == nullptr) {
+      __ j(X86_64IntegerCondition(condition->GetOppositeCondition()), false_target);
+    } else {
       __ j(X86_64IntegerCondition(condition->GetCondition()), true_target);
     }
   }
-  if (false_target != nullptr) {
+
+  // If neither branch falls through (case 3), the conditional branch to `true_target`
+  // was already emitted above (as in case 2); emit the unconditional jump to `false_target`.
+  if (true_target != nullptr && false_target != nullptr) {
     __ jmp(false_target);
   }
 }
 
 void LocationsBuilderX86_64::VisitIf(HIf* if_instr) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(if_instr, LocationSummary::kNoCall);
-  HInstruction* cond = if_instr->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
+  if (IsBooleanValueOrMaterializedCondition(if_instr->InputAt(0))) {  // Input may be tested.
     locations->SetInAt(0, Location::Any());
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitIf(HIf* if_instr) {
-  Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
-  Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
-  Label* always_true_target = true_target;
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfTrueSuccessor())) {
-    always_true_target = nullptr;
-  }
-  if (codegen_->GoesToNextBlock(if_instr->GetBlock(),
-                                if_instr->IfFalseSuccessor())) {
-    false_target = nullptr;
-  }
-  GenerateTestAndBranch(if_instr, true_target, false_target, always_true_target);
+  HBasicBlock* true_successor = if_instr->IfTrueSuccessor();
+  HBasicBlock* false_successor = if_instr->IfFalseSuccessor();
+  Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ?
+      nullptr : codegen_->GetLabelOf(true_successor);  // nullptr: fall through to next block.
+  Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ?
+      nullptr : codegen_->GetLabelOf(false_successor);  // nullptr: fall through to next block.
+  GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target);
 }
 
 void LocationsBuilderX86_64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
-  HInstruction* cond = deoptimize->InputAt(0);
-  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
+  if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {  // Input may be tested.
     locations->SetInAt(0, Location::Any());
   }
 }
@@ -1298,8 +1517,10 @@ void InstructionCodeGeneratorX86_64::VisitDeoptimize(HDeoptimize* deoptimize) {
   SlowPathCode* slow_path = new (GetGraph()->GetArena())
       DeoptimizationSlowPathX86_64(deoptimize);
   codegen_->AddSlowPath(slow_path);
-  Label* slow_path_entry = slow_path->GetEntryLabel();
-  GenerateTestAndBranch(deoptimize, slow_path_entry, nullptr, slow_path_entry);
+  GenerateTestAndBranch(deoptimize,
+                        /* condition_input_index */ 0,
+                        slow_path->GetEntryLabel(),
+                        /* false_target */ nullptr);
 }
 
 void LocationsBuilderX86_64::VisitLocal(HLocal* local) {
@@ -1837,7 +2058,7 @@ Location InvokeDexCallingConventionVisitorX86_64::GetNextLocation(Primitive::Typ
       LOG(FATAL) << "Unexpected parameter type " << type;
       break;
   }
-  return Location();
+  return Location::NoLocation();
 }
 
 void LocationsBuilderX86_64::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
@@ -1908,7 +2129,6 @@ void InstructionCodeGeneratorX86_64::VisitInvokeVirtual(HInvokeVirtual* invoke)
   }
 
   codegen_->GenerateVirtualCall(invoke, invoke->GetLocations()->GetTemp(0));
-
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
@@ -1921,31 +2141,41 @@ void LocationsBuilderX86_64::VisitInvokeInterface(HInvokeInterface* invoke) {
 
 void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  CpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<CpuRegister>();
+  LocationSummary* locations = invoke->GetLocations();
+  CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
+  CpuRegister hidden_reg = locations->GetTemp(1).AsRegister<CpuRegister>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset(
       invoke->GetImtIndex() % mirror::Class::kImtSize, kX86_64PointerSize).Uint32Value();
-  LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
-  // Set the hidden argument.
-  CpuRegister hidden_reg = invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>();
+  // Set the hidden argument. It is safe to do this here, as RAX
+  // won't be modified thereafter, before the `call` instruction.
+  DCHECK_EQ(RAX, hidden_reg.AsRegister());
   codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
 
-  // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ movl(temp, Address(CpuRegister(RSP), receiver.GetStackIndex()));
+    // /* HeapReference<Class> */ temp = temp->klass_
     __ movl(temp, Address(temp, class_offset));
   } else {
+    // /* HeapReference<Class> */ temp = receiver->klass_
     __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   }
   codegen_->MaybeRecordImplicitNullCheck(invoke);
+  // Instead of simply (possibly) unpoisoning `temp` here, we should
+  // emit a read barrier for the previous class reference load.
+  // However this is not required in practice, as this is an
+  // intermediate/temporary reference and because the current
+  // concurrent copying collector keeps the from-space memory
+  // intact/accessible until the end of the marking phase (the
+  // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
   // temp = temp->GetImtEntryAt(method_offset);
   __ movq(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
-  __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86_64WordSize).SizeValue()));
+  __ call(Address(temp,
+                  ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64WordSize).SizeValue()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -3687,13 +3917,23 @@ void InstructionCodeGeneratorX86_64::GenerateMemoryBarrier(MemBarrierKind kind)
 void LocationsBuilderX86_64::HandleFieldGet(HInstruction* instruction) {
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
 
+  bool object_field_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_field_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister());
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps for an object field get when read barriers
+    // are enabled: the load must not overwrite the object's location,
+    // as the read barrier emitted after the load still needs it.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_field_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
@@ -3702,7 +3942,8 @@ void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction,
   DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
 
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister base = locations->InAt(0).AsRegister<CpuRegister>();
+  Location base_loc = locations->InAt(0);
+  CpuRegister base = base_loc.AsRegister<CpuRegister>();
   Location out = locations->Out();
   bool is_volatile = field_info.IsVolatile();
   Primitive::Type field_type = field_info.GetFieldType();
@@ -3762,7 +4003,7 @@ void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction,
   }
 
   if (field_type == Primitive::kPrimNot) {
-    __ MaybeUnpoisonHeapReference(out.AsRegister<CpuRegister>());
+    codegen_->MaybeGenerateReadBarrier(instruction, out, out, base_loc, offset);
   }
 }
 
@@ -4080,20 +4321,31 @@ void InstructionCodeGeneratorX86_64::VisitNullCheck(HNullCheck* instruction) {
 }
 
 void LocationsBuilderX86_64::VisitArrayGet(HArrayGet* instruction) {
+  bool object_array_get_with_read_barrier =
+      kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot);
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+      new (GetGraph()->GetArena()) LocationSummary(instruction,
+                                                   object_array_get_with_read_barrier ?
+                                                       LocationSummary::kCallOnSlowPath :
+                                                       LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
     locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
   } else {
-    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+    // The output overlaps for an object array get when read barriers
+    // are enabled: the load must not overwrite the array's location,
+    // as the read barrier emitted after the load still needs it.
+    locations->SetOut(
+        Location::RequiresRegister(),
+        object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
+  Location obj_loc = locations->InAt(0);
+  CpuRegister obj = obj_loc.AsRegister<CpuRegister>();
   Location index = locations->InAt(1);
   Primitive::Type type = instruction->GetType();
 
@@ -4148,8 +4400,9 @@ void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      static_assert(sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes.");
+      static_assert(
+          sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+          "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
@@ -4204,8 +4457,17 @@ void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 
   if (type == Primitive::kPrimNot) {
-    CpuRegister out = locations->Out().AsRegister<CpuRegister>();
-    __ MaybeUnpoisonHeapReference(out);
+    static_assert(
+        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+    uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
+    Location out = locations->Out();
+    if (index.IsConstant()) {
+      uint32_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset);
+    } else {
+      codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, data_offset, index);
+    }
   }
 }
 
@@ -4215,10 +4477,14 @@ void LocationsBuilderX86_64::VisitArraySet(HArraySet* instruction) {
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
   bool may_need_runtime_call = instruction->NeedsTypeCheck();
+  bool object_array_set_with_read_barrier =
+      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      may_need_runtime_call ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall);
+      (may_need_runtime_call || object_array_set_with_read_barrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall);
 
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
@@ -4230,18 +4496,24 @@ void LocationsBuilderX86_64::VisitArraySet(HArraySet* instruction) {
 
   if (needs_write_barrier) {
     // Temporary registers for the write barrier.
-    locations->AddTemp(Location::RequiresRegister());  // Possibly used for ref. poisoning too.
+
+    // This first temporary register is possibly used for heap
+    // reference poisoning and/or read barrier emission too.
+    locations->AddTemp(Location::RequiresRegister());
+    // This second temporary register is possibly used for read
+    // barrier emission too.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister array = locations->InAt(0).AsRegister<CpuRegister>();
+  Location array_loc = locations->InAt(0);
+  CpuRegister array = array_loc.AsRegister<CpuRegister>();
   Location index = locations->InAt(1);
   Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
-  bool may_need_runtime_call = locations->CanCall();
+  bool may_need_runtime_call = instruction->NeedsTypeCheck();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
@@ -4285,6 +4557,7 @@ void InstructionCodeGeneratorX86_64::VisitArraySet(HArraySet* instruction) {
       Address address = index.IsConstant()
           ? Address(array, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + offset)
           : Address(array, index.AsRegister<CpuRegister>(), TIMES_4, offset);
+
       if (!value.IsRegister()) {
         // Just setting null.
         DCHECK(instruction->InputAt(2)->IsNullConstant());
@@ -4313,22 +4586,62 @@ void InstructionCodeGeneratorX86_64::VisitArraySet(HArraySet* instruction) {
           __ Bind(&not_null);
         }
 
-        __ movl(temp, Address(array, class_offset));
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ MaybeUnpoisonHeapReference(temp);
-        __ movl(temp, Address(temp, component_offset));
-        // No need to poison/unpoison, we're comparing two poisoned references.
-        __ cmpl(temp, Address(register_value, class_offset));
-        if (instruction->StaticTypeOfArrayIsObjectArray()) {
-          __ j(kEqual, &do_put);
-          __ MaybeUnpoisonHeapReference(temp);
-          __ movl(temp, Address(temp, super_offset));
-          // No need to unpoison the result, we're comparing against null.
-          __ testl(temp, temp);
-          __ j(kNotEqual, slow_path->GetEntryLabel());
-          __ Bind(&do_put);
+        if (kEmitCompilerReadBarrier) {
+          // When read barriers are enabled, the type checking
+          // instrumentation requires two read barriers:
+          //
+          //   __ movl(temp2, temp);
+          //   // /* HeapReference<Class> */ temp = temp->component_type_
+          //   __ movl(temp, Address(temp, component_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+          //
+          //   // /* HeapReference<Class> */ temp2 = register_value->klass_
+          //   __ movl(temp2, Address(register_value, class_offset));
+          //   codegen_->GenerateReadBarrier(
+          //       instruction, temp2_loc, temp2_loc, value, class_offset, temp_loc);
+          //
+          //   __ cmpl(temp, temp2);
+          //
+          // However, the second read barrier may trash `temp`, as it
+          // is a temporary register, and as such would not be saved
+          // along with live registers before calling the runtime (nor
+          // restored afterwards).  So in this case, we bail out and
+          // delegate the work to the array set slow path.
+          //
+          // TODO: Extend the register allocator to support a new
+          // "(locally) live temp" location so as to avoid always
+          // going into the slow path when read barriers are enabled.
+          __ jmp(slow_path->GetEntryLabel());
         } else {
-          __ j(kNotEqual, slow_path->GetEntryLabel());
+          // /* HeapReference<Class> */ temp = array->klass_
+          __ movl(temp, Address(array, class_offset));
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ MaybeUnpoisonHeapReference(temp);
+
+          // /* HeapReference<Class> */ temp = temp->component_type_
+          __ movl(temp, Address(temp, component_offset));
+          // If heap poisoning is enabled, no need to unpoison `temp`
+          // nor the object reference in `register_value->klass`, as
+          // we are comparing two poisoned references.
+          __ cmpl(temp, Address(register_value, class_offset));
+
+          if (instruction->StaticTypeOfArrayIsObjectArray()) {
+            __ j(kEqual, &do_put);
+            // If heap poisoning is enabled, the `temp` reference has
+            // not been unpoisoned yet; unpoison it now.
+            __ MaybeUnpoisonHeapReference(temp);
+
+            // /* HeapReference<Class> */ temp = temp->super_class_
+            __ movl(temp, Address(temp, super_offset));
+            // If heap poisoning is enabled, no need to unpoison
+            // `temp`, as we are comparing against null below.
+            __ testl(temp, temp);
+            __ j(kNotEqual, slow_path->GetEntryLabel());
+            __ Bind(&do_put);
+          } else {
+            __ j(kNotEqual, slow_path->GetEntryLabel());
+          }
         }
       }
 
@@ -4354,6 +4667,7 @@ void InstructionCodeGeneratorX86_64::VisitArraySet(HArraySet* instruction) {
 
       break;
     }
+
     case Primitive::kPrimInt: {
       uint32_t offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
       Address address = index.IsConstant()
@@ -4803,7 +5117,8 @@ void LocationsBuilderX86_64::VisitLoadClass(HLoadClass* cls) {
   CodeGenerator::CreateLoadClassLocationSummary(
       cls,
       Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
-      Location::RegisterLocation(RAX));
+      Location::RegisterLocation(RAX),
+      /* code_generator_supports_read_barrier */ true);
 }
 
 void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) {
@@ -4817,18 +5132,40 @@ void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) {
     return;
   }
 
-  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  Location out_loc = locations->Out();
+  CpuRegister out = out_loc.AsRegister<CpuRegister>();
   CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>();
+
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
-    __ movl(out, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
+    uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+      __ leaq(out, Address(current_method, declaring_class_offset));
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+      __ movl(out, Address(current_method, declaring_class_offset));
+    }
   } else {
     DCHECK(cls->CanCallRuntime());
-    __ movq(out, Address(
-        current_method, ArtMethod::DexCacheResolvedTypesOffset(kX86_64PointerSize).Int32Value()));
-    __ movl(out, Address(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
-    // TODO: We will need a read barrier here.
+    // /* GcRoot<mirror::Class>[] */ out =
+    //        current_method.ptr_sized_fields_->dex_cache_resolved_types_
+    __ movq(out, Address(current_method,
+                         ArtMethod::DexCacheResolvedTypesOffset(kX86_64PointerSize).Int32Value()));
+
+    size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex());
+    if (kEmitCompilerReadBarrier) {
+      // /* GcRoot<mirror::Class>* */ out = &out[type_index]
+      __ leaq(out, Address(out, cache_offset));
+      // /* mirror::Class* */ out = out->Read()
+      codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc);
+    } else {
+      // /* GcRoot<mirror::Class> */ out = out[type_index]
+      __ movl(out, Address(out, cache_offset));
+    }
 
     SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
         cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
@@ -4873,12 +5210,35 @@ void InstructionCodeGeneratorX86_64::VisitLoadString(HLoadString* load) {
   codegen_->AddSlowPath(slow_path);
 
   LocationSummary* locations = load->GetLocations();
-  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  Location out_loc = locations->Out();
+  CpuRegister out = out_loc.AsRegister<CpuRegister>();
   CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>();
-  __ movl(out, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
-  __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
-  __ movl(out, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
-  // TODO: We will need a read barrier here.
+
+  uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value();
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_)
+    __ leaq(out, Address(current_method, declaring_class_offset));
+    // /* mirror::Class* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
+    __ movl(out, Address(current_method, declaring_class_offset));
+  }
+
+  // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
+  __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
+
+  size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex());
+  if (kEmitCompilerReadBarrier) {
+    // /* GcRoot<mirror::String>* */ out = &out[string_index]
+    __ leaq(out, Address(out, cache_offset));
+    // /* mirror::String* */ out = out->Read()
+    codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc);
+  } else {
+    // /* GcRoot<mirror::String> */ out = out[string_index]
+    __ movl(out, Address(out, cache_offset));
+  }
+
   __ testl(out, out);
   __ j(kEqual, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
@@ -4922,40 +5282,44 @@ void InstructionCodeGeneratorX86_64::VisitThrow(HThrow* instruction) {
 
 void LocationsBuilderX86_64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = LocationSummary::kNoCall;
+      call_kind =
+          kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall;
       break;
+    case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
-      break;
-    case TypeCheckKind::kArrayCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
+
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::Any());
-    // Note that TypeCheckSlowPathX86_64 uses this register too.
-    locations->SetOut(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-    locations->SetOut(Location::RegisterLocation(RAX));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::Any());
+  // Note that TypeCheckSlowPathX86_64 uses this "out" register too.
+  locations->SetOut(Location::RequiresRegister());
+  // When read barriers are enabled, we need a temporary register for
+  // some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
+    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
+  Location obj_loc = locations->InAt(0);
+  CpuRegister obj = obj_loc.AsRegister<CpuRegister>();
   Location cls = locations->InAt(1);
-  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  Location out_loc =  locations->Out();
+  CpuRegister out = out_loc.AsRegister<CpuRegister>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
@@ -4970,15 +5334,9 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
     __ j(kEqual, &zero);
   }
 
-  // In case of an interface/unresolved check, we put the object class into the object register.
-  // This is safe, as the register is caller-save, and the object must be in another
-  // register if it survives the runtime call.
-  CpuRegister target = (instruction->GetTypeCheckKind() == TypeCheckKind::kInterfaceCheck) ||
-      (instruction->GetTypeCheckKind() == TypeCheckKind::kUnresolvedCheck)
-      ? obj
-      : out;
-  __ movl(target, Address(obj, class_offset));
-  __ MaybeUnpoisonHeapReference(target);
+  // /* HeapReference<Class> */ out = obj->klass_
+  __ movl(out, Address(obj, class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset);
 
   switch (instruction->GetTypeCheckKind()) {
     case TypeCheckKind::kExactCheck: {
@@ -5000,13 +5358,23 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop, success;
       __ Bind(&loop);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ movl(out, Address(out, super_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5023,6 +5391,7 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       NearLabel loop, success;
@@ -5034,8 +5403,17 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kEqual, &success);
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->super_class_
       __ movl(out, Address(out, super_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -5047,6 +5425,7 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
       NearLabel exact_check;
@@ -5057,9 +5436,18 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kEqual, &exact_check);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `out` into `temp` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
+        __ movl(temp, out);
+      }
+      // /* HeapReference<Class> */ out = out->component_type_
       __ movl(out, Address(out, component_offset));
-      __ MaybeUnpoisonHeapReference(out);
+      codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5070,6 +5458,7 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
       __ jmp(&done);
       break;
     }
+
     case TypeCheckKind::kArrayCheck: {
       if (cls.IsRegister()) {
         __ cmpl(out, cls.AsRegister<CpuRegister>());
@@ -5078,8 +5467,8 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
         __ cmpl(out, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       DCHECK(locations->OnlyCallsOnSlowPath());
-      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(
-          instruction, /* is_fatal */ false);
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(instruction,
+                                                                       /* is_fatal */ false);
       codegen_->AddSlowPath(slow_path);
       __ j(kNotEqual, slow_path->GetEntryLabel());
       __ movl(out, Immediate(1));
@@ -5088,13 +5477,25 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
       }
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
-    case TypeCheckKind::kInterfaceCheck:
-    default: {
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInstanceofNonTrivial),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+    case TypeCheckKind::kInterfaceCheck: {
+      // Note that we indeed only call on slow path, but we always go
+      // into the slow path for the unresolved & interface check
+      // cases.
+      //
+      // We cannot directly call the InstanceofNonTrivial runtime
+      // entry point without resorting to a type checking slow path
+      // here (i.e. by calling InvokeRuntime directly), as it would
+      // require assigning fixed registers to the inputs of this
+      // HInstanceOf instruction (following the runtime calling
+      // convention), which might be clobbered by the potential first
+      // read barrier emission at the beginning of this method.
+      DCHECK(locations->OnlyCallsOnSlowPath());
+      slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(instruction,
+                                                                       /* is_fatal */ false);
+      codegen_->AddSlowPath(slow_path);
+      __ jmp(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
         __ jmp(&done);
       }
@@ -5119,58 +5520,60 @@ void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
 void LocationsBuilderX86_64::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   bool throws_into_catch = instruction->CanThrowIntoCatchBlock();
-
-  switch (instruction->GetTypeCheckKind()) {
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kAbstractClassCheck:
     case TypeCheckKind::kClassHierarchyCheck:
     case TypeCheckKind::kArrayObjectCheck:
-      call_kind = throws_into_catch
-          ? LocationSummary::kCallOnSlowPath
-          : LocationSummary::kNoCall;
+      call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ?
+          LocationSummary::kCallOnSlowPath :
+          LocationSummary::kNoCall;  // In fact, call on a fatal (non-returning) slow path.
       break;
+    case TypeCheckKind::kArrayCheck:
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-      call_kind = LocationSummary::kCall;
-      break;
-    case TypeCheckKind::kArrayCheck:
       call_kind = LocationSummary::kCallOnSlowPath;
       break;
   }
-
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, call_kind);
-  if (call_kind != LocationSummary::kCall) {
-    locations->SetInAt(0, Location::RequiresRegister());
-    locations->SetInAt(1, Location::Any());
-    // Note that TypeCheckSlowPathX86_64 uses this register too.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::Any());
+  // Note that TypeCheckSlowPathX86_64 uses this "temp" register too.
+  locations->AddTemp(Location::RequiresRegister());
+  // When read barriers are enabled, we need an additional temporary
+  // register for some cases.
+  if (kEmitCompilerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck)) {
     locations->AddTemp(Location::RequiresRegister());
-  } else {
-    InvokeRuntimeCallingConvention calling_convention;
-    locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-    locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   }
 }
 
 void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
+  Location obj_loc = locations->InAt(0);
+  CpuRegister obj = obj_loc.AsRegister<CpuRegister>();
   Location cls = locations->InAt(1);
-  CpuRegister temp = locations->WillCall()
-      ? CpuRegister(kNoRegister)
-      : locations->GetTemp(0).AsRegister<CpuRegister>();
-
+  Location temp_loc = locations->GetTemp(0);
+  CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
-  SlowPathCode* slow_path = nullptr;
 
-  if (!locations->WillCall()) {
-    slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(
-        instruction, !locations->CanCall());
-    codegen_->AddSlowPath(slow_path);
-  }
+  TypeCheckKind type_check_kind = instruction->GetTypeCheckKind();
+  bool is_type_check_slow_path_fatal =
+      (type_check_kind == TypeCheckKind::kExactCheck ||
+       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+      !instruction->CanThrowIntoCatchBlock();
+  SlowPathCode* type_check_slow_path =
+      new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(instruction,
+                                                           is_type_check_slow_path_fatal);
+  codegen_->AddSlowPath(type_check_slow_path);
 
   NearLabel done;
   // Avoid null check if we know obj is not null.
@@ -5179,15 +5582,11 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
     __ j(kEqual, &done);
   }
 
-  if (locations->WillCall()) {
-    __ movl(obj, Address(obj, class_offset));
-    __ MaybeUnpoisonHeapReference(obj);
-  } else {
-    __ movl(temp, Address(obj, class_offset));
-    __ MaybeUnpoisonHeapReference(temp);
-  }
+  // /* HeapReference<Class> */ temp = obj->klass_
+  __ movl(temp, Address(obj, class_offset));
+  codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
 
-  switch (instruction->GetTypeCheckKind()) {
+  switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
       if (cls.IsRegister()) {
@@ -5198,19 +5597,44 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
       }
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kAbstractClassCheck: {
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
-      NearLabel loop;
+      NearLabel loop, compare_classes;
       __ Bind(&loop);
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ movl(temp, Address(temp, super_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // to the `compare_classes` label to compare it with the checked
+      // class.
       __ testl(temp, temp);
-      // Jump to the slow path to throw the exception.
-      __ j(kEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, &compare_classes);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&compare_classes);
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
       } else {
@@ -5220,6 +5644,7 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
       __ j(kNotEqual, &loop);
       break;
     }
+
     case TypeCheckKind::kClassHierarchyCheck: {
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -5231,16 +5656,39 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
         __ cmpl(temp, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kEqual, &done);
+
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->super_class_
       __ movl(temp, Address(temp, super_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset);
+
+      // If the class reference currently in `temp` is not null, jump
+      // back to the beginning of the loop.
       __ testl(temp, temp);
       __ j(kNotEqual, &loop);
-      // Jump to the slow path to throw the exception.
-      __ jmp(slow_path->GetEntryLabel());
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kArrayObjectCheck: {
       // Do an exact check.
+      NearLabel check_non_primitive_component_type;
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
       } else {
@@ -5248,29 +5696,67 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
         __ cmpl(temp, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kEqual, &done);
-      // Otherwise, we need to check that the object's class is a non primitive array.
+
+      // Otherwise, we need to check that the object's class is a non-primitive array.
+      Location temp2_loc =
+          kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation();
+      if (kEmitCompilerReadBarrier) {
+        // Save the value of `temp` into `temp2` before overwriting it
+        // in the following move operation, as we will need it for the
+        // read barrier below.
+        CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
+        __ movl(temp2, temp);
+      }
+      // /* HeapReference<Class> */ temp = temp->component_type_
       __ movl(temp, Address(temp, component_offset));
-      __ MaybeUnpoisonHeapReference(temp);
+      codegen_->MaybeGenerateReadBarrier(
+          instruction, temp_loc, temp_loc, temp2_loc, component_offset);
+
+      // If the component type is not null (i.e. the object is indeed
+      // an array), jump to label `check_non_primitive_component_type`
+      // to further check that this component type is not a primitive
+      // type.
       __ testl(temp, temp);
-      __ j(kEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, &check_non_primitive_component_type);
+      // Otherwise, jump to the slow path to throw the exception.
+      //
+      // But first, move the object's class back into `temp` before
+      // going into the slow path, as it has been overwritten in the
+      // meantime.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
+
+      __ Bind(&check_non_primitive_component_type);
       __ cmpw(Address(temp, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kEqual, &done);
+      // Same comment as above regarding `temp` and the slow path.
+      // /* HeapReference<Class> */ temp = obj->klass_
+      __ movl(temp, Address(obj, class_offset));
+      codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset);
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
+
     case TypeCheckKind::kUnresolvedCheck:
     case TypeCheckKind::kInterfaceCheck:
-    default:
-      codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast),
-                              instruction,
-                              instruction->GetDexPc(),
-                              nullptr);
+      // We always go into the type check slow path for the unresolved &
+      // interface check cases.
+      //
+      // We cannot directly call the CheckCast runtime entry point
+      // without resorting to a type checking slow path here (i.e. by
+      // calling InvokeRuntime directly), as it would require
+      // assigning fixed registers to the inputs of this HCheckCast
+      // instruction (following the runtime calling convention), which
+      // might be clobbered by the potential first read barrier
+      // emission at the beginning of this method.
+      __ jmp(type_check_slow_path->GetEntryLabel());
       break;
   }
   __ Bind(&done);
 
-  if (slow_path != nullptr) {
-    __ Bind(slow_path->GetExitLabel());
-  }
+  __ Bind(type_check_slow_path->GetExitLabel());
 }
 
 void LocationsBuilderX86_64::VisitMonitorOperation(HMonitorOperation* instruction) {
@@ -5403,6 +5889,82 @@ void InstructionCodeGeneratorX86_64::HandleBitwiseOperation(HBinaryOperation* in
   }
 }
 
+void CodeGeneratorX86_64::GenerateReadBarrier(HInstruction* instruction,
+                                              Location out,
+                                              Location ref,
+                                              Location obj,
+                                              uint32_t offset,
+                                              Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathX86_64(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  // TODO: When read barrier has a fast path, add it here.
+  /* Currently the read barrier call is inserted after the original load.
+   * However, if we have a fast path, we need to perform the load of obj.LockWord *before* the
+   * original load. This load-load ordering is required by the read barrier.
+   * The fast path/slow path (for Baker's algorithm) should look like:
+   *
+   * bool isGray = obj.LockWord & kReadBarrierMask;
+   * lfence;  // load fence or artificial data dependence to prevent load-load reordering
+   * ref = obj.field;    // this is the original load
+   * if (isGray) {
+   *   ref = Mark(ref);  // ideally the slow path just does Mark(ref)
+   * }
+   */
+
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorX86_64::MaybeGenerateReadBarrier(HInstruction* instruction,
+                                                   Location out,
+                                                   Location ref,
+                                                   Location obj,
+                                                   uint32_t offset,
+                                                   Location index) {
+  if (kEmitCompilerReadBarrier) {
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrier(instruction, out, ref, obj, offset, index);
+  } else if (kPoisonHeapReferences) {
+    __ UnpoisonHeapReference(out.AsRegister<CpuRegister>());
+  }
+}
+
+void CodeGeneratorX86_64::GenerateReadBarrierForRoot(HInstruction* instruction,
+                                                     Location out,
+                                                     Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special for this here.
+  SlowPathCode* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathX86_64(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  // TODO: Implement a fast path for ReadBarrierForRoot, performing
+  // the following operation (for Baker's algorithm):
+  //
+  //   if (thread.tls32_.is_gc_marking) {
+  //     root = Mark(root);
+  //   }
+
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 void LocationsBuilderX86_64::VisitBoundType(HBoundType* instruction ATTRIBUTE_UNUSED) {
   // Nothing to do, this should be removed during prepare for register allocator.
   LOG(FATAL) << "Unreachable";
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 7a52473408..145b1f33b4 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -217,14 +217,12 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor {
   void PushOntoFPStack(Location source, uint32_t temp_offset,
                        uint32_t stack_adjustment, bool is_float);
   void GenerateTestAndBranch(HInstruction* instruction,
+                             size_t condition_input_index,
                              Label* true_target,
-                             Label* false_target,
-                             Label* always_true_target);
-  void GenerateCompareTestAndBranch(HIf* if_inst,
-                                    HCondition* condition,
+                             Label* false_target);
+  void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
-                                    Label* false_target,
-                                    Label* always_true_target);
+                                    Label* false_target);
   void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
@@ -352,6 +350,51 @@ class CodeGeneratorX86_64 : public CodeGenerator {
     return isa_features_;
   }
 
+  // Generate a read barrier for a heap reference within `instruction`.
+  //
+  // A read barrier for an object reference read from the heap is
+  // implemented as a call to the artReadBarrierSlow runtime entry
+  // point, which is passed the values in locations `ref`, `obj`, and
+  // `offset`:
+  //
+  //   mirror::Object* artReadBarrierSlow(mirror::Object* ref,
+  //                                      mirror::Object* obj,
+  //                                      uint32_t offset);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierSlow.
+  //
+  // When `index` is provided (i.e., when it is different from
+  // Location::NoLocation()), the offset value passed to
+  // artReadBarrierSlow is adjusted to take `index` into account.
+  void GenerateReadBarrier(HInstruction* instruction,
+                           Location out,
+                           Location ref,
+                           Location obj,
+                           uint32_t offset,
+                           Location index = Location::NoLocation());
+
+  // If read barriers are enabled, generate a read barrier for a heap reference.
+  // Otherwise, if heap poisoning is enabled, unpoison the reference in `out`.
+  void MaybeGenerateReadBarrier(HInstruction* instruction,
+                                Location out,
+                                Location ref,
+                                Location obj,
+                                uint32_t offset,
+                                Location index = Location::NoLocation());
+
+  // Generate a read barrier for a GC root within `instruction`.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root);
+
   int ConstantAreaStart() const {
     return constant_area_start_;
   }
@@ -401,7 +444,7 @@ class CodeGeneratorX86_64 : public CodeGenerator {
   ArenaDeque<MethodPatchInfo<Label>> method_patches_;
   ArenaDeque<MethodPatchInfo<Label>> relative_call_patches_;
   // PC-relative DexCache access info.
-  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_rel_dex_cache_patches_;
+  ArenaDeque<PcRelativeDexCacheAccessInfo> pc_relative_dex_cache_patches_;
 
   // When we don't know the proper offset for the value, we use kDummy32BitOffset.
   // We will fix this up in the linker later to have the right value.
diff --git a/compiler/optimizing/common_dominator.h b/compiler/optimizing/common_dominator.h
new file mode 100644
index 0000000000..b459d24d7c
--- /dev/null
+++ b/compiler/optimizing/common_dominator.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_COMMON_DOMINATOR_H_
+#define ART_COMPILER_OPTIMIZING_COMMON_DOMINATOR_H_
+
+#include "nodes.h"
+
+namespace art {
+
+// Helper class for finding common dominators of two or more blocks in a graph.
+// The domination information of a graph must not be modified while there is
+// a CommonDominator object as its internal state could become invalid.
+class CommonDominator {
+ public:
+  // Convenience function to find the common dominator of 2 blocks.
+  static HBasicBlock* ForPair(HBasicBlock* block1, HBasicBlock* block2) {
+    CommonDominator finder(block1);
+    finder.Update(block2);
+    return finder.Get();
+  }
+
+  // Create a finder starting with a given block.
+  explicit CommonDominator(HBasicBlock* block)
+      : dominator_(block), chain_length_(ChainLength(block)) {
+    DCHECK(block != nullptr);
+  }
+
+  // Update the common dominator with another block.
+  void Update(HBasicBlock* block) {
+    DCHECK(block != nullptr);
+    HBasicBlock* block2 = dominator_;
+    DCHECK(block2 != nullptr);
+    if (block == block2) {
+      return;
+    }
+    size_t chain_length = ChainLength(block);
+    size_t chain_length2 = chain_length_;
+    // Equalize the chain lengths.
+    for ( ; chain_length > chain_length2; --chain_length) {
+      block = block->GetDominator();
+      DCHECK(block != nullptr);
+    }
+    for ( ; chain_length2 > chain_length; --chain_length2) {
+      block2 = block2->GetDominator();
+      DCHECK(block2 != nullptr);
+    }
+    // Now run up the chain until we hit the common dominator.
+    while (block != block2) {
+      --chain_length;
+      block = block->GetDominator();
+      DCHECK(block != nullptr);
+      block2 = block2->GetDominator();
+      DCHECK(block2 != nullptr);
+    }
+    dominator_ = block;
+    chain_length_ = chain_length;
+  }
+
+  HBasicBlock* Get() const {
+    return dominator_;
+  }
+
+ private:
+  static size_t ChainLength(HBasicBlock* block) {
+    size_t result = 0;
+    while (block != nullptr) {
+      ++result;
+      block = block->GetDominator();
+    }
+    return result;
+  }
+
+  HBasicBlock* dominator_;
+  size_t chain_length_;
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_COMMON_DOMINATOR_H_
diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc
index 9754043f32..02e5dab3d4 100644
--- a/compiler/optimizing/dead_code_elimination.cc
+++ b/compiler/optimizing/dead_code_elimination.cc
@@ -123,20 +123,21 @@ void HDeadCodeElimination::RemoveDeadBlocks() {
   }
 
   // If we removed at least one block, we need to recompute the full
-  // dominator tree.
+  // dominator tree and try block membership.
   if (removed_one_or_more_blocks) {
     graph_->ClearDominanceInformation();
     graph_->ComputeDominanceInformation();
+    graph_->ComputeTryBlockInformation();
   }
 
   // Connect successive blocks created by dead branches. Order does not matter.
   for (HReversePostOrderIterator it(*graph_); !it.Done();) {
     HBasicBlock* block  = it.Current();
-    if (block->IsEntryBlock() || block->GetSuccessors().size() != 1u) {
+    if (block->IsEntryBlock() || !block->GetLastInstruction()->IsGoto()) {
       it.Advance();
       continue;
     }
-    HBasicBlock* successor = block->GetSuccessors()[0];
+    HBasicBlock* successor = block->GetSingleSuccessor();
     if (successor->IsExitBlock() || successor->GetPredecessors().size() != 1u) {
       it.Advance();
       continue;
@@ -176,10 +177,7 @@ void HDeadCodeElimination::RemoveDeadInstructions() {
 }
 
 void HDeadCodeElimination::Run() {
-  if (!graph_->HasTryCatch()) {
-    // TODO: Update dead block elimination and enable for try/catch.
-    RemoveDeadBlocks();
-  }
+  RemoveDeadBlocks();
   SsaRedundantPhiElimination(graph_).Run();
   RemoveDeadInstructions();
 }
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 3de96b5d84..5814d7556f 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -163,12 +163,12 @@ void GraphChecker::VisitBoundsCheck(HBoundsCheck* check) {
 }
 
 void GraphChecker::VisitTryBoundary(HTryBoundary* try_boundary) {
-  // Ensure that all exception handlers are catch blocks and that handlers
-  // are not listed multiple times.
+  ArrayRef<HBasicBlock* const> handlers = try_boundary->GetExceptionHandlers();
+
+  // Ensure that all exception handlers are catch blocks.
   // Note that a normal-flow successor may be a catch block before CFG
   // simplification. We only test normal-flow successors in SsaChecker.
-  for (HExceptionHandlerIterator it(*try_boundary); !it.Done(); it.Advance()) {
-    HBasicBlock* handler = it.Current();
+  for (HBasicBlock* handler : handlers) {
     if (!handler->IsCatchBlock()) {
       AddError(StringPrintf("Block %d with %s:%d has exceptional successor %d which "
                             "is not a catch block.",
@@ -177,9 +177,13 @@ void GraphChecker::VisitTryBoundary(HTryBoundary* try_boundary) {
                             try_boundary->GetId(),
                             handler->GetBlockId()));
     }
-    if (current_block_->HasSuccessor(handler, it.CurrentSuccessorIndex() + 1)) {
-      AddError(StringPrintf("Exception handler block %d of %s:%d is listed multiple times.",
-                            handler->GetBlockId(),
+  }
+
+  // Ensure that handlers are not listed multiple times.
+  for (size_t i = 0, e = handlers.size(); i < e; ++i) {
+    if (ContainsElement(handlers, handlers[i], i + 1)) {
+        AddError(StringPrintf("Exception handler block %d of %s:%d is listed multiple times.",
+                            handlers[i]->GetBlockId(),
                             try_boundary->DebugName(),
                             try_boundary->GetId()));
     }
@@ -188,6 +192,21 @@ void GraphChecker::VisitTryBoundary(HTryBoundary* try_boundary) {
   VisitInstruction(try_boundary);
 }
 
+void GraphChecker::VisitLoadException(HLoadException* load) {
+  // Ensure that LoadException is the first instruction in a catch block.
+  if (!load->GetBlock()->IsCatchBlock()) {
+    AddError(StringPrintf("%s:%d is in a non-catch block %d.",
+                          load->DebugName(),
+                          load->GetId(),
+                          load->GetBlock()->GetBlockId()));
+  } else if (load->GetBlock()->GetFirstInstruction() != load) {
+    AddError(StringPrintf("%s:%d is not the first instruction in catch block %d.",
+                          load->DebugName(),
+                          load->GetId(),
+                          load->GetBlock()->GetBlockId()));
+  }
+}
+
 void GraphChecker::VisitInstruction(HInstruction* instruction) {
   if (seen_ids_.IsBitSet(instruction->GetId())) {
     AddError(StringPrintf("Instruction id %d is duplicate in graph.",
@@ -242,10 +261,11 @@ void GraphChecker::VisitInstruction(HInstruction* instruction) {
     }
     size_t use_index = use_it.Current()->GetIndex();
     if ((use_index >= use->InputCount()) || (use->InputAt(use_index) != instruction)) {
-      AddError(StringPrintf("User %s:%d of instruction %d has a wrong "
+      AddError(StringPrintf("User %s:%d of instruction %s:%d has a wrong "
                             "UseListNode index.",
                             use->DebugName(),
                             use->GetId(),
+                            instruction->DebugName(),
                             instruction->GetId()));
     }
   }
@@ -355,17 +375,14 @@ void SSAChecker::VisitBasicBlock(HBasicBlock* block) {
 
   // Ensure that catch blocks are not normal successors, and normal blocks are
   // never exceptional successors.
-  const size_t num_normal_successors = block->NumberOfNormalSuccessors();
-  for (size_t j = 0; j < num_normal_successors; ++j) {
-    HBasicBlock* successor = block->GetSuccessors()[j];
+  for (HBasicBlock* successor : block->GetNormalSuccessors()) {
     if (successor->IsCatchBlock()) {
       AddError(StringPrintf("Catch block %d is a normal successor of block %d.",
                             successor->GetBlockId(),
                             block->GetBlockId()));
     }
   }
-  for (size_t j = num_normal_successors, e = block->GetSuccessors().size(); j < e; ++j) {
-    HBasicBlock* successor = block->GetSuccessors()[j];
+  for (HBasicBlock* successor : block->GetExceptionalSuccessors()) {
     if (!successor->IsCatchBlock()) {
       AddError(StringPrintf("Normal block %d is an exceptional successor of block %d.",
                             successor->GetBlockId(),
@@ -377,10 +394,14 @@ void SSAChecker::VisitBasicBlock(HBasicBlock* block) {
   // block with multiple successors to a block with multiple
   // predecessors). Exceptional edges are synthesized and hence
   // not accounted for.
-  if (block->NumberOfNormalSuccessors() > 1) {
-    for (size_t j = 0, e = block->NumberOfNormalSuccessors(); j < e; ++j) {
-      HBasicBlock* successor = block->GetSuccessors()[j];
-      if (successor->GetPredecessors().size() > 1) {
+  if (block->GetSuccessors().size() > 1) {
+    for (HBasicBlock* successor : block->GetNormalSuccessors()) {
+      if (successor->IsExitBlock() &&
+          block->IsSingleTryBoundary() &&
+          block->GetPredecessors().size() == 1u &&
+          block->GetSinglePredecessor()->GetLastInstruction()->IsThrow()) {
+        // Allowed critical edge Throw->TryBoundary->Exit.
+      } else if (successor->GetPredecessors().size() > 1) {
         AddError(StringPrintf("Critical edge between blocks %d and %d.",
                               block->GetBlockId(),
                               successor->GetBlockId()));
@@ -445,12 +466,18 @@ void SSAChecker::CheckLoop(HBasicBlock* loop_header) {
   int id = loop_header->GetBlockId();
   HLoopInformation* loop_information = loop_header->GetLoopInformation();
 
-  // Ensure the pre-header block is first in the list of
-  // predecessors of a loop header.
+  // Ensure the pre-header block is first in the list of predecessors of a loop
+  // header and that the header block is its only successor.
   if (!loop_header->IsLoopPreHeaderFirstPredecessor()) {
     AddError(StringPrintf(
         "Loop pre-header is not the first predecessor of the loop header %d.",
         id));
+  } else if (loop_information->GetPreHeader()->GetSuccessors().size() != 1) {
+    AddError(StringPrintf(
+        "Loop pre-header %d of loop defined by header %d has %zu successors.",
+        loop_information->GetPreHeader()->GetBlockId(),
+        id,
+        loop_information->GetPreHeader()->GetSuccessors().size()));
   }
 
   // Ensure the loop header has only one incoming branch and the remaining
@@ -493,6 +520,13 @@ void SSAChecker::CheckLoop(HBasicBlock* loop_header) {
             "Loop defined by header %d has an invalid back edge %d.",
             id,
             back_edge_id));
+      } else if (back_edge->GetLoopInformation() != loop_information) {
+        AddError(StringPrintf(
+            "Back edge %d of loop defined by header %d belongs to nested loop "
+            "with header %d.",
+            back_edge_id,
+            id,
+            back_edge->GetLoopInformation()->GetHeader()->GetBlockId()));
       }
     }
   }
@@ -531,10 +565,14 @@ void SSAChecker::VisitInstruction(HInstruction* instruction) {
        !use_it.Done(); use_it.Advance()) {
     HInstruction* use = use_it.Current()->GetUser();
     if (!use->IsPhi() && !instruction->StrictlyDominates(use)) {
-      AddError(StringPrintf("Instruction %d in block %d does not dominate "
-                            "use %d in block %d.",
-                            instruction->GetId(), current_block_->GetBlockId(),
-                            use->GetId(), use->GetBlock()->GetBlockId()));
+      AddError(StringPrintf("Instruction %s:%d in block %d does not dominate "
+                            "use %s:%d in block %d.",
+                            instruction->DebugName(),
+                            instruction->GetId(),
+                            current_block_->GetBlockId(),
+                            use->DebugName(),
+                            use->GetId(),
+                            use->GetBlock()->GetBlockId()));
     }
   }
 
diff --git a/compiler/optimizing/graph_checker.h b/compiler/optimizing/graph_checker.h
index abf3659d91..d5ddbabc8c 100644
--- a/compiler/optimizing/graph_checker.h
+++ b/compiler/optimizing/graph_checker.h
@@ -50,6 +50,9 @@ class GraphChecker : public HGraphDelegateVisitor {
   // Check successors of blocks ending in TryBoundary.
   void VisitTryBoundary(HTryBoundary* try_boundary) OVERRIDE;
 
+  // Check that LoadException is the first instruction in a catch block.
+  void VisitLoadException(HLoadException* load) OVERRIDE;
+
   // Check that HCheckCast and HInstanceOf have HLoadClass as second input.
   void VisitCheckCast(HCheckCast* check) OVERRIDE;
   void VisitInstanceOf(HInstanceOf* check) OVERRIDE;
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 4111671a9b..d166d0061f 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -24,6 +24,7 @@
 #include "code_generator.h"
 #include "dead_code_elimination.h"
 #include "disassembler.h"
+#include "inliner.h"
 #include "licm.h"
 #include "nodes.h"
 #include "optimization.h"
@@ -252,8 +253,7 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
   void PrintSuccessors(HBasicBlock* block) {
     AddIndent();
     output_ << "successors";
-    for (size_t i = 0; i < block->NumberOfNormalSuccessors(); ++i) {
-      HBasicBlock* successor = block->GetSuccessors()[i];
+    for (HBasicBlock* successor : block->GetNormalSuccessors()) {
       output_ << " \"B" << successor->GetBlockId() << "\" ";
     }
     output_<< std::endl;
@@ -262,8 +262,7 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
   void PrintExceptionHandlers(HBasicBlock* block) {
     AddIndent();
     output_ << "xhandlers";
-    for (size_t i = block->NumberOfNormalSuccessors(); i < block->GetSuccessors().size(); ++i) {
-      HBasicBlock* handler = block->GetSuccessors()[i];
+    for (HBasicBlock* handler : block->GetExceptionalSuccessors()) {
       output_ << " \"B" << handler->GetBlockId() << "\" ";
     }
     if (block->IsExitBlock() &&
@@ -398,6 +397,9 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
                                       << invoke->IsRecursive()
                                       << std::noboolalpha;
     StartAttributeStream("intrinsic") << invoke->GetIntrinsic();
+    if (invoke->IsStatic()) {
+      StartAttributeStream("clinit_check") << invoke->GetClinitCheckRequirement();
+    }
   }
 
   void VisitUnresolvedInstanceFieldGet(HUnresolvedInstanceFieldGet* field_access) OVERRIDE {
@@ -424,11 +426,6 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
     return strcmp(pass_name_, name) == 0;
   }
 
-  bool IsReferenceTypePropagationPass() {
-    return strstr(pass_name_, ReferenceTypePropagation::kReferenceTypePropagationPassName)
-        != nullptr;
-  }
-
   void PrintInstruction(HInstruction* instruction) {
     output_ << instruction->DebugName();
     if (instruction->InputCount() > 0) {
@@ -492,7 +489,8 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
       } else {
         StartAttributeStream("loop") << "B" << info->GetHeader()->GetBlockId();
       }
-    } else if (IsReferenceTypePropagationPass()
+    } else if ((IsPass(ReferenceTypePropagation::kReferenceTypePropagationPassName)
+        || IsPass(HInliner::kInlinerPassName))
         && (instruction->GetType() == Primitive::kPrimNot)) {
       ReferenceTypeInfo info = instruction->IsLoadClass()
         ? instruction->AsLoadClass()->GetLoadedClassRTI()
@@ -505,6 +503,18 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
         StartAttributeStream("exact") << std::boolalpha << info.IsExact() << std::noboolalpha;
       } else if (instruction->IsLoadClass()) {
         StartAttributeStream("klass") << "unresolved";
+      } else if (instruction->IsNullConstant()) {
+        // The NullConstant may be added to the graph during other passes that happen between
+        // ReferenceTypePropagation and Inliner (e.g. InstructionSimplifier). If the inliner
+        // doesn't run or doesn't inline anything, the NullConstant remains untyped.
+        // So we should check NullConstants for validity only after reference type propagation.
+        //
+        // Note: The infrastructure to properly type NullConstants everywhere is too complex to add
+        // for the benefits.
+        StartAttributeStream("klass") << "not_set";
+        DCHECK(!is_after_pass_
+            || !IsPass(ReferenceTypePropagation::kReferenceTypePropagationPassName))
+            << " Expected a valid rti after reference type propagation";
       } else {
         DCHECK(!is_after_pass_)
             << "Expected a valid rti after reference type propagation";
diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc
index b7262f6b29..5de94f43c9 100644
--- a/compiler/optimizing/induction_var_analysis_test.cc
+++ b/compiler/optimizing/induction_var_analysis_test.cc
@@ -69,10 +69,13 @@ class InductionVarAnalysisTest : public testing::Test {
     entry_ = new (&allocator_) HBasicBlock(graph_);
     graph_->AddBlock(entry_);
     BuildForLoop(0, n);
+    return_ = new (&allocator_) HBasicBlock(graph_);
+    graph_->AddBlock(return_);
     exit_ = new (&allocator_) HBasicBlock(graph_);
     graph_->AddBlock(exit_);
     entry_->AddSuccessor(loop_preheader_[0]);
-    loop_header_[0]->AddSuccessor(exit_);
+    loop_header_[0]->AddSuccessor(return_);
+    return_->AddSuccessor(exit_);
     graph_->SetEntryBlock(entry_);
     graph_->SetExitBlock(exit_);
 
@@ -91,6 +94,7 @@ class InductionVarAnalysisTest : public testing::Test {
     entry_->AddInstruction(new (&allocator_) HStoreLocal(tmp_, constant100_));
     dum_ = new (&allocator_) HLocal(n + 2);
     entry_->AddInstruction(dum_);
+    return_->AddInstruction(new (&allocator_) HReturnVoid());
     exit_->AddInstruction(new (&allocator_) HExit());
 
     // Provide loop instructions.
@@ -177,6 +181,7 @@ class InductionVarAnalysisTest : public testing::Test {
 
   // Fixed basic blocks and instructions.
   HBasicBlock* entry_;
+  HBasicBlock* return_;
   HBasicBlock* exit_;
   HInstruction* parameter_;  // "this"
   HInstruction* constant0_;
diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc
index fda5153d43..c2ba157ed8 100644
--- a/compiler/optimizing/induction_var_range_test.cc
+++ b/compiler/optimizing/induction_var_range_test.cc
@@ -70,11 +70,14 @@ class InductionVarRangeTest : public testing::Test {
     graph_->AddBlock(loop_header);
     HBasicBlock* loop_body = new (&allocator_) HBasicBlock(graph_);
     graph_->AddBlock(loop_body);
+    HBasicBlock* return_block = new (&allocator_) HBasicBlock(graph_);
+    graph_->AddBlock(return_block);
     entry_block_->AddSuccessor(loop_preheader_);
     loop_preheader_->AddSuccessor(loop_header);
     loop_header->AddSuccessor(loop_body);
-    loop_header->AddSuccessor(exit_block_);
+    loop_header->AddSuccessor(return_block);
     loop_body->AddSuccessor(loop_header);
+    return_block->AddSuccessor(exit_block_);
     // Instructions.
     HLocal* induc = new (&allocator_) HLocal(0);
     entry_block_->AddInstruction(induc);
@@ -96,7 +99,8 @@ class InductionVarRangeTest : public testing::Test {
     loop_body->AddInstruction(increment_);
     loop_body->AddInstruction(new (&allocator_) HStoreLocal(induc, increment_));  // i += s
     loop_body->AddInstruction(new (&allocator_) HGoto());
-    exit_block_->AddInstruction(new (&allocator_) HReturnVoid());
+    return_block->AddInstruction(new (&allocator_) HReturnVoid());
+    exit_block_->AddInstruction(new (&allocator_) HExit());
   }
 
   /** Performs induction variable analysis. */
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 353881e47a..0363f203b2 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -148,7 +148,7 @@ static ArtMethod* FindVirtualOrInterfaceTarget(HInvoke* invoke, ArtMethod* resol
     // the target method. Since we check above the exact type of the receiver,
     // the only reason this can happen is an IncompatibleClassChangeError.
     return nullptr;
-  } else if (resolved_method->IsAbstract()) {
+  } else if (!resolved_method->IsInvokable()) {
     // The information we had on the receiver was not enough to find
     // the target method. Since we check above the exact type of the receiver,
     // the only reason this can happen is an IncompatibleClassChangeError.
@@ -406,8 +406,8 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method,
     &type_propagation,
     &sharpening,
     &simplify,
-    &dce,
     &fold,
+    &dce,
   };
 
   for (size_t i = 0; i < arraysize(optimizations); ++i) {
@@ -534,6 +534,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method,
             ReferenceTypeInfo::Create(obj_handle, false /* is_exact */));
   }
 
+  // Check the integrity of reference types and run another type propagation if needed.
   if ((return_replacement != nullptr)
       && (return_replacement->GetType() == Primitive::kPrimNot)) {
     if (!return_replacement->GetReferenceTypeInfo().IsValid()) {
@@ -544,10 +545,20 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method,
       DCHECK(return_replacement->IsPhi());
       size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
       ReferenceTypeInfo::TypeHandle return_handle =
-        handles_->NewHandle(resolved_method->GetReturnType(true /* resolve */, pointer_size));
+          handles_->NewHandle(resolved_method->GetReturnType(true /* resolve */, pointer_size));
       return_replacement->SetReferenceTypeInfo(ReferenceTypeInfo::Create(
          return_handle, return_handle->CannotBeAssignedFromOtherTypes() /* is_exact */));
     }
+
+    // If the return type is a refinement of the declared type, run the type propagation again.
+    ReferenceTypeInfo return_rti = return_replacement->GetReferenceTypeInfo();
+    ReferenceTypeInfo invoke_rti = invoke_instruction->GetReferenceTypeInfo();
+    if (invoke_rti.IsStrictSupertypeOf(return_rti)
+        || (return_rti.IsExact() && !invoke_rti.IsExact())
+        || !return_replacement->CanBeNull()) {
+      ReferenceTypePropagation rtp_fixup(graph_, handles_);
+      rtp_fixup.Run();
+    }
   }
 
   return true;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index b97dc1a511..9ad2dd1c8e 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -796,6 +796,34 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) {
       HShl* shl = new(allocator) HShl(type, input_other, shift);
       block->ReplaceAndRemoveInstructionWith(instruction, shl);
       RecordSimplification();
+    } else if (IsPowerOfTwo(factor - 1)) {
+      // Transform code looking like
+      //    MUL dst, src, (2^n + 1)
+      // into
+      //    SHL tmp, src, n
+      //    ADD dst, src, tmp
+      HShl* shl = new (allocator) HShl(type,
+                                       input_other,
+                                       GetGraph()->GetIntConstant(WhichPowerOf2(factor - 1)));
+      HAdd* add = new (allocator) HAdd(type, input_other, shl);
+
+      block->InsertInstructionBefore(shl, instruction);
+      block->ReplaceAndRemoveInstructionWith(instruction, add);
+      RecordSimplification();
+    } else if (IsPowerOfTwo(factor + 1)) {
+      // Transform code looking like
+      //    MUL dst, src, (2^n - 1)
+      // into
+      //    SHL tmp, src, n
+      //    SUB dst, tmp, src
+      HShl* shl = new (allocator) HShl(type,
+                                       input_other,
+                                       GetGraph()->GetIntConstant(WhichPowerOf2(factor + 1)));
+      HSub* sub = new (allocator) HSub(type, shl, input_other);
+
+      block->InsertInstructionBefore(shl, instruction);
+      block->ReplaceAndRemoveInstructionWith(instruction, sub);
+      RecordSimplification();
     }
   }
 }
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 0a5acc3e64..d2017da221 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -44,7 +44,23 @@ using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitor
 bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
   LocationSummary* res = invoke->GetLocations();
-  return res != nullptr && res->Intrinsified();
+  if (res == nullptr) {
+    return false;
+  }
+  if (kEmitCompilerReadBarrier && res->CanCall()) {
+    // Generating an intrinsic for this HInvoke may produce an
+    // IntrinsicSlowPathARM slow path.  Currently this approach
+    // does not work when using read barriers, as the emitted
+    // calling sequence will make use of another slow path
+    // (ReadBarrierForRootSlowPathARM for HInvokeStaticOrDirect,
+    // ReadBarrierSlowPathARM for HInvokeVirtual).  So we bail
+    // out in this case.
+    //
+    // TODO: Find a way to have intrinsics work with read barriers.
+    invoke->SetLocations(nullptr);
+    return false;
+  }
+  return res->Intrinsified();
 }
 
 #define __ assembler->
@@ -662,20 +678,23 @@ static void GenUnsafeGet(HInvoke* invoke,
          (type == Primitive::kPrimLong) ||
          (type == Primitive::kPrimNot));
   ArmAssembler* assembler = codegen->GetAssembler();
-  Register base = locations->InAt(1).AsRegister<Register>();           // Object pointer.
-  Register offset = locations->InAt(2).AsRegisterPairLow<Register>();  // Long offset, lo part only.
+  Location base_loc = locations->InAt(1);
+  Register base = base_loc.AsRegister<Register>();             // Object pointer.
+  Location offset_loc = locations->InAt(2);
+  Register offset = offset_loc.AsRegisterPairLow<Register>();  // Long offset, lo part only.
+  Location trg_loc = locations->Out();
 
   if (type == Primitive::kPrimLong) {
-    Register trg_lo = locations->Out().AsRegisterPairLow<Register>();
+    Register trg_lo = trg_loc.AsRegisterPairLow<Register>();
     __ add(IP, base, ShifterOperand(offset));
     if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) {
-      Register trg_hi = locations->Out().AsRegisterPairHigh<Register>();
+      Register trg_hi = trg_loc.AsRegisterPairHigh<Register>();
       __ ldrexd(trg_lo, trg_hi, IP);
     } else {
       __ ldrd(trg_lo, Address(IP));
     }
   } else {
-    Register trg = locations->Out().AsRegister<Register>();
+    Register trg = trg_loc.AsRegister<Register>();
     __ ldr(trg, Address(base, offset));
   }
 
@@ -684,14 +703,18 @@ static void GenUnsafeGet(HInvoke* invoke,
   }
 
   if (type == Primitive::kPrimNot) {
-    Register trg = locations->Out().AsRegister<Register>();
-    __ MaybeUnpoisonHeapReference(trg);
+    codegen->MaybeGenerateReadBarrier(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc);
   }
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  bool can_call = kEmitCompilerReadBarrier &&
+      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
+       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   LocationSummary* locations = new (arena) LocationSummary(invoke,
-                                                           LocationSummary::kNoCall,
+                                                           can_call ?
+                                                               LocationSummary::kCallOnSlowPath :
+                                                               LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -936,6 +959,7 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat
   __ Bind(&loop_head);
 
   __ ldrex(tmp_lo, tmp_ptr);
+  // TODO: Do we need a read barrier here when `type == Primitive::kPrimNot`?
 
   __ subs(tmp_lo, tmp_lo, ShifterOperand(expected_lo));
 
@@ -964,7 +988,11 @@ void IntrinsicLocationsBuilderARM::VisitUnsafeCASObject(HInvoke* invoke) {
   // The UnsafeCASObject intrinsic does not always work when heap
   // poisoning is enabled (it breaks run-test 004-UnsafeTest); turn it
   // off temporarily as a quick fix.
+  //
   // TODO(rpl): Fix it and turn it back on.
+  //
+  // TODO(rpl): Also, we should investigate whether we need a read
+  // barrier in the generated code.
   if (kPoisonHeapReferences) {
     return;
   }
@@ -1400,6 +1428,10 @@ static void CheckPosition(ArmAssembler* assembler,
   }
 }
 
+// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
+// Note that this code path is not used (yet) because we do not
+// intrinsify methods that can go into the IntrinsicSlowPathARM
+// slow path.
 void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) {
   ArmAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index ff843ebb1e..3654159f83 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -1391,6 +1391,108 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringCompareTo(HInvoke* invoke) {
   __ Bind(slow_path->GetExitLabel());
 }
 
+// boolean java.lang.String.equals(Object anObject)
+void IntrinsicLocationsBuilderMIPS64::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister());
+
+  // Temporary registers to store lengths of strings and for calculations.
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorMIPS64::VisitStringEquals(HInvoke* invoke) {
+  Mips64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  GpuRegister str = locations->InAt(0).AsRegister<GpuRegister>();
+  GpuRegister arg = locations->InAt(1).AsRegister<GpuRegister>();
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+
+  GpuRegister temp1 = locations->GetTemp(0).AsRegister<GpuRegister>();
+  GpuRegister temp2 = locations->GetTemp(1).AsRegister<GpuRegister>();
+  GpuRegister temp3 = locations->GetTemp(2).AsRegister<GpuRegister>();
+
+  Label loop;
+  Label end;
+  Label return_true;
+  Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // If the register containing the pointer to "this", and the register
+  // containing the pointer to "anObject" are the same register then
+  // "this", and "anObject" are the same object and we can
+  // short-circuit the logic to a true result.
+  if (str == arg) {
+    __ LoadConst64(out, 1);
+    return;
+  }
+
+  // Check if input is null, return false if it is.
+  __ Beqzc(arg, &return_false);
+
+  // Reference equality check, return true if same reference.
+  __ Beqc(str, arg, &return_true);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ Lw(temp1, str, class_offset);
+  __ Lw(temp2, arg, class_offset);
+  __ Bnec(temp1, temp2, &return_false);
+
+  // Load lengths of this and argument strings.
+  __ Lw(temp1, str, count_offset);
+  __ Lw(temp2, arg, count_offset);
+  // Check if lengths are equal, return false if they're not.
+  __ Bnec(temp1, temp2, &return_false);
+  // Return true if both strings are empty.
+  __ Beqzc(temp1, &return_true);
+
+  // Copy the inputs into scratch registers so the loop does not overwrite them.
+  __ Move(TMP, str);
+  __ Move(temp3, arg);
+
+  // Assertions that must hold in order to compare strings 4 characters at a time.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+  // Loop to compare strings 4 characters at a time starting at the beginning of the string.
+  // Ok to do this because strings are zero-padded to be 8-byte aligned.
+  __ Bind(&loop);
+  __ Ld(out, TMP, value_offset);
+  __ Ld(temp2, temp3, value_offset);
+  __ Bnec(out, temp2, &return_false);
+  __ Daddiu(TMP, TMP, 8);
+  __ Daddiu(temp3, temp3, 8);
+  __ Addiu(temp1, temp1, -4);
+  __ Bgtzc(temp1, &loop);
+
+  // Return true and exit the function.
+  // If loop does not result in returning false, we return true.
+  __ Bind(&return_true);
+  __ LoadConst64(out, 1);
+  __ B(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ LoadConst64(out, 0);
+  __ Bind(&end);
+}
+
 static void GenerateStringIndexOf(HInvoke* invoke,
                                   Mips64Assembler* assembler,
                                   CodeGeneratorMIPS64* codegen,
@@ -1586,8 +1688,6 @@ void IntrinsicCodeGeneratorMIPS64::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSE
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 
-UNIMPLEMENTED_INTRINSIC(StringEquals)
-
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 040bf6a45e..371588fc47 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -55,7 +55,23 @@ ArenaAllocator* IntrinsicCodeGeneratorX86::GetAllocator() {
 bool IntrinsicLocationsBuilderX86::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
   LocationSummary* res = invoke->GetLocations();
-  return res != nullptr && res->Intrinsified();
+  if (res == nullptr) {
+    return false;
+  }
+  if (kEmitCompilerReadBarrier && res->CanCall()) {
+    // Generating an intrinsic for this HInvoke may produce an
+    // IntrinsicSlowPathX86 slow path.  Currently this approach
+    // does not work when using read barriers, as the emitted
+    // calling sequence will make use of another slow path
+    // (ReadBarrierForRootSlowPathX86 for HInvokeStaticOrDirect,
+    // ReadBarrierSlowPathX86 for HInvokeVirtual).  So we bail
+    // out in this case.
+    //
+    // TODO: Find a way to have intrinsics work with read barriers.
+    invoke->SetLocations(nullptr);
+    return false;
+  }
+  return res->Intrinsified();
 }
 
 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) {
@@ -1571,26 +1587,32 @@ void IntrinsicCodeGeneratorX86::VisitThreadCurrentThread(HInvoke* invoke) {
   GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86WordSize>()));
 }
 
-static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
-                         bool is_volatile, X86Assembler* assembler) {
-  Register base = locations->InAt(1).AsRegister<Register>();
-  Register offset = locations->InAt(2).AsRegisterPairLow<Register>();
-  Location output = locations->Out();
+static void GenUnsafeGet(HInvoke* invoke,
+                         Primitive::Type type,
+                         bool is_volatile,
+                         CodeGeneratorX86* codegen) {
+  X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
+  LocationSummary* locations = invoke->GetLocations();
+  Location base_loc = locations->InAt(1);
+  Register base = base_loc.AsRegister<Register>();
+  Location offset_loc = locations->InAt(2);
+  Register offset = offset_loc.AsRegisterPairLow<Register>();
+  Location output_loc = locations->Out();
 
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register output_reg = output.AsRegister<Register>();
-      __ movl(output_reg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+      Register output = output_loc.AsRegister<Register>();
+      __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
       if (type == Primitive::kPrimNot) {
-        __ MaybeUnpoisonHeapReference(output_reg);
+        codegen->MaybeGenerateReadBarrier(invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
       }
       break;
     }
 
     case Primitive::kPrimLong: {
-        Register output_lo = output.AsRegisterPairLow<Register>();
-        Register output_hi = output.AsRegisterPairHigh<Register>();
+        Register output_lo = output_loc.AsRegisterPairLow<Register>();
+        Register output_hi = output_loc.AsRegisterPairHigh<Register>();
         if (is_volatile) {
           // Need to use a XMM to read atomically.
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
@@ -1613,8 +1635,13 @@ static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke,
                                           bool is_long, bool is_volatile) {
+  bool can_call = kEmitCompilerReadBarrier &&
+      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
+       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   LocationSummary* locations = new (arena) LocationSummary(invoke,
-                                                           LocationSummary::kNoCall,
+                                                           can_call ?
+                                                               LocationSummary::kCallOnSlowPath :
+                                                               LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -1653,22 +1680,22 @@ void IntrinsicLocationsBuilderX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke)
 
 
 void IntrinsicCodeGeneratorX86::VisitUnsafeGet(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimInt, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86::VisitUnsafeGetVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimInt, true, codegen_);
 }
 void IntrinsicCodeGeneratorX86::VisitUnsafeGetLong(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimLong, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimLong, true, codegen_);
 }
 void IntrinsicCodeGeneratorX86::VisitUnsafeGetObject(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimNot, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimNot, true, codegen_);
 }
 
 
@@ -1890,13 +1917,18 @@ static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86* code
 
     __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
 
-    // locked cmpxchg has full barrier semantics, and we don't need
+    // LOCK CMPXCHG has full barrier semantics, and we don't need
     // scheduling barriers at this time.
 
     // Convert ZF into the boolean result.
     __ setb(kZero, out.AsRegister<Register>());
     __ movzxb(out.AsRegister<Register>(), out.AsRegister<ByteRegister>());
 
+    // In the case of the `UnsafeCASObject` intrinsic, accessing an
+    // object in the heap with LOCK CMPXCHG does not require a read
+    // barrier, as we do not keep a reference to this heap location.
+    // However, if heap poisoning is enabled, we need to unpoison the
+    // values that were poisoned earlier.
     if (kPoisonHeapReferences) {
       if (base_equals_value) {
         // `value` has been moved to a temporary register, no need to
@@ -1929,8 +1961,8 @@ static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86* code
       LOG(FATAL) << "Unexpected CAS type " << type;
     }
 
-    // locked cmpxchg has full barrier semantics, and we don't need
-    // scheduling barriers at this time.
+    // LOCK CMPXCHG/LOCK CMPXCHG8B have full barrier semantics, and we
+    // don't need scheduling barriers at this time.
 
     // Convert ZF into the boolean result.
     __ setb(kZero, out.AsRegister<Register>());
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index a29f3ef1d1..2d9f01b821 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -50,8 +50,24 @@ ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
 
 bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
-  const LocationSummary* res = invoke->GetLocations();
-  return res != nullptr && res->Intrinsified();
+  LocationSummary* res = invoke->GetLocations();
+  if (res == nullptr) {
+    return false;
+  }
+  if (kEmitCompilerReadBarrier && res->CanCall()) {
+    // Generating an intrinsic for this HInvoke may produce an
+    // IntrinsicSlowPathX86_64 slow path.  Currently this approach
+    // does not work when using read barriers, as the emitted
+    // calling sequence will make use of another slow path
+    // (ReadBarrierForRootSlowPathX86_64 for HInvokeStaticOrDirect,
+    // ReadBarrierSlowPathX86_64 for HInvokeVirtual).  So we bail
+    // out in this case.
+    //
+    // TODO: Find a way to have intrinsics work with read barriers.
+    invoke->SetLocations(nullptr);
+    return false;
+  }
+  return res->Intrinsified();
 }
 
 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
@@ -917,6 +933,10 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
 }
 
+// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
+// Note that this code path is not used (yet) because we do not
+// intrinsify methods that can go into the IntrinsicSlowPathX86_64
+// slow path.
 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -1698,23 +1718,30 @@ void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
   GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(), true));
 }
 
-static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
-                         bool is_volatile ATTRIBUTE_UNUSED, X86_64Assembler* assembler) {
-  CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
-  CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
-  CpuRegister trg = locations->Out().AsRegister<CpuRegister>();
+static void GenUnsafeGet(HInvoke* invoke,
+                         Primitive::Type type,
+                         bool is_volatile ATTRIBUTE_UNUSED,
+                         CodeGeneratorX86_64* codegen) {
+  X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
+  LocationSummary* locations = invoke->GetLocations();
+  Location base_loc = locations->InAt(1);
+  CpuRegister base = base_loc.AsRegister<CpuRegister>();
+  Location offset_loc = locations->InAt(2);
+  CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
+  Location output_loc = locations->Out();
+  CpuRegister output = locations->Out().AsRegister<CpuRegister>();
 
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
-      __ movl(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+      __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
       if (type == Primitive::kPrimNot) {
-        __ MaybeUnpoisonHeapReference(trg);
+        codegen->MaybeGenerateReadBarrier(invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
       }
       break;
 
     case Primitive::kPrimLong:
-      __ movq(trg, Address(base, offset, ScaleFactor::TIMES_1, 0));
+      __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
       break;
 
     default:
@@ -1724,8 +1751,13 @@ static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type,
 }
 
 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  bool can_call = kEmitCompilerReadBarrier &&
+      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
+       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   LocationSummary* locations = new (arena) LocationSummary(invoke,
-                                                           LocationSummary::kNoCall,
+                                                           can_call ?
+                                                               LocationSummary::kCallOnSlowPath :
+                                                               LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
@@ -1754,22 +1786,22 @@ void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invo
 
 
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimInt, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimInt, true, codegen_);
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimLong, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimLong, true, codegen_);
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, false, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimNot, false, codegen_);
 }
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
-  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, true, GetAssembler());
+  GenUnsafeGet(invoke, Primitive::kPrimNot, true, codegen_);
 }
 
 
@@ -1961,13 +1993,18 @@ static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* c
 
     __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), CpuRegister(value_reg));
 
-    // locked cmpxchg has full barrier semantics, and we don't need
+    // LOCK CMPXCHG has full barrier semantics, and we don't need
     // scheduling barriers at this time.
 
     // Convert ZF into the boolean result.
     __ setcc(kZero, out);
     __ movzxb(out, out);
 
+    // In the case of the `UnsafeCASObject` intrinsic, accessing an
+    // object in the heap with LOCK CMPXCHG does not require a read
+    // barrier, as we do not keep a reference to this heap location.
+    // However, if heap poisoning is enabled, we need to unpoison the
+    // values that were poisoned earlier.
     if (kPoisonHeapReferences) {
       if (base_equals_value) {
         // `value_reg` has been moved to a temporary register, no need
@@ -1992,7 +2029,7 @@ static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* c
       LOG(FATAL) << "Unexpected CAS type " << type;
     }
 
-    // locked cmpxchg has full barrier semantics, and we don't need
+    // LOCK CMPXCHG has full barrier semantics, and we don't need
     // scheduling barriers at this time.
 
     // Convert ZF into the boolean result.
diff --git a/compiler/optimizing/licm_test.cc b/compiler/optimizing/licm_test.cc
index 47457dec7d..2bb769a430 100644
--- a/compiler/optimizing/licm_test.cc
+++ b/compiler/optimizing/licm_test.cc
@@ -42,12 +42,14 @@ class LICMTest : public testing::Test {
     loop_preheader_ = new (&allocator_) HBasicBlock(graph_);
     loop_header_ = new (&allocator_) HBasicBlock(graph_);
     loop_body_ = new (&allocator_) HBasicBlock(graph_);
+    return_ = new (&allocator_) HBasicBlock(graph_);
     exit_ = new (&allocator_) HBasicBlock(graph_);
 
     graph_->AddBlock(entry_);
     graph_->AddBlock(loop_preheader_);
     graph_->AddBlock(loop_header_);
     graph_->AddBlock(loop_body_);
+    graph_->AddBlock(return_);
     graph_->AddBlock(exit_);
 
     graph_->SetEntryBlock(entry_);
@@ -57,8 +59,9 @@ class LICMTest : public testing::Test {
     entry_->AddSuccessor(loop_preheader_);
     loop_preheader_->AddSuccessor(loop_header_);
     loop_header_->AddSuccessor(loop_body_);
-    loop_header_->AddSuccessor(exit_);
+    loop_header_->AddSuccessor(return_);
     loop_body_->AddSuccessor(loop_header_);
+    return_->AddSuccessor(exit_);
 
     // Provide boiler-plate instructions.
     parameter_ = new (&allocator_) HParameterValue(graph_->GetDexFile(), 0, 0, Primitive::kPrimNot);
@@ -89,6 +92,7 @@ class LICMTest : public testing::Test {
   HBasicBlock* loop_preheader_;
   HBasicBlock* loop_header_;
   HBasicBlock* loop_body_;
+  HBasicBlock* return_;
   HBasicBlock* exit_;
 
   HInstruction* parameter_;  // "this"
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 6fbb6823d6..5b89cfef5a 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -119,19 +119,10 @@ class HeapLocation : public ArenaObject<kArenaAllocMisc> {
       : ref_info_(ref_info),
         offset_(offset),
         index_(index),
-        declaring_class_def_index_(declaring_class_def_index),
-        may_become_unknown_(true) {
+        declaring_class_def_index_(declaring_class_def_index) {
     DCHECK(ref_info != nullptr);
     DCHECK((offset == kInvalidFieldOffset && index != nullptr) ||
            (offset != kInvalidFieldOffset && index == nullptr));
-
-    if (ref_info->IsSingletonAndNotReturned()) {
-      // We try to track stores to singletons that aren't returned to eliminate the stores
-      // since values in singleton's fields cannot be killed due to aliasing. Those values
-      // can still be killed due to merging values since we don't build phi for merging heap
-      // values. SetMayBecomeUnknown(true) may be called later once such merge becomes possible.
-      may_become_unknown_ = false;
-    }
   }
 
   ReferenceInfo* GetReferenceInfo() const { return ref_info_; }
@@ -148,21 +139,11 @@ class HeapLocation : public ArenaObject<kArenaAllocMisc> {
     return index_ != nullptr;
   }
 
-  // Returns true if this heap location's value may become unknown after it's
-  // set to a value, due to merge of values, or killed due to aliasing.
-  bool MayBecomeUnknown() const {
-    return may_become_unknown_;
-  }
-  void SetMayBecomeUnknown(bool val) {
-    may_become_unknown_ = val;
-  }
-
  private:
   ReferenceInfo* const ref_info_;      // reference for instance/static field or array access.
   const size_t offset_;                // offset of static/instance field.
   HInstruction* const index_;          // index of an array element.
   const int16_t declaring_class_def_index_;  // declaring class's def's dex index.
-  bool may_become_unknown_;            // value may become kUnknownHeapValue.
 
   DISALLOW_COPY_AND_ASSIGN(HeapLocation);
 };
@@ -381,26 +362,13 @@ class HeapLocationCollector : public HGraphVisitor {
     return heap_locations_[heap_location_idx];
   }
 
-  void VisitFieldAccess(HInstruction* field_access,
-                        HInstruction* ref,
-                        const FieldInfo& field_info,
-                        bool is_store) {
+  void VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) {
     if (field_info.IsVolatile()) {
       has_volatile_ = true;
     }
     const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex();
     const size_t offset = field_info.GetFieldOffset().SizeValue();
-    HeapLocation* location = GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
-    // A store of a value may be eliminated if all future loads for that value can be eliminated.
-    // For a value that's stored into a singleton field, the value will not be killed due
-    // to aliasing. However if the value is set in a block that doesn't post dominate the definition,
-    // the value may be killed due to merging later. Before we have post dominating info, we check
-    // if the store is in the same block as the definition just to be conservative.
-    if (is_store &&
-        location->GetReferenceInfo()->IsSingletonAndNotReturned() &&
-        field_access->GetBlock() != ref->GetBlock()) {
-      location->SetMayBecomeUnknown(true);
-    }
+    GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
   }
 
   void VisitArrayAccess(HInstruction* array, HInstruction* index) {
@@ -409,20 +377,20 @@ class HeapLocationCollector : public HGraphVisitor {
   }
 
   void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), false);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
   }
 
   void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), true);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
     has_heap_stores_ = true;
   }
 
   void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), false);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
   }
 
   void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), true);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
     has_heap_stores_ = true;
   }
 
@@ -464,9 +432,14 @@ class HeapLocationCollector : public HGraphVisitor {
 };
 
 // An unknown heap value. Loads with such a value in the heap location cannot be eliminated.
+// A heap location can be set to kUnknownHeapValue when:
+// - it is initialized: every heap location starts out as kUnknownHeapValue.
+// - killed due to aliasing, merging, invocation, or loop side effects.
 static HInstruction* const kUnknownHeapValue =
     reinterpret_cast<HInstruction*>(static_cast<uintptr_t>(-1));
+
 // Default heap value after an allocation.
+// A heap location can be set to that value right after an allocation.
 static HInstruction* const kDefaultHeapValue =
     reinterpret_cast<HInstruction*>(static_cast<uintptr_t>(-2));
 
@@ -484,29 +457,17 @@ class LSEVisitor : public HGraphVisitor {
                                                     kUnknownHeapValue,
                                                     graph->GetArena()->Adapter(kArenaAllocLSE)),
                          graph->GetArena()->Adapter(kArenaAllocLSE)),
-        removed_instructions_(graph->GetArena()->Adapter(kArenaAllocLSE)),
-        substitute_instructions_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        removed_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        substitute_instructions_for_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        possibly_removed_stores_(graph->GetArena()->Adapter(kArenaAllocLSE)),
         singleton_new_instances_(graph->GetArena()->Adapter(kArenaAllocLSE)) {
   }
 
   void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
-    int block_id = block->GetBlockId();
-    ArenaVector<HInstruction*>& heap_values = heap_values_for_[block_id];
+    // Populate the heap_values array for this block.
     // TODO: try to reuse the heap_values array from one predecessor if possible.
     if (block->IsLoopHeader()) {
-      // We do a single pass in reverse post order. For loops, use the side effects as a hint
-      // to see if the heap values should be killed.
-      if (side_effects_.GetLoopEffects(block).DoesAnyWrite()) {
-        // Leave all values as kUnknownHeapValue.
-      } else {
-        // Inherit the values from pre-header.
-        HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
-        ArenaVector<HInstruction*>& pre_header_heap_values =
-            heap_values_for_[pre_header->GetBlockId()];
-        for (size_t i = 0; i < heap_values.size(); i++) {
-          heap_values[i] = pre_header_heap_values[i];
-        }
-      }
+      HandleLoopSideEffects(block);
     } else {
       MergePredecessorValues(block);
     }
@@ -515,23 +476,34 @@ class LSEVisitor : public HGraphVisitor {
 
   // Remove recorded instructions that should be eliminated.
   void RemoveInstructions() {
-    size_t size = removed_instructions_.size();
-    DCHECK_EQ(size, substitute_instructions_.size());
+    size_t size = removed_loads_.size();
+    DCHECK_EQ(size, substitute_instructions_for_loads_.size());
     for (size_t i = 0; i < size; i++) {
-      HInstruction* instruction = removed_instructions_[i];
-      DCHECK(instruction != nullptr);
-      HInstruction* substitute = substitute_instructions_[i];
-      if (substitute != nullptr) {
-        // Keep tracing substitute till one that's not removed.
-        HInstruction* sub_sub = FindSubstitute(substitute);
-        while (sub_sub != substitute) {
-          substitute = sub_sub;
-          sub_sub = FindSubstitute(substitute);
-        }
-        instruction->ReplaceWith(substitute);
+      HInstruction* load = removed_loads_[i];
+      DCHECK(load != nullptr);
+      DCHECK(load->IsInstanceFieldGet() ||
+             load->IsStaticFieldGet() ||
+             load->IsArrayGet());
+      HInstruction* substitute = substitute_instructions_for_loads_[i];
+      DCHECK(substitute != nullptr);
+      // Keep tracing substitute till one that's not removed.
+      HInstruction* sub_sub = FindSubstitute(substitute);
+      while (sub_sub != substitute) {
+        substitute = sub_sub;
+        sub_sub = FindSubstitute(substitute);
       }
-      instruction->GetBlock()->RemoveInstruction(instruction);
+      load->ReplaceWith(substitute);
+      load->GetBlock()->RemoveInstruction(load);
     }
+
+    // At this point, stores in possibly_removed_stores_ can be safely removed.
+    size = possibly_removed_stores_.size();
+    for (size_t i = 0; i < size; i++) {
+      HInstruction* store = possibly_removed_stores_[i];
+      DCHECK(store->IsInstanceFieldSet() || store->IsStaticFieldSet() || store->IsArraySet());
+      store->GetBlock()->RemoveInstruction(store);
+    }
+
     // TODO: remove unnecessary allocations.
     // Eliminate instructions in singleton_new_instances_ that:
     // - don't have uses,
@@ -541,6 +513,52 @@ class LSEVisitor : public HGraphVisitor {
   }
 
  private:
+  // If `heap_value` is an instance field store, make sure the store is kept.
+  // This is necessary when a heap value is killed due to merging or loop side
+  // effects (which is essentially also merging), since a later load from the
+  // location won't be eliminated.
+  void KeepIfIsStore(HInstruction* heap_value) {
+    if (heap_value == kDefaultHeapValue ||
+        heap_value == kUnknownHeapValue ||
+        !heap_value->IsInstanceFieldSet()) {
+      return;
+    }
+    auto idx = std::find(possibly_removed_stores_.begin(),
+        possibly_removed_stores_.end(), heap_value);
+    if (idx != possibly_removed_stores_.end()) {
+      // Make sure the store is kept.
+      possibly_removed_stores_.erase(idx);
+    }
+  }
+
+  void HandleLoopSideEffects(HBasicBlock* block) {
+    DCHECK(block->IsLoopHeader());
+    int block_id = block->GetBlockId();
+    ArenaVector<HInstruction*>& heap_values = heap_values_for_[block_id];
+    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
+    ArenaVector<HInstruction*>& pre_header_heap_values =
+        heap_values_for_[pre_header->GetBlockId()];
+    // We do a single pass in reverse post order. For loops, use the side effects as a hint
+    // to see if the heap values should be killed.
+    if (side_effects_.GetLoopEffects(block).DoesAnyWrite()) {
+      for (size_t i = 0; i < pre_header_heap_values.size(); i++) {
+        // The heap value is killed by loop side effects, so the last store must be kept.
+        KeepIfIsStore(pre_header_heap_values[i]);
+      }
+      if (kIsDebugBuild) {
+        // heap_values should still hold the kUnknownHeapValue it was initialized with.
+        for (size_t i = 0; i < heap_values.size(); i++) {
+          DCHECK_EQ(heap_values[i], kUnknownHeapValue);
+        }
+      }
+    } else {
+      // Inherit the values from pre-header.
+      for (size_t i = 0; i < heap_values.size(); i++) {
+        heap_values[i] = pre_header_heap_values[i];
+      }
+    }
+  }
+
   void MergePredecessorValues(HBasicBlock* block) {
     const ArenaVector<HBasicBlock*>& predecessors = block->GetPredecessors();
     if (predecessors.size() == 0) {
@@ -548,16 +566,25 @@ class LSEVisitor : public HGraphVisitor {
     }
     ArenaVector<HInstruction*>& heap_values = heap_values_for_[block->GetBlockId()];
     for (size_t i = 0; i < heap_values.size(); i++) {
-      HInstruction* value = heap_values_for_[predecessors[0]->GetBlockId()][i];
-      if (value != kUnknownHeapValue) {
+      HInstruction* pred0_value = heap_values_for_[predecessors[0]->GetBlockId()][i];
+      heap_values[i] = pred0_value;
+      if (pred0_value != kUnknownHeapValue) {
         for (size_t j = 1; j < predecessors.size(); j++) {
-          if (heap_values_for_[predecessors[j]->GetBlockId()][i] != value) {
-            value = kUnknownHeapValue;
+          HInstruction* pred_value = heap_values_for_[predecessors[j]->GetBlockId()][i];
+          if (pred_value != pred0_value) {
+            heap_values[i] = kUnknownHeapValue;
             break;
           }
         }
       }
-      heap_values[i] = value;
+
+      if (heap_values[i] == kUnknownHeapValue) {
+        // Keep the last store in each predecessor since future loads cannot be eliminated.
+        for (size_t j = 0; j < predecessors.size(); j++) {
+          ArenaVector<HInstruction*>& pred_values = heap_values_for_[predecessors[j]->GetBlockId()];
+          KeepIfIsStore(pred_values[i]);
+        }
+      }
     }
   }
 
@@ -616,21 +643,30 @@ class LSEVisitor : public HGraphVisitor {
     HInstruction* heap_value = heap_values[idx];
     if (heap_value == kDefaultHeapValue) {
       HInstruction* constant = GetDefaultValue(instruction->GetType());
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(constant);
+      removed_loads_.push_back(instruction);
+      substitute_instructions_for_loads_.push_back(constant);
       heap_values[idx] = constant;
       return;
     }
+    if (heap_value != kUnknownHeapValue && heap_value->IsInstanceFieldSet()) {
+      HInstruction* store = heap_value;
+      // This load must be from a singleton since it reads the same field into which
+      // a "removed" store put the value. That store must be to a singleton's field.
+      DCHECK(ref_info->IsSingleton());
+      // Get the real heap value of the store.
+      heap_value = store->InputAt(1);
+    }
     if ((heap_value != kUnknownHeapValue) &&
         // Keep the load due to possible I/F, J/D array aliasing.
         // See b/22538329 for details.
         (heap_value->GetType() == instruction->GetType())) {
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(heap_value);
+      removed_loads_.push_back(instruction);
+      substitute_instructions_for_loads_.push_back(heap_value);
       TryRemovingNullCheck(instruction);
       return;
     }
 
+    // Load isn't eliminated.
     if (heap_value == kUnknownHeapValue) {
       // Put the load as the value into the HeapLocation.
       // This acts like GVN but with better aliasing analysis.
@@ -662,51 +698,63 @@ class LSEVisitor : public HGraphVisitor {
     ArenaVector<HInstruction*>& heap_values =
         heap_values_for_[instruction->GetBlock()->GetBlockId()];
     HInstruction* heap_value = heap_values[idx];
-    bool redundant_store = false;
+    bool same_value = false;
+    bool possibly_redundant = false;
     if (Equal(heap_value, value)) {
       // Store into the heap location with the same value.
-      redundant_store = true;
+      same_value = true;
     } else if (index != nullptr) {
       // For array element, don't eliminate stores since it can be easily aliased
       // with non-constant index.
     } else if (!heap_location_collector_.MayDeoptimize() &&
-               ref_info->IsSingletonAndNotReturned() &&
-               !heap_location_collector_.GetHeapLocation(idx)->MayBecomeUnknown()) {
-      // Store into a field of a singleton that's not returned. And that value cannot be
-      // killed due to merge. It's redundant since future loads will get the value
-      // set by this instruction.
-      Primitive::Type type = Primitive::kPrimVoid;
-      if (instruction->IsInstanceFieldSet()) {
-        type = instruction->AsInstanceFieldSet()->GetFieldInfo().GetFieldType();
-      } else if (instruction->IsStaticFieldSet()) {
-        type = instruction->AsStaticFieldSet()->GetFieldInfo().GetFieldType();
-      } else {
-        DCHECK(false) << "Must be an instance/static field set instruction.";
-      }
-      if (value->GetType() != type) {
-        // I/F, J/D aliasing should not happen for fields.
-        DCHECK(Primitive::IsIntegralType(value->GetType()));
-        DCHECK(!Primitive::Is64BitType(value->GetType()));
-        DCHECK(Primitive::IsIntegralType(type));
-        DCHECK(!Primitive::Is64BitType(type));
-        // Keep the store since the corresponding load isn't eliminated due to different types.
-        // TODO: handle the different int types so that we can eliminate this store.
-        redundant_store = false;
+               ref_info->IsSingletonAndNotReturned()) {
+      // Store into a field of a singleton that's not returned. The value cannot be
+      // killed due to aliasing/invocation. It can be redundant since future loads can
+      // directly get the value set by this instruction. The value can still be killed due to
+      // merging or loop side effects. Stores whose values are killed due to merging/loop side
+      // effects later will be removed from possibly_removed_stores_ when that is detected.
+      possibly_redundant = true;
+      HNewInstance* new_instance = ref_info->GetReference()->AsNewInstance();
+      DCHECK(new_instance != nullptr);
+      if (new_instance->IsFinalizable()) {
+        // Finalizable objects escape globally. Need to keep the store.
+        possibly_redundant = false;
       } else {
-        redundant_store = true;
+        HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation();
+        if (loop_info != nullptr) {
+          // The instruction is a store in the loop, so the loop's effects must include a write.
+          DCHECK(side_effects_.GetLoopEffects(loop_info->GetHeader()).DoesAnyWrite());
+
+          if (loop_info->IsLoopInvariant(original_ref, false)) {
+            DCHECK(original_ref->GetBlock()->Dominates(loop_info->GetPreHeader()));
+            // Keep the store since its value may be needed at the loop header.
+            possibly_redundant = false;
+          } else {
+            // The singleton is created inside the loop. Value stored to it isn't needed at
+            // the loop header. This is true for outer loops also.
+          }
+        }
       }
-      // TODO: eliminate the store if the singleton object is not finalizable.
-      redundant_store = false;
     }
-    if (redundant_store) {
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(nullptr);
-      TryRemovingNullCheck(instruction);
+    if (same_value || possibly_redundant) {
+      possibly_removed_stores_.push_back(instruction);
     }
 
-    heap_values[idx] = value;
+    if (!same_value) {
+      if (possibly_redundant) {
+        DCHECK(instruction->IsInstanceFieldSet());
+        // Put the store as the heap value. If the value is loaded from heap
+        // by a load later, this store isn't really redundant.
+        heap_values[idx] = instruction;
+      } else {
+        heap_values[idx] = value;
+      }
+    }
     // This store may kill values in other heap locations due to aliasing.
     for (size_t i = 0; i < heap_values.size(); i++) {
+      if (i == idx) {
+        continue;
+      }
       if (heap_values[i] == value) {
         // Same value should be kept even if aliasing happens.
         continue;
@@ -834,9 +882,10 @@ class LSEVisitor : public HGraphVisitor {
       return;
     }
     if (!heap_location_collector_.MayDeoptimize() &&
-        ref_info->IsSingletonAndNotReturned()) {
-      // The allocation might be eliminated.
-      singleton_new_instances_.push_back(new_instance);
+        ref_info->IsSingletonAndNotReturned() &&
+        !new_instance->IsFinalizable() &&
+        !new_instance->CanThrow()) {
+      // TODO: add new_instance to singleton_new_instances_ and enable allocation elimination.
     }
     ArenaVector<HInstruction*>& heap_values =
         heap_values_for_[new_instance->GetBlock()->GetBlockId()];
@@ -854,10 +903,10 @@ class LSEVisitor : public HGraphVisitor {
   // Find an instruction's substitute if it should be removed.
   // Return the same instruction if it should not be removed.
   HInstruction* FindSubstitute(HInstruction* instruction) {
-    size_t size = removed_instructions_.size();
+    size_t size = removed_loads_.size();
     for (size_t i = 0; i < size; i++) {
-      if (removed_instructions_[i] == instruction) {
-        return substitute_instructions_[i];
+      if (removed_loads_[i] == instruction) {
+        return substitute_instructions_for_loads_[i];
       }
     }
     return instruction;
@@ -871,8 +920,13 @@ class LSEVisitor : public HGraphVisitor {
 
   // We record the instructions that should be eliminated but may be
   // used by heap locations. They'll be removed in the end.
-  ArenaVector<HInstruction*> removed_instructions_;
-  ArenaVector<HInstruction*> substitute_instructions_;
+  ArenaVector<HInstruction*> removed_loads_;
+  ArenaVector<HInstruction*> substitute_instructions_for_loads_;
+
+  // Stores in this list may be removed from the list later when it's
+  // found that the store cannot be eliminated.
+  ArenaVector<HInstruction*> possibly_removed_stores_;
+
   ArenaVector<HInstruction*> singleton_new_instances_;
 
   DISALLOW_COPY_AND_ASSIGN(LSEVisitor);
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 1181007666..63bbc2cd0a 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -594,6 +594,10 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> {
     return intrinsified_;
   }
 
+  void SetIntrinsified(bool intrinsified) {
+    intrinsified_ = intrinsified;  // Plain overwrite of the flag; no other state is touched.
+  }
+
  private:
   ArenaVector<Location> inputs_;
   ArenaVector<Location> temps_;
@@ -613,7 +617,7 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> {
   RegisterSet live_registers_;
 
   // Whether these are locations for an intrinsified call.
-  const bool intrinsified_;
+  bool intrinsified_;
 
   ART_FRIEND_TEST(RegisterAllocatorTest, ExpectedInRegisterHint);
   ART_FRIEND_TEST(RegisterAllocatorTest, SameAsFirstInputHint);
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 68fb0acf7f..0a39ff31bf 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -17,6 +17,7 @@
 #include "nodes.h"
 
 #include "code_generator.h"
+#include "common_dominator.h"
 #include "ssa_builder.h"
 #include "base/bit_vector-inl.h"
 #include "base/bit_utils.h"
@@ -179,7 +180,10 @@ void HGraph::ComputeDominanceInformation() {
       if (successor->GetDominator() == nullptr) {
         successor->SetDominator(current);
       } else {
-        successor->SetDominator(FindCommonDominator(successor->GetDominator(), current));
+        // The CommonDominator can work for multiple blocks as long as the
+        // domination information doesn't change. However, since we're changing
+        // that information here, we can use the finder only for pairs of blocks.
+        successor->SetDominator(CommonDominator::ForPair(successor->GetDominator(), current));
       }
 
       // Once all the forward edges have been visited, we know the immediate
@@ -194,24 +198,6 @@ void HGraph::ComputeDominanceInformation() {
   }
 }
 
-HBasicBlock* HGraph::FindCommonDominator(HBasicBlock* first, HBasicBlock* second) const {
-  ArenaBitVector visited(arena_, blocks_.size(), false);
-  // Walk the dominator tree of the first block and mark the visited blocks.
-  while (first != nullptr) {
-    visited.SetBit(first->GetBlockId());
-    first = first->GetDominator();
-  }
-  // Walk the dominator tree of the second block until a marked block is found.
-  while (second != nullptr) {
-    if (visited.IsBitSet(second->GetBlockId())) {
-      return second;
-    }
-    second = second->GetDominator();
-  }
-  LOG(ERROR) << "Could not find common dominator";
-  return nullptr;
-}
-
 void HGraph::TransformToSsa() {
   DCHECK(!reverse_post_order_.empty());
   SsaBuilder ssa_builder(this);
@@ -335,14 +321,24 @@ void HGraph::SimplifyCatchBlocks() {
       // instructions into `normal_block` and links the two blocks with a Goto.
       // Afterwards, incoming normal-flow edges are re-linked to `normal_block`,
       // leaving `catch_block` with the exceptional edges only.
+      //
       // Note that catch blocks with normal-flow predecessors cannot begin with
-      // a MOVE_EXCEPTION instruction, as guaranteed by the verifier.
-      DCHECK(!catch_block->GetFirstInstruction()->IsLoadException());
-      HBasicBlock* normal_block = catch_block->SplitBefore(catch_block->GetFirstInstruction());
-      for (size_t j = 0; j < catch_block->GetPredecessors().size(); ++j) {
-        if (!CheckIfPredecessorAtIsExceptional(*catch_block, j)) {
-          catch_block->GetPredecessors()[j]->ReplaceSuccessor(catch_block, normal_block);
-          --j;
+      // a move-exception instruction, as guaranteed by the verifier. However,
+      // trivially dead predecessors are ignored by the verifier and such code
+      // has not been removed at this stage. We therefore ignore the assumption
+      // and rely on GraphChecker to enforce it after initial DCE is run (b/25492628).
+      HBasicBlock* normal_block = catch_block->SplitCatchBlockAfterMoveException();
+      if (normal_block == nullptr) {
+        // Catch block is either empty or only contains a move-exception. It must
+        // therefore be dead and will be removed during initial DCE. Do nothing.
+        DCHECK(!catch_block->EndsWithControlFlowInstruction());
+      } else {
+        // Catch block was split. Re-link normal-flow edges to the new block.
+        for (size_t j = 0; j < catch_block->GetPredecessors().size(); ++j) {
+          if (!CheckIfPredecessorAtIsExceptional(*catch_block, j)) {
+            catch_block->GetPredecessors()[j]->ReplaceSuccessor(catch_block, normal_block);
+            --j;
+          }
         }
       }
     }
@@ -366,28 +362,45 @@ void HGraph::ComputeTryBlockInformation() {
     HBasicBlock* first_predecessor = block->GetPredecessors()[0];
     DCHECK(!block->IsLoopHeader() || !block->GetLoopInformation()->IsBackEdge(*first_predecessor));
     const HTryBoundary* try_entry = first_predecessor->ComputeTryEntryOfSuccessors();
-    if (try_entry != nullptr) {
+    if (try_entry != nullptr &&
+        (block->GetTryCatchInformation() == nullptr ||
+         try_entry != &block->GetTryCatchInformation()->GetTryEntry())) {
+      // We are either setting try block membership for the first time or it
+      // has changed.
       block->SetTryCatchInformation(new (arena_) TryCatchInformation(*try_entry));
     }
   }
 }
 
 void HGraph::SimplifyCFG() {
-  // Simplify the CFG for future analysis, and code generation:
+  // Simplify the CFG for future analysis, and code generation:
   // (1): Split critical edges.
-  // (2): Simplify loops by having only one back edge, and one preheader.
+  // (2): Simplify loops by having only one preheader.
   // NOTE: We're appending new blocks inside the loop, so we need to use index because iterators
   // can be invalidated. We remember the initial size to avoid iterating over the new blocks.
   for (size_t block_id = 0u, end = blocks_.size(); block_id != end; ++block_id) {
     HBasicBlock* block = blocks_[block_id];
     if (block == nullptr) continue;
-    if (block->NumberOfNormalSuccessors() > 1) {
-      for (size_t j = 0; j < block->GetSuccessors().size(); ++j) {
-        HBasicBlock* successor = block->GetSuccessors()[j];
+    if (block->GetSuccessors().size() > 1) {
+      // Only split normal-flow edges. We cannot split exceptional edges as they
+      // are synthesized (approximate real control flow), and we do not need to
+      // anyway. Moves that would be inserted there are performed by the runtime.
+      ArrayRef<HBasicBlock* const> normal_successors = block->GetNormalSuccessors();
+      for (size_t j = 0, e = normal_successors.size(); j < e; ++j) {
+        HBasicBlock* successor = normal_successors[j];
         DCHECK(!successor->IsCatchBlock());
-        if (successor->GetPredecessors().size() > 1) {
+        if (successor == exit_block_) {
+          // Throw->TryBoundary->Exit. Special case which we do not want to split
+          // because Goto->Exit is not allowed.
+          DCHECK(block->IsSingleTryBoundary());
+          DCHECK(block->GetSinglePredecessor()->GetLastInstruction()->IsThrow());
+        } else if (successor->GetPredecessors().size() > 1) {
           SplitCriticalEdge(block, successor);
-          --j;
+          // SplitCriticalEdge could have invalidated the `normal_successors`
+          // ArrayRef. We must re-acquire it.
+          normal_successors = block->GetNormalSuccessors();
+          DCHECK_EQ(normal_successors[j]->GetSingleSuccessor(), successor);
+          DCHECK_EQ(e, normal_successors.size());
         }
       }
     }
@@ -1082,6 +1095,8 @@ HConstant* HBinaryOperation::TryStaticEvaluation() const {
     } else if (GetRight()->IsLongConstant()) {
       return Evaluate(GetLeft()->AsLongConstant(), GetRight()->AsLongConstant());
     }
+  } else if (GetLeft()->IsNullConstant() && GetRight()->IsNullConstant()) {
+    return Evaluate(GetLeft()->AsNullConstant(), GetRight()->AsNullConstant());
   }
   return nullptr;
 }
@@ -1163,7 +1178,7 @@ void HInstruction::MoveBefore(HInstruction* cursor) {
 }
 
 HBasicBlock* HBasicBlock::SplitBefore(HInstruction* cursor) {
-  DCHECK(!graph_->IsInSsaForm()) << "Support for SSA form not implemented";
+  DCHECK(!graph_->IsInSsaForm()) << "Support for SSA form not implemented.";
   DCHECK_EQ(cursor->GetBlock(), this);
 
   HBasicBlock* new_block = new (GetGraph()->GetArena()) HBasicBlock(GetGraph(),
@@ -1193,7 +1208,7 @@ HBasicBlock* HBasicBlock::SplitBefore(HInstruction* cursor) {
 }
 
 HBasicBlock* HBasicBlock::CreateImmediateDominator() {
-  DCHECK(!graph_->IsInSsaForm()) << "Support for SSA form not implemented";
+  DCHECK(!graph_->IsInSsaForm()) << "Support for SSA form not implemented.";
   DCHECK(!IsCatchBlock()) << "Support for updating try/catch information not implemented.";
 
   HBasicBlock* new_block = new (GetGraph()->GetArena()) HBasicBlock(GetGraph(), GetDexPc());
@@ -1209,6 +1224,34 @@ HBasicBlock* HBasicBlock::CreateImmediateDominator() {
   return new_block;
 }
 
+HBasicBlock* HBasicBlock::SplitCatchBlockAfterMoveException() {
+  DCHECK(!graph_->IsInSsaForm()) << "Support for SSA form not implemented.";
+  DCHECK(IsCatchBlock()) << "This method is intended for catch blocks only.";
+
+  HInstruction* first_insn = GetFirstInstruction();
+  HInstruction* split_before = nullptr;
+
+  if (first_insn != nullptr && first_insn->IsLoadException()) {
+    // Catch block starts with a LoadException. Split the block after the
+    // StoreLocal and ClearException, which are expected to follow the load.
+    DCHECK(first_insn->GetNext()->IsStoreLocal());
+    DCHECK(first_insn->GetNext()->GetNext()->IsClearException());
+    split_before = first_insn->GetNext()->GetNext()->GetNext();
+  } else {
+    // Catch block does not load the exception. Split at the beginning
+    // to create an empty catch block (first_insn may be null if the block is empty).
+    split_before = first_insn;
+  }
+
+  if (split_before == nullptr) {
+    // Nothing follows the split point, so there is nothing to move into a new
+    // block (the block must be dead). Return nullptr instead of splitting.
+    return nullptr;
+  } else {
+    return SplitBefore(split_before);
+  }
+}
+
 HBasicBlock* HBasicBlock::SplitAfter(HInstruction* cursor) {
   DCHECK(!cursor->IsControlFlow());
   DCHECK_NE(instructions_.last_instruction_, cursor);
@@ -1293,17 +1336,38 @@ bool HBasicBlock::HasSinglePhi() const {
   return !GetPhis().IsEmpty() && GetFirstPhi()->GetNext() == nullptr;
 }
 
+ArrayRef<HBasicBlock* const> HBasicBlock::GetNormalSuccessors() const {
+  if (EndsWithTryBoundary()) {
+    // HTryBoundary keeps its normal-flow successor at index zero; expose only that entry.
+    DCHECK_EQ(successors_[0], GetLastInstruction()->AsTryBoundary()->GetNormalFlowSuccessor());
+    return ArrayRef<HBasicBlock* const>(successors_).SubArray(0u, 1u);
+  } else {
+    // All successors of blocks not ending with TryBoundary are normal: expose the full list.
+    return ArrayRef<HBasicBlock* const>(successors_);
+  }
+}
+
+ArrayRef<HBasicBlock* const> HBasicBlock::GetExceptionalSuccessors() const {
+  if (EndsWithTryBoundary()) {
+    return GetLastInstruction()->AsTryBoundary()->GetExceptionHandlers();
+  } else {
+    // Blocks not ending with TryBoundary have no exceptional successors: return an empty ref.
+    return ArrayRef<HBasicBlock* const>();
+  }
+}
+
 bool HTryBoundary::HasSameExceptionHandlersAs(const HTryBoundary& other) const {
-  if (GetBlock()->GetSuccessors().size() != other.GetBlock()->GetSuccessors().size()) {
+  ArrayRef<HBasicBlock* const> handlers1 = GetExceptionHandlers();
+  ArrayRef<HBasicBlock* const> handlers2 = other.GetExceptionHandlers();
+
+  size_t length = handlers1.size();
+  if (length != handlers2.size()) {
     return false;
   }
 
   // Exception handlers need to be stored in the same order.
-  for (HExceptionHandlerIterator it1(*this), it2(other);
-       !it1.Done();
-       it1.Advance(), it2.Advance()) {
-    DCHECK(!it2.Done());
-    if (it1.Current() != it2.Current()) {
+  for (size_t i = 0; i < length; ++i) {
+    if (handlers1[i] != handlers2[i]) {
       return false;
     }
   }
@@ -1356,7 +1420,7 @@ void HBasicBlock::DisconnectAndDelete() {
   // iteration.
   DCHECK(dominated_blocks_.empty());
 
-  // Remove the block from all loops it is included in.
+  // (1) Remove the block from all loops it is included in.
   for (HLoopInformationOutwardIterator it(*this); !it.Done(); it.Advance()) {
     HLoopInformation* loop_info = it.Current();
     loop_info->Remove(this);
@@ -1368,17 +1432,34 @@ void HBasicBlock::DisconnectAndDelete() {
     }
   }
 
-  // Disconnect the block from its predecessors and update their control-flow
-  // instructions.
+  // (2) Disconnect the block from its predecessors and update their
+  //     control-flow instructions.
   for (HBasicBlock* predecessor : predecessors_) {
     HInstruction* last_instruction = predecessor->GetLastInstruction();
+    if (last_instruction->IsTryBoundary() && !IsCatchBlock()) {
+      // This block is the only normal-flow successor of the TryBoundary which
+      // makes `predecessor` dead. Since DCE removes blocks in post order,
+      // exception handlers of this TryBoundary were already visited and any
+      // remaining handlers therefore must be live. We remove `predecessor` from
+      // their list of predecessors.
+      DCHECK_EQ(last_instruction->AsTryBoundary()->GetNormalFlowSuccessor(), this);
+      while (predecessor->GetSuccessors().size() > 1) {
+        HBasicBlock* handler = predecessor->GetSuccessors()[1];
+        DCHECK(handler->IsCatchBlock());
+        predecessor->RemoveSuccessor(handler);
+        handler->RemovePredecessor(predecessor);
+      }
+    }
+
     predecessor->RemoveSuccessor(this);
     uint32_t num_pred_successors = predecessor->GetSuccessors().size();
     if (num_pred_successors == 1u) {
       // If we have one successor after removing one, then we must have
-      // had an HIf or HPackedSwitch, as they have more than one successor.
-      // Replace those with a HGoto.
-      DCHECK(last_instruction->IsIf() || last_instruction->IsPackedSwitch());
+      // had an HIf, HPackedSwitch or HTryBoundary, as they have more than one
+      // successor. Replace those with a HGoto.
+      DCHECK(last_instruction->IsIf() ||
+             last_instruction->IsPackedSwitch() ||
+             (last_instruction->IsTryBoundary() && IsCatchBlock()));
       predecessor->RemoveInstruction(last_instruction);
       predecessor->AddInstruction(new (graph_->GetArena()) HGoto(last_instruction->GetDexPc()));
     } else if (num_pred_successors == 0u) {
@@ -1387,15 +1468,17 @@ void HBasicBlock::DisconnectAndDelete() {
       // SSAChecker fails unless it is not removed during the pass too.
       predecessor->RemoveInstruction(last_instruction);
     } else {
-      // There are multiple successors left.  This must come from a HPackedSwitch
-      // and we are in the middle of removing the HPackedSwitch. Like above, leave
-      // this alone, and the SSAChecker will fail if it is not removed as well.
-      DCHECK(last_instruction->IsPackedSwitch());
+      // There are multiple successors left. The removed block might be a successor
+      // of a PackedSwitch which will be completely removed (perhaps replaced with
+      // a Goto), or we are deleting a catch block from a TryBoundary. In either
+      // case, leave `last_instruction` as is for now.
+      DCHECK(last_instruction->IsPackedSwitch() ||
+             (last_instruction->IsTryBoundary() && IsCatchBlock()));
     }
   }
   predecessors_.clear();
 
-  // Disconnect the block from its successors and update their phis.
+  // (3) Disconnect the block from its successors and update their phis.
   for (HBasicBlock* successor : successors_) {
     // Delete this block from the list of predecessors.
     size_t this_index = successor->GetPredecessorIndexOf(this);
@@ -1405,30 +1488,57 @@ void HBasicBlock::DisconnectAndDelete() {
     // dominator of `successor` which violates the order DCHECKed at the top.
     DCHECK(!successor->predecessors_.empty());
 
-    // Remove this block's entries in the successor's phis.
-    if (successor->predecessors_.size() == 1u) {
-      // The successor has just one predecessor left. Replace phis with the only
-      // remaining input.
-      for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
-        HPhi* phi = phi_it.Current()->AsPhi();
-        phi->ReplaceWith(phi->InputAt(1 - this_index));
-        successor->RemovePhi(phi);
-      }
-    } else {
-      for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
-        phi_it.Current()->AsPhi()->RemoveInputAt(this_index);
+    // Remove this block's entries in the successor's phis. Skip exceptional
+    // successors because catch phi inputs do not correspond to predecessor
+    // blocks but throwing instructions. Their inputs will be updated in step (4).
+    if (!successor->IsCatchBlock()) {
+      if (successor->predecessors_.size() == 1u) {
+        // The successor has just one predecessor left. Replace phis with the only
+        // remaining input.
+        for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
+          HPhi* phi = phi_it.Current()->AsPhi();
+          phi->ReplaceWith(phi->InputAt(1 - this_index));
+          successor->RemovePhi(phi);
+        }
+      } else {
+        for (HInstructionIterator phi_it(successor->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
+          phi_it.Current()->AsPhi()->RemoveInputAt(this_index);
+        }
       }
     }
   }
   successors_.clear();
 
+  // (4) Remove instructions and phis. Instructions should have no remaining uses
+  //     except in catch phis. If an instruction is used by a catch phi at `index`,
+  //     remove `index`-th input of all phis in the catch block since they are
+  //     guaranteed dead. Note that we may miss dead inputs this way but the
+  //     graph will always remain consistent.
+  for (HBackwardInstructionIterator it(GetInstructions()); !it.Done(); it.Advance()) {
+    HInstruction* insn = it.Current();
+    while (insn->HasUses()) {
+      DCHECK(IsTryBlock());
+      HUseListNode<HInstruction*>* use = insn->GetUses().GetFirst();
+      size_t use_index = use->GetIndex();
+      HBasicBlock* user_block =  use->GetUser()->GetBlock();
+      DCHECK(use->GetUser()->IsPhi() && user_block->IsCatchBlock());
+      for (HInstructionIterator phi_it(user_block->GetPhis()); !phi_it.Done(); phi_it.Advance()) {
+        phi_it.Current()->AsPhi()->RemoveInputAt(use_index);
+      }
+    }
+
+    RemoveInstruction(insn);
+  }
+  for (HInstructionIterator it(GetPhis()); !it.Done(); it.Advance()) {
+    RemovePhi(it.Current()->AsPhi());
+  }
+
   // Disconnect from the dominator.
   dominator_->RemoveDominatedBlock(this);
   SetDominator(nullptr);
 
-  // Delete from the graph. The function safely deletes remaining instructions
-  // and updates the reverse post order.
-  graph_->DeleteDeadBlock(this);
+  // Delete from the graph, update reverse post order.
+  graph_->DeleteDeadEmptyBlock(this);
   SetGraph(nullptr);
 }
 
@@ -1475,7 +1585,7 @@ void HBasicBlock::MergeWith(HBasicBlock* other) {
   other->predecessors_.clear();
 
   // Delete `other` from the graph. The function updates reverse post order.
-  graph_->DeleteDeadBlock(other);
+  graph_->DeleteDeadEmptyBlock(other);
   other->SetGraph(nullptr);
 }
 
@@ -1539,19 +1649,14 @@ static void MakeRoomFor(ArenaVector<HBasicBlock*>* blocks,
   std::copy_backward(blocks->begin() + after + 1u, blocks->begin() + old_size, blocks->end());
 }
 
-void HGraph::DeleteDeadBlock(HBasicBlock* block) {
+void HGraph::DeleteDeadEmptyBlock(HBasicBlock* block) {
   DCHECK_EQ(block->GetGraph(), this);
   DCHECK(block->GetSuccessors().empty());
   DCHECK(block->GetPredecessors().empty());
   DCHECK(block->GetDominatedBlocks().empty());
   DCHECK(block->GetDominator() == nullptr);
-
-  for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    block->RemoveInstruction(it.Current());
-  }
-  for (HBackwardInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
-    block->RemovePhi(it.Current()->AsPhi());
-  }
+  DCHECK(block->GetInstructions().IsEmpty());
+  DCHECK(block->GetPhis().IsEmpty());
 
   if (block->IsExitBlock()) {
     exit_block_ = nullptr;
@@ -1654,6 +1759,9 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) {
     // (2) the reverse post order of that graph,
     // (3) the potential loop information they are now in,
     // (4) try block membership.
+    // Note that we do not need to update catch phi inputs because they
+    // correspond to the register file of the outer method which the inlinee
+    // cannot modify.
 
     // We don't add the entry block, the exit block, and the first block, which
     // has been merged with `at`.
@@ -1940,6 +2048,39 @@ bool HInvokeStaticOrDirect::NeedsDexCacheOfDeclaringClass() const {
   return !opt.GetDoesNotNeedDexCache();
 }
 
+void HInvokeStaticOrDirect::InsertInputAt(size_t index, HInstruction* input) {
+  inputs_.insert(inputs_.begin() + index, HUserRecord<HInstruction*>(input));
+  input->AddUseAt(this, index);
+  // The insert() pushed every later input back by one; renumber their use nodes to match.
+  for (size_t i = index + 1u, size = inputs_.size(); i != size; ++i) {
+    DCHECK_EQ(InputRecordAt(i).GetUseNode()->GetIndex(), i - 1u);
+    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  }
+}
+
+void HInvokeStaticOrDirect::RemoveInputAt(size_t index) {
+  RemoveAsUserOfInput(index);
+  inputs_.erase(inputs_.begin() + index);
+  // The erase() pulled every later input forward by one; renumber their use nodes to match.
+  for (size_t i = index, e = InputCount(); i < e; ++i) {
+    DCHECK_EQ(InputRecordAt(i).GetUseNode()->GetIndex(), i + 1u);
+    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs) {
+  switch (rhs) {  // Streams a lowercase name for each named requirement.
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit:
+      return os << "explicit";
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kImplicit:
+      return os << "implicit";
+    case HInvokeStaticOrDirect::ClinitCheckRequirement::kNone:
+      return os << "none";
+    default:  // Any value not named above is printed as its raw integer.
+      return os << "unknown:" << static_cast<int>(rhs);
+  }
+}
+
 void HInstruction::RemoveEnvironmentUsers() {
   for (HUseIterator<HEnvironment*> use_it(GetEnvUses()); !use_it.Done(); use_it.Advance()) {
     HUseListNode<HEnvironment*>* user_node = use_it.Current();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 0f2c1cffee..4f894b07c7 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -35,6 +35,7 @@
 #include "mirror/class.h"
 #include "offsets.h"
 #include "primitive.h"
+#include "utils/array_ref.h"
 
 namespace art {
 
@@ -240,8 +241,9 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
   // put deoptimization instructions, etc.
   void TransformLoopHeaderForBCE(HBasicBlock* header);
 
-  // Removes `block` from the graph.
-  void DeleteDeadBlock(HBasicBlock* block);
+  // Removes `block` from the graph. Assumes `block` has been disconnected from
+  // other blocks and has no instructions or phis.
+  void DeleteDeadEmptyBlock(HBasicBlock* block);
 
   // Splits the edge between `block` and `successor` while preserving the
   // indices in the predecessor/successor lists. If there are multiple edges
@@ -350,8 +352,6 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
 
   HCurrentMethod* GetCurrentMethod();
 
-  HBasicBlock* FindCommonDominator(HBasicBlock* first, HBasicBlock* second) const;
-
   const DexFile& GetDexFile() const {
     return dex_file_;
   }
@@ -661,6 +661,9 @@ class HBasicBlock : public ArenaObject<kArenaAllocBasicBlock> {
     return successors_;
   }
 
+  ArrayRef<HBasicBlock* const> GetNormalSuccessors() const;
+  ArrayRef<HBasicBlock* const> GetExceptionalSuccessors() const;
+
   bool HasSuccessor(const HBasicBlock* block, size_t start_from = 0u) {
     return ContainsElement(successors_, block, start_from);
   }
@@ -811,12 +814,6 @@ class HBasicBlock : public ArenaObject<kArenaAllocBasicBlock> {
     return GetPredecessorIndexOf(predecessor) == idx;
   }
 
-  // Returns the number of non-exceptional successors. SsaChecker ensures that
-  // these are stored at the beginning of the successor list.
-  size_t NumberOfNormalSuccessors() const {
-    return EndsWithTryBoundary() ? 1 : GetSuccessors().size();
-  }
-
   // Create a new block between this block and its predecessors. The new block
   // is added to the graph, all predecessor edges are relinked to it and an edge
   // is created to `this`. Returns the new empty block. Reverse post order or
@@ -837,6 +834,15 @@ class HBasicBlock : public ArenaObject<kArenaAllocBasicBlock> {
   // blocks are consistent (for example ending with a control flow instruction).
   HBasicBlock* SplitAfter(HInstruction* cursor);
 
+  // Split catch block into two blocks after the original move-exception bytecode
+  // instruction, or at the beginning if not present. Returns the newly created,
+  // latter block, or nullptr if such block could not be created (must be dead
+  // in that case). Note that this method just updates raw block information,
+  // like predecessors, successors, dominators, and instruction list. It does not
+  // update the graph, reverse post order, loop information, nor make sure the
+  // blocks are consistent (for example ending with a control flow instruction).
+  HBasicBlock* SplitCatchBlockAfterMoveException();
+
   // Merge `other` at the end of `this`. Successors and dominated blocks of
   // `other` are changed to be successors and dominated blocks of `this`. Note
   // that this method does not update the graph, reverse post order, loop
@@ -1430,7 +1436,7 @@ class SideEffects : public ValueObject {
     return flags_ == (kAllChangeBits | kAllDependOnBits);
   }
 
-  // Returns true if this may read something written by other.
+  // Returns true if `this` may read something written by `other`.
   bool MayDependOn(SideEffects other) const {
     const uint64_t depends_on_flags = (flags_ & kAllDependOnBits) >> kChangeBits;
     return (other.flags_ & depends_on_flags);
@@ -1725,6 +1731,13 @@ class ReferenceTypeInfo : ValueObject {
     return GetTypeHandle()->IsAssignableFrom(rti.GetTypeHandle().Get());
   }
 
+  bool IsStrictSupertypeOf(ReferenceTypeInfo rti) const SHARED_REQUIRES(Locks::mutator_lock_) {
+    DCHECK(IsValid());
+    DCHECK(rti.IsValid());
+    return GetTypeHandle().Get() != rti.GetTypeHandle().Get() &&
+        GetTypeHandle()->IsAssignableFrom(rti.GetTypeHandle().Get());
+  }
+
   // Returns true if the type information provide the same amount of details.
   // Note that it does not mean that the instructions have the same actual type
   // (because the type can be the result of a merge).
@@ -2390,6 +2403,10 @@ class HTryBoundary : public HTemplateInstruction<0> {
   // Returns the block's non-exceptional successor (index zero).
   HBasicBlock* GetNormalFlowSuccessor() const { return GetBlock()->GetSuccessors()[0]; }
 
+  ArrayRef<HBasicBlock* const> GetExceptionHandlers() const {
+    return ArrayRef<HBasicBlock* const>(GetBlock()->GetSuccessors()).SubArray(1u);
+  }
+
   // Returns whether `handler` is among its exception handlers (non-zero index
   // successors).
   bool HasExceptionHandler(const HBasicBlock& handler) const {
@@ -2417,25 +2434,6 @@ class HTryBoundary : public HTemplateInstruction<0> {
   DISALLOW_COPY_AND_ASSIGN(HTryBoundary);
 };
 
-// Iterator over exception handlers of a given HTryBoundary, i.e. over
-// exceptional successors of its basic block.
-class HExceptionHandlerIterator : public ValueObject {
- public:
-  explicit HExceptionHandlerIterator(const HTryBoundary& try_boundary)
-    : block_(*try_boundary.GetBlock()), index_(block_.NumberOfNormalSuccessors()) {}
-
-  bool Done() const { return index_ == block_.GetSuccessors().size(); }
-  HBasicBlock* Current() const { return block_.GetSuccessors()[index_]; }
-  size_t CurrentSuccessorIndex() const { return index_; }
-  void Advance() { ++index_; }
-
- private:
-  const HBasicBlock& block_;
-  size_t index_;
-
-  DISALLOW_COPY_AND_ASSIGN(HExceptionHandlerIterator);
-};
-
 // Deoptimize to interpreter, upon checking a condition.
 class HDeoptimize : public HTemplateInstruction<1> {
  public:
@@ -2604,6 +2602,11 @@ class HBinaryOperation : public HExpression<2> {
     VLOG(compiler) << DebugName() << " is not defined for the (long, int) case.";
     return nullptr;
   }
+  virtual HConstant* Evaluate(HNullConstant* x ATTRIBUTE_UNUSED,
+                              HNullConstant* y ATTRIBUTE_UNUSED) const {
+    VLOG(compiler) << DebugName() << " is not defined for the (null, null) case.";
+    return nullptr;
+  }
 
   // Returns an input that can legally be used as the right input and is
   // constant, or null.
@@ -2694,6 +2697,10 @@ class HEqual : public HCondition {
     return GetBlock()->GetGraph()->GetIntConstant(
         Compute(x->GetValue(), y->GetValue()), GetDexPc());
   }
+  HConstant* Evaluate(HNullConstant* x ATTRIBUTE_UNUSED,
+                      HNullConstant* y ATTRIBUTE_UNUSED) const OVERRIDE {
+    return GetBlock()->GetGraph()->GetIntConstant(1);
+  }
 
   DECLARE_INSTRUCTION(Equal);
 
@@ -2726,6 +2733,10 @@ class HNotEqual : public HCondition {
     return GetBlock()->GetGraph()->GetIntConstant(
         Compute(x->GetValue(), y->GetValue()), GetDexPc());
   }
+  HConstant* Evaluate(HNullConstant* x ATTRIBUTE_UNUSED,
+                      HNullConstant* y ATTRIBUTE_UNUSED) const OVERRIDE {
+    return GetBlock()->GetGraph()->GetIntConstant(0);
+  }
 
   DECLARE_INSTRUCTION(NotEqual);
 
@@ -3399,11 +3410,12 @@ class HInvokeStaticOrDirect : public HInvoke {
                         ClinitCheckRequirement clinit_check_requirement)
       : HInvoke(arena,
                 number_of_arguments,
-                // There is one extra argument for the HCurrentMethod node, and
-                // potentially one other if the clinit check is explicit, and one other
-                // if the method is a string factory.
-                1u + (clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 1u : 0u)
-                   + (dispatch_info.method_load_kind == MethodLoadKind::kStringInit ? 1u : 0u),
+                // There is potentially one extra argument for the HCurrentMethod node, and
+                // potentially one other if the clinit check is explicit, and potentially
+                // one other if the method is a string factory.
+                (NeedsCurrentMethodInput(dispatch_info.method_load_kind) ? 1u : 0u) +
+                    (clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 1u : 0u) +
+                    (dispatch_info.method_load_kind == MethodLoadKind::kStringInit ? 1u : 0u),
                 return_type,
                 dex_pc,
                 method_index,
@@ -3411,12 +3423,31 @@ class HInvokeStaticOrDirect : public HInvoke {
         invoke_type_(invoke_type),
         clinit_check_requirement_(clinit_check_requirement),
         target_method_(target_method),
-        dispatch_info_(dispatch_info) {}
+        dispatch_info_(dispatch_info) { }
 
   void SetDispatchInfo(const DispatchInfo& dispatch_info) {
+    bool had_current_method_input = HasCurrentMethodInput();
+    bool needs_current_method_input = NeedsCurrentMethodInput(dispatch_info.method_load_kind);
+
+    // Using the current method is the default and once we find a better
+    // method load kind, we should not go back to using the current method.
+    DCHECK(had_current_method_input || !needs_current_method_input);
+
+    if (had_current_method_input && !needs_current_method_input) {
+      DCHECK_EQ(InputAt(GetSpecialInputIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
+      RemoveInputAt(GetSpecialInputIndex());
+    }
     dispatch_info_ = dispatch_info;
   }
 
+  void AddSpecialInput(HInstruction* input) {
+    // We allow only one special input.
+    DCHECK(!IsStringInit() && !HasCurrentMethodInput());
+    DCHECK(InputCount() == GetSpecialInputIndex() ||
+           (InputCount() == GetSpecialInputIndex() + 1 && IsStaticWithExplicitClinitCheck()));
+    InsertInputAt(GetSpecialInputIndex(), input);
+  }
+
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE {
     // We access the method via the dex cache so we can't do an implicit null check.
     // TODO: for intrinsics we can generate implicit null checks.
@@ -3427,17 +3458,35 @@ class HInvokeStaticOrDirect : public HInvoke {
     return return_type_ == Primitive::kPrimNot && !IsStringInit();
   }
 
+  // Get the index of the special input, if any.
+  //
+  // If the invoke IsStringInit(), it initially has a HFakeString special argument
+  // which is removed by the instruction simplifier; if the invoke HasCurrentMethodInput(),
+  // the "special input" is the current method pointer; otherwise there may be one
+  // platform-specific special input, such as PC-relative addressing base.
+  uint32_t GetSpecialInputIndex() const { return GetNumberOfArguments(); }
+
   InvokeType GetInvokeType() const { return invoke_type_; }
   MethodLoadKind GetMethodLoadKind() const { return dispatch_info_.method_load_kind; }
   CodePtrLocation GetCodePtrLocation() const { return dispatch_info_.code_ptr_location; }
   bool IsRecursive() const { return GetMethodLoadKind() == MethodLoadKind::kRecursive; }
   bool NeedsDexCacheOfDeclaringClass() const OVERRIDE;
   bool IsStringInit() const { return GetMethodLoadKind() == MethodLoadKind::kStringInit; }
-  uint32_t GetCurrentMethodInputIndex() const { return GetNumberOfArguments(); }
   bool HasMethodAddress() const { return GetMethodLoadKind() == MethodLoadKind::kDirectAddress; }
-  bool HasPcRelDexCache() const {
+  bool HasPcRelativeDexCache() const {
     return GetMethodLoadKind() == MethodLoadKind::kDexCachePcRelative;
   }
+  bool HasCurrentMethodInput() const {
+    // This function can be called only after the invoke has been fully initialized by the builder.
+    if (NeedsCurrentMethodInput(GetMethodLoadKind())) {
+      DCHECK(InputAt(GetSpecialInputIndex())->IsCurrentMethod());
+      return true;
+    } else {
+      DCHECK(InputCount() == GetSpecialInputIndex() ||
+             !InputAt(GetSpecialInputIndex())->IsCurrentMethod());
+      return false;
+    }
+  }
   bool HasDirectCodePtr() const { return GetCodePtrLocation() == CodePtrLocation::kCallDirect; }
   MethodReference GetTargetMethod() const { return target_method_; }
 
@@ -3452,7 +3501,7 @@ class HInvokeStaticOrDirect : public HInvoke {
   }
 
   uint32_t GetDexCacheArrayOffset() const {
-    DCHECK(HasPcRelDexCache());
+    DCHECK(HasPcRelativeDexCache());
     return dispatch_info_.method_load_data;
   }
 
@@ -3468,26 +3517,25 @@ class HInvokeStaticOrDirect : public HInvoke {
     return GetInvokeType() == kStatic;
   }
 
-  // Remove the art::HLoadClass instruction set as last input by
-  // art::PrepareForRegisterAllocation::VisitClinitCheck in lieu of
-  // the initial art::HClinitCheck instruction (only relevant for
-  // static calls with explicit clinit check).
-  void RemoveLoadClassAsLastInput() {
+  // Remove the HClinitCheck or the replacement HLoadClass (set as last input by
+  // PrepareForRegisterAllocation::VisitClinitCheck() in lieu of the initial HClinitCheck)
+  // instruction; only relevant for static calls with explicit clinit check.
+  void RemoveExplicitClinitCheck(ClinitCheckRequirement new_requirement) {
     DCHECK(IsStaticWithExplicitClinitCheck());
     size_t last_input_index = InputCount() - 1;
     HInstruction* last_input = InputAt(last_input_index);
     DCHECK(last_input != nullptr);
-    DCHECK(last_input->IsLoadClass()) << last_input->DebugName();
+    DCHECK(last_input->IsLoadClass() || last_input->IsClinitCheck()) << last_input->DebugName();
     RemoveAsUserOfInput(last_input_index);
     inputs_.pop_back();
-    clinit_check_requirement_ = ClinitCheckRequirement::kImplicit;
-    DCHECK(IsStaticWithImplicitClinitCheck());
+    clinit_check_requirement_ = new_requirement;
+    DCHECK(!IsStaticWithExplicitClinitCheck());
   }
 
   bool IsStringFactoryFor(HFakeString* str) const {
     if (!IsStringInit()) return false;
-    // +1 for the current method.
-    if (InputCount() == (number_of_arguments_ + 1)) return false;
+    DCHECK(!HasCurrentMethodInput());
+    if (InputCount() == (number_of_arguments_)) return false;
     return InputAt(InputCount() - 1)->AsFakeString() == str;
   }
 
@@ -3502,7 +3550,7 @@ class HInvokeStaticOrDirect : public HInvoke {
   }
 
   // Is this a call to a static method whose declaring class has an
-  // explicit intialization check in the graph?
+  // explicit initialization check in the graph?
   bool IsStaticWithExplicitClinitCheck() const {
     return IsStatic() && (clinit_check_requirement_ == ClinitCheckRequirement::kExplicit);
   }
@@ -3513,6 +3561,11 @@ class HInvokeStaticOrDirect : public HInvoke {
     return IsStatic() && (clinit_check_requirement_ == ClinitCheckRequirement::kImplicit);
   }
 
+  // Does this method load kind need the current method as an input?
+  static bool NeedsCurrentMethodInput(MethodLoadKind kind) {
+    return kind == MethodLoadKind::kRecursive || kind == MethodLoadKind::kDexCacheViaMethod;
+  }
+
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
 
  protected:
@@ -3530,6 +3583,9 @@ class HInvokeStaticOrDirect : public HInvoke {
     return input_record;
   }
 
+  void InsertInputAt(size_t index, HInstruction* input);
+  void RemoveInputAt(size_t index);
+
  private:
   const InvokeType invoke_type_;
   ClinitCheckRequirement clinit_check_requirement_;
@@ -3541,6 +3597,7 @@ class HInvokeStaticOrDirect : public HInvoke {
 
   DISALLOW_COPY_AND_ASSIGN(HInvokeStaticOrDirect);
 };
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs);
 
 class HInvokeVirtual : public HInvoke {
  public:
@@ -3601,10 +3658,14 @@ class HNewInstance : public HExpression<1> {
                uint32_t dex_pc,
                uint16_t type_index,
                const DexFile& dex_file,
+               bool can_throw,
+               bool finalizable,
                QuickEntrypointEnum entrypoint)
       : HExpression(Primitive::kPrimNot, SideEffects::CanTriggerGC(), dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
+        can_throw_(can_throw),
+        finalizable_(finalizable),
         entrypoint_(entrypoint) {
     SetRawInputAt(0, current_method);
   }
@@ -3614,11 +3675,13 @@ class HNewInstance : public HExpression<1> {
 
   // Calls runtime so needs an environment.
   bool NeedsEnvironment() const OVERRIDE { return true; }
-  // It may throw when called on:
-  //   - interfaces
-  //   - abstract/innaccessible/unknown classes
-  // TODO: optimize when possible.
-  bool CanThrow() const OVERRIDE { return true; }
+
+  // It may throw when called on a type that's not instantiable/accessible.
+  // It can throw OOME.
+  // TODO: distinguish between the two cases so we can for example allow allocation elimination.
+  bool CanThrow() const OVERRIDE { return can_throw_ || true; }
+
+  bool IsFinalizable() const { return finalizable_; }
 
   bool CanBeNull() const OVERRIDE { return false; }
 
@@ -3629,6 +3692,8 @@ class HNewInstance : public HExpression<1> {
  private:
   const uint16_t type_index_;
   const DexFile& dex_file_;
+  const bool can_throw_;
+  const bool finalizable_;
   const QuickEntrypointEnum entrypoint_;
 
   DISALLOW_COPY_AND_ASSIGN(HNewInstance);
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 7e3c5e602e..2204921c53 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -24,7 +24,7 @@
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_x86
-#include "constant_area_fixups_x86.h"
+#include "pc_relative_fixups_x86.h"
 #endif
 
 #include "art_method-inl.h"
@@ -383,6 +383,14 @@ static bool IsInstructionSetSupported(InstructionSet instruction_set) {
       || instruction_set == kX86_64;
 }
 
+// Read barriers are supported only on ARM, x86 and x86-64 at the moment.
+// TODO: Add support for other architectures and remove this function.
+static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) {
+  return instruction_set == kThumb2
+      || instruction_set == kX86
+      || instruction_set == kX86_64;
+}
+
 static void RunOptimizations(HOptimization* optimizations[],
                              size_t length,
                              PassObserver* pass_observer) {
@@ -405,20 +413,9 @@ static void MaybeRunInliner(HGraph* graph,
   if (!should_inline) {
     return;
   }
-
-  ArenaAllocator* arena = graph->GetArena();
-  HInliner* inliner = new (arena) HInliner(
+  HInliner* inliner = new (graph->GetArena()) HInliner(
     graph, codegen, dex_compilation_unit, dex_compilation_unit, driver, handles, stats);
-  ReferenceTypePropagation* type_propagation =
-    new (arena) ReferenceTypePropagation(graph, handles,
-        "reference_type_propagation_after_inlining");
-
-  HOptimization* optimizations[] = {
-    inliner,
-    // Run another type propagation phase: inlining will open up more opportunities
-    // to remove checkcast/instanceof and null checks.
-    type_propagation,
-  };
+  HOptimization* optimizations[] = { inliner };
 
   RunOptimizations(optimizations, arraysize(optimizations), pass_observer);
 }
@@ -446,10 +443,9 @@ static void RunArchOptimizations(InstructionSet instruction_set,
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
     case kX86: {
-      x86::ConstantAreaFixups* constant_area_fixups =
-          new (arena) x86::ConstantAreaFixups(graph, stats);
+      x86::PcRelativeFixups* pc_relative_fixups = new (arena) x86::PcRelativeFixups(graph, stats);
       HOptimization* x86_optimizations[] = {
-        constant_area_fixups
+          pc_relative_fixups
       };
       RunOptimizations(x86_optimizations, arraysize(x86_optimizations), pass_observer);
       break;
@@ -531,6 +527,7 @@ static void RunOptimizations(HGraph* graph,
   //       pipeline for all methods.
   if (graph->HasTryCatch()) {
     HOptimization* optimizations2[] = {
+      boolean_simplify,
       side_effects,
       gvn,
       dce2,
@@ -672,8 +669,8 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena,
   CompilerDriver* compiler_driver = GetCompilerDriver();
   InstructionSet instruction_set = compiler_driver->GetInstructionSet();
 
-  // Always use the thumb2 assembler: some runtime functionality (like implicit stack
-  // overflow checks) assume thumb2.
+  // Always use the Thumb-2 assembler: some runtime functionality
+  // (like implicit stack overflow checks) assumes Thumb-2.
   if (instruction_set == kArm) {
     instruction_set = kThumb2;
   }
@@ -684,6 +681,12 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena,
     return nullptr;
   }
 
+  // When read barriers are enabled, do not attempt to compile for
+  // instruction sets that have no read barrier support.
+  if (kEmitCompilerReadBarrier && !InstructionSetSupportsReadBarrier(instruction_set)) {
+    return nullptr;
+  }
+
   if (Compiler::IsPathologicalCase(*code_item, method_idx, dex_file)) {
     MaybeRecordStat(MethodCompilationStat::kNotCompiledPathological);
     return nullptr;
@@ -852,9 +855,14 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
 
   if (kIsDebugBuild &&
       IsCompilingWithCoreImage() &&
-      IsInstructionSetSupported(compiler_driver->GetInstructionSet())) {
-    // For testing purposes, we put a special marker on method names that should be compiled
-    // with this compiler. This makes sure we're not regressing.
+      IsInstructionSetSupported(compiler_driver->GetInstructionSet()) &&
+      (!kEmitCompilerReadBarrier ||
+       InstructionSetSupportsReadBarrier(compiler_driver->GetInstructionSet()))) {
+    // For testing purposes, we put a special marker on method names
+    // that should be compiled with this compiler (when the
+    // instruction set is supported -- and has support for read
+    // barriers, if they are enabled). This makes sure we're not
+    // regressing.
     std::string method_name = PrettyMethod(method_idx, dex_file);
     bool shouldCompile = method_name.find("$opt$") != std::string::npos;
     DCHECK((method != nullptr) || !shouldCompile) << "Didn't compile " << method_name;
diff --git a/compiler/optimizing/constant_area_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index c3470002c5..808a1dc6c2 100644
--- a/compiler/optimizing/constant_area_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "constant_area_fixups_x86.h"
+#include "pc_relative_fixups_x86.h"
 
 namespace art {
 namespace x86 {
@@ -22,9 +22,9 @@ namespace x86 {
 /**
  * Finds instructions that need the constant area base as an input.
  */
-class ConstantHandlerVisitor : public HGraphVisitor {
+class PCRelativeHandlerVisitor : public HGraphVisitor {
  public:
-  explicit ConstantHandlerVisitor(HGraph* graph) : HGraphVisitor(graph), base_(nullptr) {}
+  explicit PCRelativeHandlerVisitor(HGraph* graph) : HGraphVisitor(graph), base_(nullptr) {}
 
  private:
   void VisitAdd(HAdd* add) OVERRIDE {
@@ -72,7 +72,7 @@ class ConstantHandlerVisitor : public HGraphVisitor {
   void VisitPackedSwitch(HPackedSwitch* switch_insn) OVERRIDE {
     // We need to replace the HPackedSwitch with a HX86PackedSwitch in order to
     // address the constant area.
-    InitializeConstantAreaPointer(switch_insn);
+    InitializePCRelativeBasePointer(switch_insn);
     HGraph* graph = GetGraph();
     HBasicBlock* block = switch_insn->GetBlock();
     HX86PackedSwitch* x86_switch = new (graph->GetArena()) HX86PackedSwitch(
@@ -84,7 +84,7 @@ class ConstantHandlerVisitor : public HGraphVisitor {
     block->ReplaceAndRemoveInstructionWith(switch_insn, x86_switch);
   }
 
-  void InitializeConstantAreaPointer(HInstruction* user) {
+  void InitializePCRelativeBasePointer(HInstruction* user) {
     // Ensure we only initialize the pointer once.
     if (base_ != nullptr) {
       return;
@@ -99,16 +99,23 @@ class ConstantHandlerVisitor : public HGraphVisitor {
   }
 
   void ReplaceInput(HInstruction* insn, HConstant* value, int input_index, bool materialize) {
-    InitializeConstantAreaPointer(insn);
-    HGraph* graph = GetGraph();
-    HBasicBlock* block = insn->GetBlock();
+    InitializePCRelativeBasePointer(insn);
     HX86LoadFromConstantTable* load_constant =
-        new (graph->GetArena()) HX86LoadFromConstantTable(base_, value, materialize);
-    block->InsertInstructionBefore(load_constant, insn);
+        new (GetGraph()->GetArena()) HX86LoadFromConstantTable(base_, value, materialize);
+    insn->GetBlock()->InsertInstructionBefore(load_constant, insn);
     insn->ReplaceInput(load_constant, input_index);
   }
 
   void HandleInvoke(HInvoke* invoke) {
+    // If this is an invoke-static/-direct with PC-relative dex cache array
+    // addressing, we need the PC-relative address base.
+    HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
+    if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasPcRelativeDexCache()) {
+      InitializePCRelativeBasePointer(invoke);
+      // Add the extra parameter base_.
+      DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
+      invoke_static_or_direct->AddSpecialInput(base_);
+    }
     // Ensure that we can load FP arguments from the constant area.
     for (size_t i = 0, e = invoke->InputCount(); i < e; i++) {
       HConstant* input = invoke->InputAt(i)->AsConstant();
@@ -123,8 +130,8 @@ class ConstantHandlerVisitor : public HGraphVisitor {
   HX86ComputeBaseMethodAddress* base_;
 };
 
-void ConstantAreaFixups::Run() {
-  ConstantHandlerVisitor visitor(graph_);
+void PcRelativeFixups::Run() {
+  PCRelativeHandlerVisitor visitor(graph_);
   visitor.VisitInsertionOrder();
 }
 
diff --git a/compiler/optimizing/constant_area_fixups_x86.h b/compiler/optimizing/pc_relative_fixups_x86.h
index 4138039cdd..af708acb03 100644
--- a/compiler/optimizing/constant_area_fixups_x86.h
+++ b/compiler/optimizing/pc_relative_fixups_x86.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef ART_COMPILER_OPTIMIZING_CONSTANT_AREA_FIXUPS_X86_H_
-#define ART_COMPILER_OPTIMIZING_CONSTANT_AREA_FIXUPS_X86_H_
+#ifndef ART_COMPILER_OPTIMIZING_PC_RELATIVE_FIXUPS_X86_H_
+#define ART_COMPILER_OPTIMIZING_PC_RELATIVE_FIXUPS_X86_H_
 
 #include "nodes.h"
 #include "optimization.h"
@@ -23,10 +23,10 @@
 namespace art {
 namespace x86 {
 
-class ConstantAreaFixups : public HOptimization {
+class PcRelativeFixups : public HOptimization {
  public:
-  ConstantAreaFixups(HGraph* graph, OptimizingCompilerStats* stats)
-      : HOptimization(graph, "constant_area_fixups_x86", stats) {}
+  PcRelativeFixups(HGraph* graph, OptimizingCompilerStats* stats)
+      : HOptimization(graph, "pc_relative_fixups_x86", stats) {}
 
   void Run() OVERRIDE;
 };
@@ -34,4 +34,4 @@ class ConstantAreaFixups : public HOptimization {
 }  // namespace x86
 }  // namespace art
 
-#endif  // ART_COMPILER_OPTIMIZING_CONSTANT_AREA_FIXUPS_X86_H_
+#endif  // ART_COMPILER_OPTIMIZING_PC_RELATIVE_FIXUPS_X86_H_
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index ca928ae0f2..f3d075caaa 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -48,12 +48,46 @@ void PrepareForRegisterAllocation::VisitBoundType(HBoundType* bound_type) {
 }
 
 void PrepareForRegisterAllocation::VisitClinitCheck(HClinitCheck* check) {
-  HLoadClass* cls = check->GetLoadClass();
-  check->ReplaceWith(cls);
-  if (check->GetPrevious() == cls) {
+  // Try to find a static invoke from which this check originated.
+  HInvokeStaticOrDirect* invoke = nullptr;
+  for (HUseIterator<HInstruction*> it(check->GetUses()); !it.Done(); it.Advance()) {
+    HInstruction* user = it.Current()->GetUser();
+    if (user->IsInvokeStaticOrDirect() && CanMoveClinitCheck(check, user)) {
+      invoke = user->AsInvokeStaticOrDirect();
+      DCHECK(invoke->IsStaticWithExplicitClinitCheck());
+      invoke->RemoveExplicitClinitCheck(HInvokeStaticOrDirect::ClinitCheckRequirement::kImplicit);
+      break;
+    }
+  }
+  // If we found a static invoke for merging, remove the check from all other static invokes.
+  if (invoke != nullptr) {
+    for (HUseIterator<HInstruction*> it(check->GetUses()); !it.Done(); ) {
+      HInstruction* user = it.Current()->GetUser();
+      DCHECK(invoke->StrictlyDominates(user));  // All other uses must be dominated.
+      it.Advance();  // Advance before we remove the node, reference to the next node is preserved.
+      if (user->IsInvokeStaticOrDirect()) {
+        user->AsInvokeStaticOrDirect()->RemoveExplicitClinitCheck(
+            HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+      }
+    }
+  }
+
+  HLoadClass* load_class = check->GetLoadClass();
+  bool can_merge_with_load_class = CanMoveClinitCheck(load_class, check);
+
+  check->ReplaceWith(load_class);
+
+  if (invoke != nullptr) {
+    // Remove the check from the graph. It has been merged into the invoke.
+    check->GetBlock()->RemoveInstruction(check);
+    // Check if we can merge the load class as well.
+    if (can_merge_with_load_class && !load_class->HasUses()) {
+      load_class->GetBlock()->RemoveInstruction(load_class);
+    }
+  } else if (can_merge_with_load_class) {
     // Pass the initialization duty to the `HLoadClass` instruction,
     // and remove the instruction from the graph.
-    cls->SetMustGenerateClinitCheck(true);
+    load_class->SetMustGenerateClinitCheck(true);
     check->GetBlock()->RemoveInstruction(check);
   }
 }
@@ -86,30 +120,60 @@ void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDire
     DCHECK(last_input != nullptr)
         << "Last input is not HLoadClass. It is " << last_input->DebugName();
 
-    // Remove a load class instruction as last input of a static
-    // invoke, which has been added (along with a clinit check,
-    // removed by PrepareForRegisterAllocation::VisitClinitCheck
-    // previously) by the graph builder during the creation of the
-    // static invoke instruction, but is no longer required at this
-    // stage (i.e., after inlining has been performed).
-    invoke->RemoveLoadClassAsLastInput();
-
-    // The static call will initialize the class so there's no need for a clinit check if
-    // it's the first user.
-    // There is one special case where we still need the clinit check, when inlining. Because
-    // currently the callee is responsible for reporting parameters to the GC, the code
-    // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC.
-    // Therefore we cannot allocate any object in that code, including loading a new class.
-    if (last_input == invoke->GetPrevious() && !invoke->IsFromInlinedInvoke()) {
-      last_input->SetMustGenerateClinitCheck(false);
-
-      // If the load class instruction is no longer used, remove it from
-      // the graph.
-      if (!last_input->HasUses()) {
-        last_input->GetBlock()->RemoveInstruction(last_input);
-      }
+    // Detach the explicit class initialization check from the invoke.
+    // Keeping track of the initializing instruction is no longer required
+    // at this stage (i.e., after inlining has been performed).
+    invoke->RemoveExplicitClinitCheck(HInvokeStaticOrDirect::ClinitCheckRequirement::kNone);
+
+    // Merging with load class should have happened in VisitClinitCheck().
+    DCHECK(!CanMoveClinitCheck(last_input, invoke));
+  }
+}
+
+bool PrepareForRegisterAllocation::CanMoveClinitCheck(HInstruction* input, HInstruction* user) {
+  // Determine if input and user come from the same dex instruction, so that we can move
+  // the clinit check responsibility from one to the other, i.e. from HClinitCheck (user)
+  // to HLoadClass (input), or from HClinitCheck (input) to HInvokeStaticOrDirect (user).
+
+  // Start with a quick dex pc check.
+  if (user->GetDexPc() != input->GetDexPc()) {
+    return false;
+  }
+
+  // Now do a thorough environment check that this is really coming from the same instruction in
+  // the same inlined graph. Unfortunately, we have to go through the whole environment chain.
+  HEnvironment* user_environment = user->GetEnvironment();
+  HEnvironment* input_environment = input->GetEnvironment();
+  while (user_environment != nullptr || input_environment != nullptr) {
+    if (user_environment == nullptr || input_environment == nullptr) {
+      // Different environment chain length. This happens when a method is called
+      // once directly and once indirectly through another inlined method.
+      return false;
+    }
+    if (user_environment->GetDexPc() != input_environment->GetDexPc() ||
+        user_environment->GetMethodIdx() != input_environment->GetMethodIdx() ||
+        !IsSameDexFile(user_environment->GetDexFile(), input_environment->GetDexFile())) {
+      return false;
+    }
+    user_environment = user_environment->GetParent();
+    input_environment = input_environment->GetParent();
+  }
+
+  // Check for code motion taking the input to a different block.
+  if (user->GetBlock() != input->GetBlock()) {
+    return false;
+  }
+
+  // In debug mode, check that we have not inserted a throwing instruction
+  // or an instruction with side effects between input and user.
+  if (kIsDebugBuild) {
+    for (HInstruction* between = input->GetNext(); between != user; between = between->GetNext()) {
+      CHECK(between != nullptr);  // User must be after input in the same block.
+      CHECK(!between->CanThrow());
+      CHECK(!between->HasSideEffects());
     }
   }
+  return true;
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
index d7f277fa0d..a70fb309df 100644
--- a/compiler/optimizing/prepare_for_register_allocation.h
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -41,6 +41,8 @@ class PrepareForRegisterAllocation : public HGraphDelegateVisitor {
   void VisitCondition(HCondition* condition) OVERRIDE;
   void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE;
 
+  bool CanMoveClinitCheck(HInstruction* input, HInstruction* user);
+
   DISALLOW_COPY_AND_ASSIGN(PrepareForRegisterAllocation);
 };
 
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 659da068a9..0d05c49fc5 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -99,17 +99,9 @@ ReferenceTypePropagation::ReferenceTypePropagation(HGraph* graph,
   }
 }
 
-void ReferenceTypePropagation::Run() {
-  // To properly propagate type info we need to visit in the dominator-based order.
-  // Reverse post order guarantees a node's dominators are visited first.
-  // We take advantage of this order in `VisitBasicBlock`.
-  for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
-    VisitBasicBlock(it.Current());
-  }
-  ProcessWorklist();
-
+void ReferenceTypePropagation::ValidateTypes() {
+  // TODO: move this to the graph checker.
   if (kIsDebugBuild) {
-    // TODO: move this to the graph checker.
     ScopedObjectAccess soa(Thread::Current());
     for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
       HBasicBlock* block = it.Current();
@@ -135,6 +127,18 @@ void ReferenceTypePropagation::Run() {
   }
 }
 
+void ReferenceTypePropagation::Run() {
+  // Walk the blocks in reverse post order: a node's dominators are then visited before the
+  // node itself, which is the ordering `VisitBasicBlock` takes advantage of to propagate type
+  // info. Afterwards the worklist is processed and ValidateTypes() runs its debug-build checks.
+  for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) {
+    VisitBasicBlock(it.Current());
+  }
+
+  ProcessWorklist();
+  ValidateTypes();
+}
+
 void ReferenceTypePropagation::VisitBasicBlock(HBasicBlock* block) {
   RTPVisitor visitor(graph_,
                      handles_,
@@ -610,23 +614,36 @@ ReferenceTypeInfo ReferenceTypePropagation::MergeTypes(const ReferenceTypeInfo&
   }
 
   bool is_exact = a.IsExact() && b.IsExact();
-  Handle<mirror::Class> type_handle;
+  ReferenceTypeInfo::TypeHandle result_type_handle;
+  ReferenceTypeInfo::TypeHandle a_type_handle = a.GetTypeHandle();
+  ReferenceTypeInfo::TypeHandle b_type_handle = b.GetTypeHandle();
+  bool a_is_interface = a_type_handle->IsInterface();
+  bool b_is_interface = b_type_handle->IsInterface();
 
   if (a.GetTypeHandle().Get() == b.GetTypeHandle().Get()) {
-    type_handle = a.GetTypeHandle();
+    result_type_handle = a_type_handle;
   } else if (a.IsSupertypeOf(b)) {
-    type_handle = a.GetTypeHandle();
+    result_type_handle = a_type_handle;
     is_exact = false;
   } else if (b.IsSupertypeOf(a)) {
-    type_handle = b.GetTypeHandle();
+    result_type_handle = b_type_handle;
+    is_exact = false;
+  } else if (!a_is_interface && !b_is_interface) {
+    result_type_handle = handles_->NewHandle(a_type_handle->GetCommonSuperClass(b_type_handle));
     is_exact = false;
   } else {
-    // TODO: Find the first common super class.
-    type_handle = object_class_handle_;
+    // This can happen if:
+    //    - both types are interfaces. TODO(calin): implement
+    //    - one is an interface, the other a class, and the type does not implement the interface
+    //      e.g:
+    //        void foo(Interface i, boolean cond) {
+    //          Object o = cond ? i : new Object();
+    //        }
+    result_type_handle = object_class_handle_;
     is_exact = false;
   }
 
-  return ReferenceTypeInfo::Create(type_handle, is_exact);
+  return ReferenceTypeInfo::Create(result_type_handle, is_exact);
 }
 
 static void UpdateArrayGet(HArrayGet* instr,
@@ -715,14 +732,35 @@ void ReferenceTypePropagation::UpdateBoundType(HBoundType* instr) {
   instr->SetReferenceTypeInfo(new_rti);
 }
 
+// NullConstant inputs are ignored during merging as they do not provide any useful information.
+// If all the inputs are NullConstants then the type of the phi will be set to Object.
 void ReferenceTypePropagation::UpdatePhi(HPhi* instr) {
-  ReferenceTypeInfo new_rti = instr->InputAt(0)->GetReferenceTypeInfo();
+  size_t input_count = instr->InputCount();
+  size_t first_input_index_not_null = 0;
+  while (first_input_index_not_null < input_count &&
+      instr->InputAt(first_input_index_not_null)->IsNullConstant()) {
+    first_input_index_not_null++;
+  }
+  if (first_input_index_not_null == input_count) {
+    // All inputs are NullConstants, set the type to object.
+    // This may happen in the presence of inlining.
+    instr->SetReferenceTypeInfo(
+        ReferenceTypeInfo::Create(object_class_handle_, /* is_exact */ false));
+    return;
+  }
+
+  ReferenceTypeInfo new_rti = instr->InputAt(first_input_index_not_null)->GetReferenceTypeInfo();
+
   if (new_rti.IsValid() && new_rti.IsObjectClass() && !new_rti.IsExact()) {
     // Early return if we are Object and inexact.
     instr->SetReferenceTypeInfo(new_rti);
     return;
   }
-  for (size_t i = 1; i < instr->InputCount(); i++) {
+
+  for (size_t i = first_input_index_not_null + 1; i < input_count; i++) {
+    if (instr->InputAt(i)->IsNullConstant()) {
+      continue;
+    }
     new_rti = MergeTypes(new_rti, instr->InputAt(i)->GetReferenceTypeInfo());
     if (new_rti.IsValid() && new_rti.IsObjectClass()) {
       if (!new_rti.IsExact()) {
diff --git a/compiler/optimizing/reference_type_propagation.h b/compiler/optimizing/reference_type_propagation.h
index 5493601adc..5c05592726 100644
--- a/compiler/optimizing/reference_type_propagation.h
+++ b/compiler/optimizing/reference_type_propagation.h
@@ -56,6 +56,8 @@ class ReferenceTypePropagation : public HOptimization {
   ReferenceTypeInfo MergeTypes(const ReferenceTypeInfo& a, const ReferenceTypeInfo& b)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  void ValidateTypes();
+
   StackHandleScopeCollection* handles_;
 
   ArenaVector<HInstruction*> worklist_;
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index ef22c816a0..d399bc2d7a 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -1525,7 +1525,7 @@ void RegisterAllocator::InsertParallelMoveAtExitOf(HBasicBlock* block,
   DCHECK(IsValidDestination(destination)) << destination;
   if (source.Equals(destination)) return;
 
-  DCHECK_EQ(block->NumberOfNormalSuccessors(), 1u);
+  DCHECK_EQ(block->GetNormalSuccessors().size(), 1u);
   HInstruction* last = block->GetLastInstruction();
   // We insert moves at exit for phi predecessors and connecting blocks.
   // A block ending with an if or a packed switch cannot branch to a block
@@ -1752,7 +1752,7 @@ void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval,
 
   // If `from` has only one successor, we can put the moves at the exit of it. Otherwise
   // we need to put the moves at the entry of `to`.
-  if (from->NumberOfNormalSuccessors() == 1) {
+  if (from->GetNormalSuccessors().size() == 1) {
     InsertParallelMoveAtExitOf(from,
                                interval->GetParent()->GetDefinedBy(),
                                source->ToLocation(),
@@ -1894,7 +1894,7 @@ void RegisterAllocator::Resolve() {
         HInstruction* phi = inst_it.Current();
         for (size_t i = 0, e = current->GetPredecessors().size(); i < e; ++i) {
           HBasicBlock* predecessor = current->GetPredecessors()[i];
-          DCHECK_EQ(predecessor->NumberOfNormalSuccessors(), 1u);
+          DCHECK_EQ(predecessor->GetNormalSuccessors().size(), 1u);
           HInstruction* input = phi->InputAt(i);
           Location source = input->GetLiveInterval()->GetLocationAt(
               predecessor->GetLifetimeEnd() - 1);
diff --git a/compiler/optimizing/side_effects_test.cc b/compiler/optimizing/side_effects_test.cc
index ec45d6b2ca..9bbc354290 100644
--- a/compiler/optimizing/side_effects_test.cc
+++ b/compiler/optimizing/side_effects_test.cc
@@ -129,13 +129,13 @@ TEST(SideEffectsTest, NoDependences) {
 
 TEST(SideEffectsTest, VolatileDependences) {
   SideEffects volatile_write =
-      SideEffects::FieldWriteOfType(Primitive::kPrimInt, true);
+      SideEffects::FieldWriteOfType(Primitive::kPrimInt, /* is_volatile */ true);
   SideEffects any_write =
-      SideEffects::FieldWriteOfType(Primitive::kPrimInt, false);
+      SideEffects::FieldWriteOfType(Primitive::kPrimInt, /* is_volatile */ false);
   SideEffects volatile_read =
-      SideEffects::FieldReadOfType(Primitive::kPrimByte, true);
+      SideEffects::FieldReadOfType(Primitive::kPrimByte, /* is_volatile */ true);
   SideEffects any_read =
-      SideEffects::FieldReadOfType(Primitive::kPrimByte, false);
+      SideEffects::FieldReadOfType(Primitive::kPrimByte, /* is_volatile */ false);
 
   EXPECT_FALSE(volatile_write.MayDependOn(any_read));
   EXPECT_TRUE(any_read.MayDependOn(volatile_write));
@@ -151,15 +151,15 @@ TEST(SideEffectsTest, VolatileDependences) {
 TEST(SideEffectsTest, SameWidthTypes) {
   // Type I/F.
   testWriteAndReadDependence(
-      SideEffects::FieldWriteOfType(Primitive::kPrimInt, false),
-      SideEffects::FieldReadOfType(Primitive::kPrimFloat, false));
+      SideEffects::FieldWriteOfType(Primitive::kPrimInt, /* is_volatile */ false),
+      SideEffects::FieldReadOfType(Primitive::kPrimFloat, /* is_volatile */ false));
   testWriteAndReadDependence(
       SideEffects::ArrayWriteOfType(Primitive::kPrimInt),
       SideEffects::ArrayReadOfType(Primitive::kPrimFloat));
   // Type L/D.
   testWriteAndReadDependence(
-      SideEffects::FieldWriteOfType(Primitive::kPrimLong, false),
-      SideEffects::FieldReadOfType(Primitive::kPrimDouble, false));
+      SideEffects::FieldWriteOfType(Primitive::kPrimLong, /* is_volatile */ false),
+      SideEffects::FieldReadOfType(Primitive::kPrimDouble, /* is_volatile */ false));
   testWriteAndReadDependence(
       SideEffects::ArrayWriteOfType(Primitive::kPrimLong),
       SideEffects::ArrayReadOfType(Primitive::kPrimDouble));
@@ -171,9 +171,9 @@ TEST(SideEffectsTest, AllWritesAndReads) {
   for (Primitive::Type type = Primitive::kPrimNot;
         type < Primitive::kPrimVoid;
         type = Primitive::Type(type + 1)) {
-    s = s.Union(SideEffects::FieldWriteOfType(type, false));
+    s = s.Union(SideEffects::FieldWriteOfType(type, /* is_volatile */ false));
     s = s.Union(SideEffects::ArrayWriteOfType(type));
-    s = s.Union(SideEffects::FieldReadOfType(type, false));
+    s = s.Union(SideEffects::FieldReadOfType(type, /* is_volatile */ false));
     s = s.Union(SideEffects::ArrayReadOfType(type));
   }
   EXPECT_TRUE(s.DoesAllReadWrite());
@@ -225,10 +225,10 @@ TEST(SideEffectsTest, BitStrings) {
       "||DJ|||||",  // note: DJ alias
       SideEffects::ArrayReadOfType(Primitive::kPrimDouble).ToString().c_str());
   SideEffects s = SideEffects::None();
-  s = s.Union(SideEffects::FieldWriteOfType(Primitive::kPrimChar, false));
-  s = s.Union(SideEffects::FieldWriteOfType(Primitive::kPrimLong, false));
+  s = s.Union(SideEffects::FieldWriteOfType(Primitive::kPrimChar, /* is_volatile */ false));
+  s = s.Union(SideEffects::FieldWriteOfType(Primitive::kPrimLong, /* is_volatile */ false));
   s = s.Union(SideEffects::ArrayWriteOfType(Primitive::kPrimShort));
-  s = s.Union(SideEffects::FieldReadOfType(Primitive::kPrimInt, false));
+  s = s.Union(SideEffects::FieldReadOfType(Primitive::kPrimInt, /* is_volatile */ false));
   s = s.Union(SideEffects::ArrayReadOfType(Primitive::kPrimFloat));
   s = s.Union(SideEffects::ArrayReadOfType(Primitive::kPrimDouble));
   EXPECT_STREQ(
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index 4565590bc3..5190eb3b26 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -660,8 +660,7 @@ void SsaBuilder::VisitInstruction(HInstruction* instruction) {
   if (instruction->CanThrowIntoCatchBlock()) {
     const HTryBoundary& try_entry =
         instruction->GetBlock()->GetTryCatchInformation()->GetTryEntry();
-    for (HExceptionHandlerIterator it(try_entry); !it.Done(); it.Advance()) {
-      HBasicBlock* catch_block = it.Current();
+    for (HBasicBlock* catch_block : try_entry.GetExceptionHandlers()) {
       ArenaVector<HInstruction*>* handler_locals = GetLocalsFor(catch_block);
       DCHECK_EQ(handler_locals->size(), current_locals_->size());
       for (size_t vreg = 0, e = current_locals_->size(); vreg < e; ++vreg) {
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 00e8995bff..ba2525e555 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -117,14 +117,6 @@ void Mips64Assembler::EmitFI(int opcode, int fmt, FpuRegister ft, uint16_t imm)
   Emit(encoding);
 }
 
-void Mips64Assembler::Add(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, rd, 0, 0x20);
-}
-
-void Mips64Assembler::Addi(GpuRegister rt, GpuRegister rs, uint16_t imm16) {
-  EmitI(0x8, rs, rt, imm16);
-}
-
 void Mips64Assembler::Addu(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 0, 0x21);
 }
@@ -141,10 +133,6 @@ void Mips64Assembler::Daddiu(GpuRegister rt, GpuRegister rs, uint16_t imm16) {
   EmitI(0x19, rs, rt, imm16);
 }
 
-void Mips64Assembler::Sub(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, rd, 0, 0x22);
-}
-
 void Mips64Assembler::Subu(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 0, 0x23);
 }
@@ -153,50 +141,14 @@ void Mips64Assembler::Dsubu(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 0, 0x2f);
 }
 
-void Mips64Assembler::MultR2(GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, static_cast<GpuRegister>(0), 0, 0x18);
-}
-
-void Mips64Assembler::MultuR2(GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, static_cast<GpuRegister>(0), 0, 0x19);
-}
-
-void Mips64Assembler::DivR2(GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, static_cast<GpuRegister>(0), 0, 0x1a);
-}
-
-void Mips64Assembler::DivuR2(GpuRegister rs, GpuRegister rt) {
-  EmitR(0, rs, rt, static_cast<GpuRegister>(0), 0, 0x1b);
-}
-
-void Mips64Assembler::MulR2(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  EmitR(0x1c, rs, rt, rd, 0, 2);
-}
-
-void Mips64Assembler::DivR2(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  DivR2(rs, rt);
-  Mflo(rd);
-}
-
-void Mips64Assembler::ModR2(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  DivR2(rs, rt);
-  Mfhi(rd);
-}
-
-void Mips64Assembler::DivuR2(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  DivuR2(rs, rt);
-  Mflo(rd);
-}
-
-void Mips64Assembler::ModuR2(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
-  DivuR2(rs, rt);
-  Mfhi(rd);
-}
-
 void Mips64Assembler::MulR6(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 2, 0x18);
 }
 
+void Mips64Assembler::MuhR6(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
+  EmitR(0, rs, rt, rd, 3, 0x18);  // Same EmitR call as MulR6 except the 5th argument: 3, not 2.
+}
+
 void Mips64Assembler::DivR6(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 2, 0x1a);
 }
@@ -217,6 +169,10 @@ void Mips64Assembler::Dmul(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 2, 0x1c);
 }
 
+void Mips64Assembler::Dmuh(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
+  EmitR(0, rs, rt, rd, 3, 0x1c);  // Same EmitR call as Dmul except the 5th argument: 3, not 2.
+}
+
 void Mips64Assembler::Ddiv(GpuRegister rd, GpuRegister rs, GpuRegister rt) {
   EmitR(0, rs, rt, rd, 2, 0x1e);
 }
@@ -440,14 +396,6 @@ void Mips64Assembler::Sync(uint32_t stype) {
            static_cast<GpuRegister>(0), stype & 0x1f, 0xf);
 }
 
-void Mips64Assembler::Mfhi(GpuRegister rd) {
-  EmitR(0, static_cast<GpuRegister>(0), static_cast<GpuRegister>(0), rd, 0, 0x10);
-}
-
-void Mips64Assembler::Mflo(GpuRegister rd) {
-  EmitR(0, static_cast<GpuRegister>(0), static_cast<GpuRegister>(0), rd, 0, 0x12);
-}
-
 void Mips64Assembler::Sb(GpuRegister rt, GpuRegister rs, uint16_t imm16) {
   EmitI(0x28, rs, rt, imm16);
 }
@@ -892,45 +840,58 @@ void Mips64Assembler::LoadConst64(GpuRegister rd, int64_t value) {
   } else if ((value & 0xFFFF) == 0 && ((value >> 31) & 0x1FFFF) == ((0x20000 - bit31) & 0x1FFFF)) {
     Lui(rd, value >> 16);
     Dati(rd, (value >> 48) + bit31);
+  } else if (IsPowerOfTwo(value + UINT64_C(1))) {
+    int shift_cnt = 64 - CTZ(value + UINT64_C(1));
+    Daddiu(rd, ZERO, -1);
+    if (shift_cnt < 32) {
+      Dsrl(rd, rd, shift_cnt);
+    } else {
+      Dsrl32(rd, rd, shift_cnt & 31);
+    }
   } else {
     int shift_cnt = CTZ(value);
     int64_t tmp = value >> shift_cnt;
     if (IsUint<16>(tmp)) {
       Ori(rd, ZERO, tmp);
-      if (shift_cnt < 32)
+      if (shift_cnt < 32) {
         Dsll(rd, rd, shift_cnt);
-      else
+      } else {
         Dsll32(rd, rd, shift_cnt & 31);
+      }
     } else if (IsInt<16>(tmp)) {
       Daddiu(rd, ZERO, tmp);
-      if (shift_cnt < 32)
+      if (shift_cnt < 32) {
         Dsll(rd, rd, shift_cnt);
-      else
+      } else {
         Dsll32(rd, rd, shift_cnt & 31);
+      }
     } else if (IsInt<32>(tmp)) {
       // Loads with 3 instructions.
       Lui(rd, tmp >> 16);
       Ori(rd, rd, tmp);
-      if (shift_cnt < 32)
+      if (shift_cnt < 32) {
         Dsll(rd, rd, shift_cnt);
-      else
+      } else {
         Dsll32(rd, rd, shift_cnt & 31);
+      }
     } else {
       shift_cnt = 16 + CTZ(value >> 16);
       tmp = value >> shift_cnt;
       if (IsUint<16>(tmp)) {
         Ori(rd, ZERO, tmp);
-        if (shift_cnt < 32)
+        if (shift_cnt < 32) {
           Dsll(rd, rd, shift_cnt);
-        else
+        } else {
           Dsll32(rd, rd, shift_cnt & 31);
+        }
         Ori(rd, rd, value);
       } else if (IsInt<16>(tmp)) {
         Daddiu(rd, ZERO, tmp);
-        if (shift_cnt < 32)
+        if (shift_cnt < 32) {
           Dsll(rd, rd, shift_cnt);
-        else
+        } else {
           Dsll32(rd, rd, shift_cnt & 31);
+        }
         Ori(rd, rd, value);
       } else {
         // Loads with 3-4 instructions.
@@ -941,10 +902,11 @@ void Mips64Assembler::LoadConst64(GpuRegister rd, int64_t value) {
           used_lui = true;
         }
         if ((tmp2 & 0xFFFF) != 0) {
-          if (used_lui)
+          if (used_lui) {
             Ori(rd, rd, tmp2);
-          else
+          } else {
             Ori(rd, ZERO, tmp2);
+          }
         }
         if (bit31) {
           tmp2 += UINT64_C(0x100000000);
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 33f22d2c2d..42962bca20 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -66,35 +66,25 @@ class Mips64Assembler FINAL : public Assembler {
   virtual ~Mips64Assembler() {}
 
   // Emit Machine Instructions.
-  void Add(GpuRegister rd, GpuRegister rs, GpuRegister rt);
-  void Addi(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Addu(GpuRegister rd, GpuRegister rs, GpuRegister rt);
   void Addiu(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Daddu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
   void Daddiu(GpuRegister rt, GpuRegister rs, uint16_t imm16);  // MIPS64
-  void Sub(GpuRegister rd, GpuRegister rs, GpuRegister rt);
   void Subu(GpuRegister rd, GpuRegister rs, GpuRegister rt);
   void Dsubu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
 
-  void MultR2(GpuRegister rs, GpuRegister rt);  // R2
-  void MultuR2(GpuRegister rs, GpuRegister rt);  // R2
-  void DivR2(GpuRegister rs, GpuRegister rt);  // R2
-  void DivuR2(GpuRegister rs, GpuRegister rt);  // R2
-  void MulR2(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R2
-  void DivR2(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R2
-  void ModR2(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R2
-  void DivuR2(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R2
-  void ModuR2(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R2
-  void MulR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R6
-  void DivR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R6
-  void ModR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R6
-  void DivuR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R6
-  void ModuR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // R6
-  void Dmul(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64 R6
-  void Ddiv(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64 R6
-  void Dmod(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64 R6
-  void Ddivu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64 R6
-  void Dmodu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64 R6
+  void MulR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void MuhR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void DivR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void ModR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void DivuR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void ModuR6(GpuRegister rd, GpuRegister rs, GpuRegister rt);
+  void Dmul(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
+  void Dmuh(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
+  void Ddiv(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
+  void Dmod(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
+  void Ddivu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
+  void Dmodu(GpuRegister rd, GpuRegister rs, GpuRegister rt);  // MIPS64
 
   void And(GpuRegister rd, GpuRegister rs, GpuRegister rt);
   void Andi(GpuRegister rt, GpuRegister rs, uint16_t imm16);
@@ -104,12 +94,12 @@ class Mips64Assembler FINAL : public Assembler {
   void Xori(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Nor(GpuRegister rd, GpuRegister rs, GpuRegister rt);
 
-  void Bitswap(GpuRegister rd, GpuRegister rt);  // R6
-  void Dbitswap(GpuRegister rd, GpuRegister rt);  // R6
-  void Seb(GpuRegister rd, GpuRegister rt);  // R2+
-  void Seh(GpuRegister rd, GpuRegister rt);  // R2+
-  void Dsbh(GpuRegister rd, GpuRegister rt);  // R2+
-  void Dshd(GpuRegister rd, GpuRegister rt);  // R2+
+  void Bitswap(GpuRegister rd, GpuRegister rt);
+  void Dbitswap(GpuRegister rd, GpuRegister rt);
+  void Seb(GpuRegister rd, GpuRegister rt);
+  void Seh(GpuRegister rd, GpuRegister rt);
+  void Dsbh(GpuRegister rd, GpuRegister rt);
+  void Dshd(GpuRegister rd, GpuRegister rt);
   void Dext(GpuRegister rs, GpuRegister rt, int pos, int size_less_one);  // MIPS64
   void Wsbh(GpuRegister rd, GpuRegister rt);
   void Sc(GpuRegister rt, GpuRegister base, int16_t imm9 = 0);
@@ -146,11 +136,9 @@ class Mips64Assembler FINAL : public Assembler {
   void Lhu(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Lwu(GpuRegister rt, GpuRegister rs, uint16_t imm16);  // MIPS64
   void Lui(GpuRegister rt, uint16_t imm16);
-  void Dahi(GpuRegister rs, uint16_t imm16);  // MIPS64 R6
-  void Dati(GpuRegister rs, uint16_t imm16);  // MIPS64 R6
+  void Dahi(GpuRegister rs, uint16_t imm16);  // MIPS64
+  void Dati(GpuRegister rs, uint16_t imm16);  // MIPS64
   void Sync(uint32_t stype);
-  void Mfhi(GpuRegister rd);  // R2
-  void Mflo(GpuRegister rd);  // R2
 
   void Sb(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Sh(GpuRegister rt, GpuRegister rs, uint16_t imm16);
@@ -175,21 +163,21 @@ class Mips64Assembler FINAL : public Assembler {
   void Jalr(GpuRegister rd, GpuRegister rs);
   void Jalr(GpuRegister rs);
   void Jr(GpuRegister rs);
-  void Auipc(GpuRegister rs, uint16_t imm16);  // R6
-  void Jic(GpuRegister rt, uint16_t imm16);  // R6
-  void Jialc(GpuRegister rt, uint16_t imm16);  // R6
-  void Bltc(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Bltzc(GpuRegister rt, uint16_t imm16);  // R6
-  void Bgtzc(GpuRegister rt, uint16_t imm16);  // R6
-  void Bgec(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Bgezc(GpuRegister rt, uint16_t imm16);  // R6
-  void Blezc(GpuRegister rt, uint16_t imm16);  // R6
-  void Bltuc(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Bgeuc(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Beqc(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Bnec(GpuRegister rs, GpuRegister rt, uint16_t imm16);  // R6
-  void Beqzc(GpuRegister rs, uint32_t imm21);  // R6
-  void Bnezc(GpuRegister rs, uint32_t imm21);  // R6
+  void Auipc(GpuRegister rs, uint16_t imm16);
+  void Jic(GpuRegister rt, uint16_t imm16);
+  void Jialc(GpuRegister rt, uint16_t imm16);
+  void Bltc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Bltzc(GpuRegister rt, uint16_t imm16);
+  void Bgtzc(GpuRegister rt, uint16_t imm16);
+  void Bgec(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Bgezc(GpuRegister rt, uint16_t imm16);
+  void Blezc(GpuRegister rt, uint16_t imm16);
+  void Bltuc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Bgeuc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Beqc(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Bnec(GpuRegister rs, GpuRegister rt, uint16_t imm16);
+  void Beqzc(GpuRegister rs, uint32_t imm21);
+  void Bnezc(GpuRegister rs, uint32_t imm21);
 
   void AddS(FpuRegister fd, FpuRegister fs, FpuRegister ft);
   void SubS(FpuRegister fd, FpuRegister fs, FpuRegister ft);
@@ -259,25 +247,25 @@ class Mips64Assembler FINAL : public Assembler {
   void Addiu32(GpuRegister rt, GpuRegister rs, int32_t value, GpuRegister rtmp = AT);
   void Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp = AT);  // MIPS64
 
-  void Bind(Label* label) OVERRIDE;  // R6
+  void Bind(Label* label) OVERRIDE;
   void Jump(Label* label) OVERRIDE {
     B(label);
   }
-  void B(Label* label);  // R6
-  void Jalr(Label* label, GpuRegister indirect_reg = RA);  // R6
+  void B(Label* label);
+  void Jalr(Label* label, GpuRegister indirect_reg = RA);
   // TODO: implement common for R6 and non-R6 interface for conditional branches?
-  void Bltc(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Bltzc(GpuRegister rt, Label* label);  // R6
-  void Bgtzc(GpuRegister rt, Label* label);  // R6
-  void Bgec(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Bgezc(GpuRegister rt, Label* label);  // R6
-  void Blezc(GpuRegister rt, Label* label);  // R6
-  void Bltuc(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Bgeuc(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Beqc(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Bnec(GpuRegister rs, GpuRegister rt, Label* label);  // R6
-  void Beqzc(GpuRegister rs, Label* label);  // R6
-  void Bnezc(GpuRegister rs, Label* label);  // R6
+  void Bltc(GpuRegister rs, GpuRegister rt, Label* label);
+  void Bltzc(GpuRegister rt, Label* label);
+  void Bgtzc(GpuRegister rt, Label* label);
+  void Bgec(GpuRegister rs, GpuRegister rt, Label* label);
+  void Bgezc(GpuRegister rt, Label* label);
+  void Blezc(GpuRegister rt, Label* label);
+  void Bltuc(GpuRegister rs, GpuRegister rt, Label* label);
+  void Bgeuc(GpuRegister rs, GpuRegister rt, Label* label);
+  void Beqc(GpuRegister rs, GpuRegister rt, Label* label);
+  void Bnec(GpuRegister rs, GpuRegister rt, Label* label);
+  void Beqzc(GpuRegister rs, Label* label);
+  void Bnezc(GpuRegister rs, Label* label);
 
   void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size);
   void LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base, int32_t offset);