Diffstat (limited to 'compiler'): 55 files changed, 3446 insertions, 1362 deletions
diff --git a/compiler/dex/gvn_dead_code_elimination.cc b/compiler/dex/gvn_dead_code_elimination.cc index 6d8a7dab2b..b1f5d870d4 100644 --- a/compiler/dex/gvn_dead_code_elimination.cc +++ b/compiler/dex/gvn_dead_code_elimination.cc @@ -1003,7 +1003,6 @@ bool GvnDeadCodeElimination::BackwardPassTryToKillLastMIR() { vreg_chains_.GetMIRData(kill_heads_[v_reg])->PrevChange(v_reg)); } } - unused_vregs_->Union(vregs_to_kill_); for (auto it = changes_to_kill_.rbegin(), end = changes_to_kill_.rend(); it != end; ++it) { MIRData* data = vreg_chains_.GetMIRData(*it); DCHECK(!data->must_keep); @@ -1012,6 +1011,10 @@ bool GvnDeadCodeElimination::BackwardPassTryToKillLastMIR() { KillMIR(data); } + // Each dependent register not in vregs_to_kill_ is either already marked unused or + // it's one word of a wide register where the other word has been overwritten. + unused_vregs_->UnionIfNotIn(dependent_vregs_, vregs_to_kill_); + vreg_chains_.RemoveTrailingNops(); return true; } diff --git a/compiler/dex/gvn_dead_code_elimination_test.cc b/compiler/dex/gvn_dead_code_elimination_test.cc index de591d0edb..461c844a60 100644 --- a/compiler/dex/gvn_dead_code_elimination_test.cc +++ b/compiler/dex/gvn_dead_code_elimination_test.cc @@ -137,6 +137,8 @@ class GvnDeadCodeEliminationTest : public testing::Test { { bb, opcode, 0u, 0u, 1, { src1 }, 1, { result } } #define DEF_BINOP(bb, opcode, result, src1, src2) \ { bb, opcode, 0u, 0u, 2, { src1, src2 }, 1, { result } } +#define DEF_BINOP_WIDE(bb, opcode, result, src1, src2) \ + { bb, opcode, 0u, 0u, 4, { src1, src1 + 1, src2, src2 + 1 }, 2, { result, result + 1 } } void DoPrepareIFields(const IFieldDef* defs, size_t count) { cu_.mir_graph->ifield_lowering_infos_.clear(); @@ -1936,7 +1938,7 @@ TEST_F(GvnDeadCodeEliminationTestSimple, MixedOverlaps1) { DEF_CONST(3, Instruction::CONST, 0u, 1000u), DEF_MOVE(3, Instruction::MOVE, 1u, 0u), DEF_CONST(3, Instruction::CONST, 2u, 2000u), - { 3, Instruction::INT_TO_LONG, 0, 0u, 1, { 2u }, 2, { 3u, 4u} }, + { 3, Instruction::INT_TO_LONG, 0, 0u, 1, { 2u }, 2, { 3u, 4u } }, DEF_MOVE_WIDE(3, Instruction::MOVE_WIDE, 5u, 3u), DEF_CONST(3, Instruction::CONST, 7u, 3000u), DEF_CONST(3, Instruction::CONST, 8u, 4000u), @@ -1983,4 +1985,85 @@ TEST_F(GvnDeadCodeEliminationTestSimple, MixedOverlaps1) { EXPECT_EQ(0u, int_to_long->dalvikInsn.vB); } +TEST_F(GvnDeadCodeEliminationTestSimple, UnusedRegs1) { + static const MIRDef mirs[] = { + DEF_CONST(3, Instruction::CONST, 0u, 1000u), + DEF_CONST(3, Instruction::CONST, 1u, 2000u), + DEF_BINOP(3, Instruction::ADD_INT, 2u, 1u, 0u), + DEF_CONST(3, Instruction::CONST, 3u, 1000u), // NOT killed (b/21702651). 
+ DEF_BINOP(3, Instruction::ADD_INT, 4u, 1u, 3u), // Killed (RecordPass) + DEF_CONST(3, Instruction::CONST, 5u, 2000u), // Killed with 9u (BackwardPass) + DEF_BINOP(3, Instruction::ADD_INT, 6u, 5u, 0u), // Killed (RecordPass) + DEF_CONST(3, Instruction::CONST, 7u, 4000u), + DEF_MOVE(3, Instruction::MOVE, 8u, 0u), // Killed with 6u (BackwardPass) + }; + + static const int32_t sreg_to_vreg_map[] = { 1, 2, 3, 0, 3, 0, 3, 4, 0 }; + PrepareSRegToVRegMap(sreg_to_vreg_map); + + PrepareMIRs(mirs); + PerformGVN_DCE(); + + ASSERT_EQ(arraysize(mirs), value_names_.size()); + static const size_t diff_indexes[] = { 0, 1, 2, 7 }; + ExpectValueNamesNE(diff_indexes); + EXPECT_EQ(value_names_[0], value_names_[3]); + EXPECT_EQ(value_names_[2], value_names_[4]); + EXPECT_EQ(value_names_[1], value_names_[5]); + EXPECT_EQ(value_names_[2], value_names_[6]); + EXPECT_EQ(value_names_[0], value_names_[8]); + + static const bool eliminated[] = { + false, false, false, false, true, true, true, false, true, + }; + static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch"); + for (size_t i = 0; i != arraysize(eliminated); ++i) { + bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop); + EXPECT_EQ(eliminated[i], actually_eliminated) << i; + } +} + +TEST_F(GvnDeadCodeEliminationTestSimple, UnusedRegs2) { + static const MIRDef mirs[] = { + DEF_CONST(3, Instruction::CONST, 0u, 1000u), + DEF_CONST(3, Instruction::CONST, 1u, 2000u), + DEF_BINOP(3, Instruction::ADD_INT, 2u, 1u, 0u), + DEF_CONST(3, Instruction::CONST, 3u, 1000u), // Killed (BackwardPass; b/21702651) + DEF_BINOP(3, Instruction::ADD_INT, 4u, 1u, 3u), // Killed (RecordPass) + DEF_CONST_WIDE(3, Instruction::CONST_WIDE, 5u, 4000u), + { 3, Instruction::LONG_TO_INT, 0, 0u, 2, { 5u, 6u }, 1, { 7u } }, + DEF_BINOP(3, Instruction::ADD_INT, 8u, 7u, 0u), + DEF_CONST_WIDE(3, Instruction::CONST_WIDE, 9u, 4000u), // Killed with 12u (BackwardPass) + DEF_CONST(3, Instruction::CONST, 11u, 6000u), + { 3, Instruction::LONG_TO_INT, 0, 0u, 2, { 9u, 10u }, 1, { 12u } }, // Killed with 9u (BP) + }; + + static const int32_t sreg_to_vreg_map[] = { + 2, 3, 4, 1, 4, 5, 6 /* high word */, 0, 7, 0, 1 /* high word */, 8, 0 + }; + PrepareSRegToVRegMap(sreg_to_vreg_map); + + PrepareMIRs(mirs); + static const int32_t wide_sregs[] = { 5, 9 }; + MarkAsWideSRegs(wide_sregs); + PerformGVN_DCE(); + + ASSERT_EQ(arraysize(mirs), value_names_.size()); + static const size_t diff_indexes[] = { 0, 1, 2, 5, 6, 7, 9 }; + ExpectValueNamesNE(diff_indexes); + EXPECT_EQ(value_names_[0], value_names_[3]); + EXPECT_EQ(value_names_[2], value_names_[4]); + EXPECT_EQ(value_names_[5], value_names_[8]); + EXPECT_EQ(value_names_[6], value_names_[10]); + + static const bool eliminated[] = { + false, false, false, true, true, false, false, false, true, false, true, + }; + static_assert(arraysize(eliminated) == arraysize(mirs), "array size mismatch"); + for (size_t i = 0; i != arraysize(eliminated); ++i) { + bool actually_eliminated = (static_cast<int>(mirs_[i].dalvikInsn.opcode) == kMirOpNop); + EXPECT_EQ(eliminated[i], actually_eliminated) << i; + } +} + } // namespace art diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index cc1ba35b96..38342420ac 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -398,12 +398,13 @@ bool MIRGraph::IsBadMonitorExitCatch(NarrowDexOffset monitor_exit_offset, DCHECK(monitor_exit->Opcode() == Instruction::MONITOR_EXIT); int monitor_reg = monitor_exit->VRegA_11x(); const Instruction* check_insn = 
Instruction::At(current_code_item_->insns_ + catch_offset); - DCHECK(check_insn->Opcode() == Instruction::MOVE_EXCEPTION); - if (check_insn->VRegA_11x() == monitor_reg) { - // Unexpected move-exception to the same register. Probably not the pattern we're looking for. - return false; + if (check_insn->Opcode() == Instruction::MOVE_EXCEPTION) { + if (check_insn->VRegA_11x() == monitor_reg) { + // Unexpected move-exception to the same register. Probably not the pattern we're looking for. + return false; + } + check_insn = check_insn->Next(); } - check_insn = check_insn->Next(); while (true) { int dest = -1; bool wide = false; diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h index b25e967609..e0c56fcc82 100644 --- a/compiler/driver/compiler_driver-inl.h +++ b/compiler/driver/compiler_driver-inl.h @@ -233,11 +233,32 @@ inline bool CompilerDriver::IsStaticFieldInReferrerClass(mirror::Class* referrer return referrer_class == fields_class; } +inline bool CompilerDriver::CanAssumeClassIsInitialized(mirror::Class* klass) { + // Being loaded is a pre-requisite for being initialized but let's do the cheap check first. + // + // NOTE: When AOT compiling an app, we eagerly initialize app classes (and potentially their + // super classes in the boot image) but only those that have a trivial initialization, i.e. + // without <clinit>() or static values in the dex file for that class or any of its super + // classes. So while we could see the klass as initialized during AOT compilation and have + // it only loaded at runtime, the needed initialization would have to be trivial and + // unobservable from Java, so we may as well treat it as initialized. + if (!klass->IsInitialized()) { + return false; + } + return CanAssumeClassIsLoaded(klass); +} + +inline bool CompilerDriver::CanReferrerAssumeClassIsInitialized(mirror::Class* referrer_class, + mirror::Class* klass) { + return (referrer_class != nullptr && referrer_class->IsSubClass(klass)) || + CanAssumeClassIsInitialized(klass); +} + inline bool CompilerDriver::IsStaticFieldsClassInitialized(mirror::Class* referrer_class, ArtField* resolved_field) { DCHECK(resolved_field->IsStatic()); mirror::Class* fields_class = resolved_field->GetDeclaringClass(); - return fields_class == referrer_class || fields_class->IsInitialized(); + return CanReferrerAssumeClassIsInitialized(referrer_class, fields_class); } inline ArtMethod* CompilerDriver::ResolveMethod( @@ -394,7 +415,7 @@ inline bool CompilerDriver::IsMethodsClassInitialized(mirror::Class* referrer_cl return true; } mirror::Class* methods_class = resolved_method->GetDeclaringClass(); - return methods_class == referrer_class || methods_class->IsInitialized(); + return CanReferrerAssumeClassIsInitialized(referrer_class, methods_class); } } // namespace art diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index 22fcf87524..84b6a52bda 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -659,7 +659,8 @@ void CompilerDriver::PreCompile(jobject class_loader, const std::vector<const De bool CompilerDriver::IsImageClass(const char* descriptor) const { if (!IsImage()) { - return true; + // NOTE: Currently unreachable, all callers check IsImage(). 
+ return false; } else { return image_classes_->find(descriptor) != image_classes_->end(); } @@ -992,6 +993,24 @@ void CompilerDriver::UpdateImageClasses(TimingLogger* timings) { } } +bool CompilerDriver::CanAssumeClassIsLoaded(mirror::Class* klass) { + Runtime* runtime = Runtime::Current(); + if (!runtime->IsAotCompiler()) { + DCHECK(runtime->UseJit()); + // Having the klass reference here implies that the klass is already loaded. + return true; + } + if (!IsImage()) { + // Assume loaded only if klass is in the boot image. App classes cannot be assumed + // loaded because we don't even know what class loader will be used to load them. + bool class_in_image = runtime->GetHeap()->FindSpaceFromObject(klass, false)->IsImageSpace(); + return class_in_image; + } + std::string temp; + const char* descriptor = klass->GetDescriptor(&temp); + return IsImageClass(descriptor); +} + bool CompilerDriver::CanAssumeTypeIsPresentInDexCache(const DexFile& dex_file, uint32_t type_idx) { if (IsImage() && IsImageClass(dex_file.StringDataByIdx(dex_file.GetTypeId(type_idx).descriptor_idx_))) { diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index 68c905eb22..f737007308 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -501,6 +501,16 @@ class CompilerDriver { uint32_t field_idx) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + // Can we assume that the klass is initialized? + bool CanAssumeClassIsInitialized(mirror::Class* klass) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + bool CanReferrerAssumeClassIsInitialized(mirror::Class* referrer_class, mirror::Class* klass) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + + // Can we assume that the klass is loaded? + bool CanAssumeClassIsLoaded(mirror::Class* klass) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics. // The only external contract is that unresolved method has flags 0 and resolved non-0. enum { diff --git a/compiler/dwarf/dwarf_test.cc b/compiler/dwarf/dwarf_test.cc index 4971f0ef10..4d423d007f 100644 --- a/compiler/dwarf/dwarf_test.cc +++ b/compiler/dwarf/dwarf_test.cc @@ -26,11 +26,11 @@ namespace art { namespace dwarf { -constexpr CFIFormat kCFIFormat = DW_DEBUG_FRAME_FORMAT; - // Run the tests only on host since we need objdump. #ifndef HAVE_ANDROID_OS +constexpr CFIFormat kCFIFormat = DW_DEBUG_FRAME_FORMAT; + TEST_F(DwarfTest, DebugFrame) { const bool is64bit = false; diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc index 32bde8e3b4..73e121f1cd 100644 --- a/compiler/image_writer.cc +++ b/compiler/image_writer.cc @@ -110,10 +110,6 @@ bool ImageWriter::PrepareImageAddressSpace() { CheckNoDexObjects(); } - if (!AllocMemory()) { - return false; - } - if (kIsDebugBuild) { ScopedObjectAccess soa(Thread::Current()); CheckNonImageClassesRemoved(); @@ -123,6 +119,12 @@ bool ImageWriter::PrepareImageAddressSpace() { CalculateNewObjectOffsets(); Thread::Current()->TransitionFromRunnableToSuspended(kNative); + // This needs to happen after CalculateNewObjectOffsets since it relies on intern_table_bytes_ and + // bin size sums being calculated. + if (!AllocMemory()) { + return false; + } + return true; } @@ -205,7 +207,7 @@ bool ImageWriter::Write(const std::string& image_filename, } // Write out the image bitmap at the page aligned start of the image end. 
- const auto& bitmap_section = image_header->GetImageSection(ImageHeader::kSectionImageBitmap); + const ImageSection& bitmap_section = image_header->GetImageSection(ImageHeader::kSectionImageBitmap); CHECK_ALIGNED(bitmap_section.Offset(), kPageSize); if (!image_file->Write(reinterpret_cast<char*>(image_bitmap_->Begin()), bitmap_section.Size(), bitmap_section.Offset())) { @@ -222,26 +224,10 @@ bool ImageWriter::Write(const std::string& image_filename, return true; } -void ImageWriter::SetImageOffset(mirror::Object* object, - ImageWriter::BinSlot bin_slot, - size_t offset) { +void ImageWriter::SetImageOffset(mirror::Object* object, size_t offset) { DCHECK(object != nullptr); DCHECK_NE(offset, 0U); - mirror::Object* obj = reinterpret_cast<mirror::Object*>(image_->Begin() + offset); - DCHECK_ALIGNED(obj, kObjectAlignment); - static size_t max_offset = 0; - max_offset = std::max(max_offset, offset); - image_bitmap_->Set(obj); // Mark the obj as mutated, since we will end up changing it. - { - // Remember the object-inside-of-the-image's hash code so we can restore it after the copy. - auto hash_it = saved_hashes_map_.find(bin_slot); - if (hash_it != saved_hashes_map_.end()) { - std::pair<BinSlot, uint32_t> slot_hash = *hash_it; - saved_hashes_.push_back(std::make_pair(obj, slot_hash.second)); - saved_hashes_map_.erase(hash_it); - } - } // The object is already deflated from when we set the bin slot. Just overwrite the lock word. object->SetLockWord(LockWord::FromForwardingAddress(offset), false); DCHECK_EQ(object->GetLockWord(false).ReadBarrierState(), 0u); @@ -262,7 +248,7 @@ void ImageWriter::AssignImageOffset(mirror::Object* object, ImageWriter::BinSlot size_t new_offset = image_objects_offset_begin_ + previous_bin_sizes + bin_slot.GetIndex(); DCHECK_ALIGNED(new_offset, kObjectAlignment); - SetImageOffset(object, bin_slot, new_offset); + SetImageOffset(object, new_offset); DCHECK_LT(new_offset, image_end_); } @@ -302,14 +288,14 @@ void ImageWriter::SetImageBinSlot(mirror::Object* object, BinSlot bin_slot) { // No hash, don't need to save it. break; case LockWord::kHashCode: - saved_hashes_map_[bin_slot] = lw.GetHashCode(); + DCHECK(saved_hashcode_map_.find(object) == saved_hashcode_map_.end()); + saved_hashcode_map_.emplace(object, lw.GetHashCode()); break; default: LOG(FATAL) << "Unreachable."; UNREACHABLE(); } - object->SetLockWord(LockWord::FromForwardingAddress(static_cast<uint32_t>(bin_slot)), - false); + object->SetLockWord(LockWord::FromForwardingAddress(bin_slot.Uint32Value()), false); DCHECK_EQ(object->GetLockWord(false).ReadBarrierState(), 0u); DCHECK(IsImageBinSlotAssigned(object)); } @@ -487,11 +473,8 @@ void ImageWriter::AssignImageBinSlot(mirror::Object* object) { ++bin_slot_count_[bin]; - DCHECK_LT(GetBinSizeSum(), image_->Size()); - // Grow the image closer to the end by the object we just assigned. image_end_ += offset_delta; - DCHECK_LT(image_end_, image_->Size()); } bool ImageWriter::WillMethodBeDirty(ArtMethod* m) const { @@ -535,10 +518,8 @@ ImageWriter::BinSlot ImageWriter::GetImageBinSlot(mirror::Object* object) const } bool ImageWriter::AllocMemory() { - auto* runtime = Runtime::Current(); - const size_t heap_size = runtime->GetHeap()->GetTotalMemory(); - // Add linear alloc usage since we need to have room for the ArtFields. 
- const size_t length = RoundUp(heap_size + runtime->GetLinearAlloc()->GetUsedMemory(), kPageSize); + const size_t length = RoundUp(image_objects_offset_begin_ + GetBinSizeSum() + intern_table_bytes_, + kPageSize); std::string error_msg; image_.reset(MemMap::MapAnonymous("image writer image", nullptr, length, PROT_READ | PROT_WRITE, false, false, &error_msg)); @@ -547,9 +528,10 @@ bool ImageWriter::AllocMemory() { return false; } - // Create the image bitmap. - image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create("image bitmap", image_->Begin(), - RoundUp(length, kPageSize))); + // Create the image bitmap, only needs to cover mirror object section which is up to image_end_. + CHECK_LE(image_end_, length); + image_bitmap_.reset(gc::accounting::ContinuousSpaceBitmap::Create( + "image bitmap", image_->Begin(), RoundUp(image_end_, kPageSize))); if (image_bitmap_.get() == nullptr) { LOG(ERROR) << "Failed to allocate memory for image bitmap"; return false; @@ -569,42 +551,6 @@ bool ImageWriter::ComputeLazyFieldsForClassesVisitor(Class* c, void* /*arg*/) { return true; } -// Collect all the java.lang.String in the heap and put them in the output strings_ array. -class StringCollector { - public: - StringCollector(Handle<mirror::ObjectArray<mirror::String>> strings, size_t index) - : strings_(strings), index_(index) { - } - static void Callback(Object* obj, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - auto* collector = reinterpret_cast<StringCollector*>(arg); - if (obj->GetClass()->IsStringClass()) { - collector->strings_->SetWithoutChecks<false>(collector->index_++, obj->AsString()); - } - } - size_t GetIndex() const { - return index_; - } - - private: - Handle<mirror::ObjectArray<mirror::String>> strings_; - size_t index_; -}; - -// Compare strings based on length, used for sorting strings by length / reverse length. -class LexicographicalStringComparator { - public: - bool operator()(const mirror::HeapReference<mirror::String>& lhs, - const mirror::HeapReference<mirror::String>& rhs) const - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - mirror::String* lhs_s = lhs.AsMirrorPtr(); - mirror::String* rhs_s = rhs.AsMirrorPtr(); - uint16_t* lhs_begin = lhs_s->GetValue(); - uint16_t* rhs_begin = rhs_s->GetValue(); - return std::lexicographical_compare(lhs_begin, lhs_begin + lhs_s->GetLength(), - rhs_begin, rhs_begin + rhs_s->GetLength()); - } -}; - void ImageWriter::ComputeEagerResolvedStringsCallback(Object* obj, void* arg ATTRIBUTE_UNUSED) { if (!obj->GetClass()->IsStringClass()) { return; @@ -769,7 +715,8 @@ void ImageWriter::CalculateObjectBinSlots(Object* obj) { DCHECK_EQ(obj, obj->AsString()->Intern()); return; } - mirror::String* const interned = obj->AsString()->Intern(); + mirror::String* const interned = Runtime::Current()->GetInternTable()->InternStrong( + obj->AsString()->Intern()); if (obj != interned) { if (!IsImageBinSlotAssigned(interned)) { // interned obj is after us, allocate its location early @@ -965,7 +912,6 @@ void ImageWriter::CalculateNewObjectOffsets() { // know where image_roots is going to end up image_end_ += RoundUp(sizeof(ImageHeader), kObjectAlignment); // 64-bit-alignment - DCHECK_LT(image_end_, image_->Size()); image_objects_offset_begin_ = image_end_; // Prepare bin slots for dex cache arrays. PrepareDexCacheArraySlots(); @@ -997,7 +943,6 @@ void ImageWriter::CalculateNewObjectOffsets() { // Transform each object's bin slot into an offset which will be used to do the final copy. 
heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this); - DCHECK(saved_hashes_map_.empty()); // All binslot hashes should've been put into vector by now. DCHECK_EQ(image_end_, GetBinSizeSum(kBinMirrorCount) + image_objects_offset_begin_); @@ -1010,6 +955,11 @@ void ImageWriter::CalculateNewObjectOffsets() { bin_slot_previous_sizes_[native_reloc.bin_type]; } + // Calculate how big the intern table will be after being serialized. + auto* const intern_table = Runtime::Current()->GetInternTable(); + CHECK_EQ(intern_table->WeakSize(), 0u) << " should have strong interned all the strings"; + intern_table_bytes_ = intern_table->WriteToMemory(nullptr); + // Note that image_end_ is left at end of used mirror object section. } @@ -1039,6 +989,10 @@ void ImageWriter::CreateHeader(size_t oat_loaded_size, size_t oat_data_offset) { CHECK_EQ(image_objects_offset_begin_ + bin_slot_previous_sizes_[kBinArtMethodClean], methods_section->Offset()); cur_pos = methods_section->End(); + // Calculate the size of the interned strings. + auto* interned_strings_section = §ions[ImageHeader::kSectionInternedStrings]; + *interned_strings_section = ImageSection(cur_pos, intern_table_bytes_); + cur_pos = interned_strings_section->End(); // Finally bitmap section. const size_t bitmap_bytes = image_bitmap_->Size(); auto* bitmap_section = §ions[ImageHeader::kSectionImageBitmap]; @@ -1046,16 +1000,19 @@ void ImageWriter::CreateHeader(size_t oat_loaded_size, size_t oat_data_offset) { cur_pos = bitmap_section->End(); if (kIsDebugBuild) { size_t idx = 0; - for (auto& section : sections) { + for (const ImageSection& section : sections) { LOG(INFO) << static_cast<ImageHeader::ImageSections>(idx) << " " << section; ++idx; } LOG(INFO) << "Methods: clean=" << clean_methods_ << " dirty=" << dirty_methods_; } + const size_t image_end = static_cast<uint32_t>(interned_strings_section->End()); + CHECK_EQ(AlignUp(image_begin_ + image_end, kPageSize), oat_file_begin) << + "Oat file should be right after the image."; // Create the header. 
new (image_->Begin()) ImageHeader( - PointerToLowMemUInt32(image_begin_), static_cast<uint32_t>(methods_section->End()), sections, - image_roots_address_, oat_file_->GetOatHeader().GetChecksum(), + PointerToLowMemUInt32(image_begin_), image_end, + sections, image_roots_address_, oat_file_->GetOatHeader().GetChecksum(), PointerToLowMemUInt32(oat_file_begin), PointerToLowMemUInt32(oat_data_begin_), PointerToLowMemUInt32(oat_data_end), PointerToLowMemUInt32(oat_file_end), target_ptr_size_, compile_pic_); @@ -1068,6 +1025,37 @@ ArtMethod* ImageWriter::GetImageMethodAddress(ArtMethod* method) { return reinterpret_cast<ArtMethod*>(image_begin_ + it->second.offset); } +class FixupRootVisitor : public RootVisitor { + public: + explicit FixupRootVisitor(ImageWriter* image_writer) : image_writer_(image_writer) { + } + + void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) + OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + for (size_t i = 0; i < count; ++i) { + *roots[i] = ImageAddress(*roots[i]); + } + } + + void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count, + const RootInfo& info ATTRIBUTE_UNUSED) + OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + for (size_t i = 0; i < count; ++i) { + roots[i]->Assign(ImageAddress(roots[i]->AsMirrorPtr())); + } + } + + private: + ImageWriter* const image_writer_; + + mirror::Object* ImageAddress(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + const size_t offset = image_writer_->GetImageOffset(obj); + auto* const dest = reinterpret_cast<Object*>(image_writer_->image_begin_ + offset); + VLOG(compiler) << "Update root from " << obj << " to " << dest; + return dest; + } +}; + void ImageWriter::CopyAndFixupNativeData() { // Copy ArtFields and methods to their locations and update the array for convenience. for (auto& pair : native_object_reloc_) { @@ -1088,7 +1076,7 @@ void ImageWriter::CopyAndFixupNativeData() { } // Fixup the image method roots. auto* image_header = reinterpret_cast<ImageHeader*>(image_->Begin()); - const auto& methods_section = image_header->GetMethodsSection(); + const ImageSection& methods_section = image_header->GetMethodsSection(); for (size_t i = 0; i < ImageHeader::kImageMethodsCount; ++i) { auto* m = image_methods_[i]; CHECK(m != nullptr); @@ -1101,18 +1089,35 @@ void ImageWriter::CopyAndFixupNativeData() { auto* dest = reinterpret_cast<ArtMethod*>(image_begin_ + it->second.offset); image_header->SetImageMethod(static_cast<ImageHeader::ImageMethod>(i), dest); } + // Write the intern table into the image. + const ImageSection& intern_table_section = image_header->GetImageSection( + ImageHeader::kSectionInternedStrings); + InternTable* const intern_table = Runtime::Current()->GetInternTable(); + uint8_t* const memory_ptr = image_->Begin() + intern_table_section.Offset(); + const size_t intern_table_bytes = intern_table->WriteToMemory(memory_ptr); + // Fixup the pointers in the newly written intern table to contain image addresses. + InternTable temp_table; + // Note that we require that ReadFromMemory does not make an internal copy of the elements so that + // the VisitRoots() will update the memory directly rather than the copies. + // This also relies on visit roots not doing any verification which could fail after we update + // the roots to be the image addresses. 
+ temp_table.ReadFromMemory(memory_ptr); + CHECK_EQ(temp_table.Size(), intern_table->Size()); + FixupRootVisitor visitor(this); + temp_table.VisitRoots(&visitor, kVisitRootFlagAllRoots); + CHECK_EQ(intern_table_bytes, intern_table_bytes_); } void ImageWriter::CopyAndFixupObjects() { gc::Heap* heap = Runtime::Current()->GetHeap(); heap->VisitObjects(CopyAndFixupObjectsCallback, this); // Fix up the object previously had hash codes. - for (const std::pair<mirror::Object*, uint32_t>& hash_pair : saved_hashes_) { + for (const auto& hash_pair : saved_hashcode_map_) { Object* obj = hash_pair.first; DCHECK_EQ(obj->GetLockWord<kVerifyNone>(false).ReadBarrierState(), 0U); obj->SetLockWord<kVerifyNone>(LockWord::FromHashCode(hash_pair.second, 0U), false); } - saved_hashes_.clear(); + saved_hashcode_map_.clear(); } void ImageWriter::CopyAndFixupObjectsCallback(Object* obj, void* arg) { @@ -1155,18 +1160,22 @@ void ImageWriter::FixupPointerArray(mirror::Object* dst, mirror::PointerArray* a } void ImageWriter::CopyAndFixupObject(Object* obj) { - // see GetLocalAddress for similar computation size_t offset = GetImageOffset(obj); auto* dst = reinterpret_cast<Object*>(image_->Begin() + offset); - const uint8_t* src = reinterpret_cast<const uint8_t*>(obj); + DCHECK_LT(offset, image_end_); + const auto* src = reinterpret_cast<const uint8_t*>(obj); + + image_bitmap_->Set(dst); // Mark the obj as live. - size_t n = obj->SizeOf(); + const size_t n = obj->SizeOf(); DCHECK_LE(offset + n, image_->Size()); memcpy(dst, src, n); // Write in a hash code of objects which have inflated monitors or a hash code in their monitor // word. - dst->SetLockWord(LockWord::Default(), false); + const auto it = saved_hashcode_map_.find(obj); + dst->SetLockWord(it != saved_hashcode_map_.end() ? + LockWord::FromHashCode(it->second, 0u) : LockWord::Default(), false); FixupObject(obj, dst); } @@ -1176,7 +1185,7 @@ class FixupVisitor { FixupVisitor(ImageWriter* image_writer, Object* copy) : image_writer_(image_writer), copy_(copy) { } - void operator()(Object* obj, MemberOffset offset, bool /*is_static*/) const + void operator()(Object* obj, MemberOffset offset, bool is_static ATTRIBUTE_UNUSED) const EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) { Object* ref = obj->GetFieldObject<Object, kVerifyNone>(offset); // Use SetFieldObjectWithoutWriteBarrier to avoid card marking since we are writing to the @@ -1186,7 +1195,7 @@ class FixupVisitor { } // java.lang.ref.Reference visitor. 
- void operator()(mirror::Class* /*klass*/, mirror::Reference* ref) const + void operator()(mirror::Class* klass ATTRIBUTE_UNUSED, mirror::Reference* ref) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) { copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>( @@ -1490,4 +1499,11 @@ uint32_t ImageWriter::BinSlot::GetIndex() const { return lockword_ & ~kBinMask; } +uint8_t* ImageWriter::GetOatFileBegin() const { + DCHECK_GT(intern_table_bytes_, 0u); + return image_begin_ + RoundUp( + image_end_ + bin_slot_sizes_[kBinArtField] + bin_slot_sizes_[kBinArtMethodDirty] + + bin_slot_sizes_[kBinArtMethodClean] + intern_table_bytes_, kPageSize); +} + } // namespace art diff --git a/compiler/image_writer.h b/compiler/image_writer.h index a35d6ad9c9..9d45ce2bd4 100644 --- a/compiler/image_writer.h +++ b/compiler/image_writer.h @@ -54,7 +54,7 @@ class ImageWriter FINAL { quick_to_interpreter_bridge_offset_(0), compile_pic_(compile_pic), target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())), bin_slot_sizes_(), bin_slot_previous_sizes_(), bin_slot_count_(), - dirty_methods_(0u), clean_methods_(0u) { + intern_table_bytes_(0u), dirty_methods_(0u), clean_methods_(0u) { CHECK_NE(image_begin, 0U); std::fill(image_methods_, image_methods_ + arraysize(image_methods_), nullptr); } @@ -84,11 +84,7 @@ class ImageWriter FINAL { image_begin_ + RoundUp(sizeof(ImageHeader), kObjectAlignment) + it->second + offset); } - uint8_t* GetOatFileBegin() const { - return image_begin_ + RoundUp( - image_end_ + bin_slot_sizes_[kBinArtField] + bin_slot_sizes_[kBinArtMethodDirty] + - bin_slot_sizes_[kBinArtMethodClean], kPageSize); - } + uint8_t* GetOatFileBegin() const; bool Write(const std::string& image_filename, const std::string& oat_filename, const std::string& oat_location) @@ -158,7 +154,7 @@ class ImageWriter FINAL { // The offset in bytes from the beginning of the bin. Aligned to object size. uint32_t GetIndex() const; // Pack into a single uint32_t, for storing into a lock word. - explicit operator uint32_t() const { return lockword_; } + uint32_t Uint32Value() const { return lockword_; } // Comparison operator for map support bool operator<(const BinSlot& other) const { return lockword_ < other.lockword_; } @@ -170,7 +166,7 @@ class ImageWriter FINAL { // We use the lock word to store the offset of the object in the image. void AssignImageOffset(mirror::Object* object, BinSlot bin_slot) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void SetImageOffset(mirror::Object* object, BinSlot bin_slot, size_t offset) + void SetImageOffset(mirror::Object* object, size_t offset) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); bool IsImageOffsetAssigned(mirror::Object* object) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -330,11 +326,9 @@ class ImageWriter FINAL { // The start offsets of the dex cache arrays. SafeMap<const DexFile*, size_t> dex_cache_array_starts_; - // Saved hashes (objects are inside of the image so that they don't move). - std::vector<std::pair<mirror::Object*, uint32_t>> saved_hashes_; - - // Saved hashes (objects are bin slots to inside of the image, not yet allocated an address). - std::map<BinSlot, uint32_t> saved_hashes_map_; + // Saved hash codes. We use these to restore lockwords which were temporarily used to have + // forwarding addresses as well as copying over hash codes. 
+ std::unordered_map<mirror::Object*, uint32_t> saved_hashcode_map_; // Beginning target oat address for the pointers from the output image to its oat file. const uint8_t* oat_data_begin_; @@ -360,6 +354,9 @@ class ImageWriter FINAL { size_t bin_slot_previous_sizes_[kBinSize]; // Number of bytes in previous bins. size_t bin_slot_count_[kBinSize]; // Number of objects in a bin + // Cached size of the intern table for when we allocate memory. + size_t intern_table_bytes_; + // ArtField, ArtMethod relocating map. These are allocated as array of structs but we want to // have one entry per art field for convenience. ArtFields are placed right after the end of the // image objects (aka sum of bin_slot_sizes_). ArtMethods are placed right after the ArtFields. @@ -376,8 +373,9 @@ class ImageWriter FINAL { uint64_t dirty_methods_; uint64_t clean_methods_; - friend class FixupVisitor; friend class FixupClassVisitor; + friend class FixupRootVisitor; + friend class FixupVisitor; DISALLOW_COPY_AND_ASSIGN(ImageWriter); }; diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc index 3a0d520e47..016f28ef1e 100644 --- a/compiler/jni/jni_cfi_test.cc +++ b/compiler/jni/jni_cfi_test.cc @@ -56,7 +56,7 @@ class JNICFITest : public CFITest { jni_asm->IncreaseFrameSize(32); jni_asm->DecreaseFrameSize(32); jni_asm->RemoveFrame(frame_size, callee_save_regs); - jni_asm->EmitSlowPaths(); + jni_asm->FinalizeCode(); std::vector<uint8_t> actual_asm(jni_asm->CodeSize()); MemoryRegion code(&actual_asm[0], actual_asm.size()); jni_asm->FinalizeInstructions(code); diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc index 4d7d86cce6..85fd6962fa 100644 --- a/compiler/jni/quick/jni_compiler.cc +++ b/compiler/jni/quick/jni_compiler.cc @@ -474,7 +474,7 @@ CompiledMethod* ArtJniCompileMethodInternal(CompilerDriver* driver, DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size)); // 17. Finalize code generation - __ EmitSlowPaths(); + __ FinalizeCode(); size_t cs = __ CodeSize(); std::vector<uint8_t> managed_code(cs); MemoryRegion code(&managed_code[0], managed_code.size()); diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc index d0104300d3..a3e889f0f6 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.cc +++ b/compiler/linker/arm/relative_patcher_thumb2.cc @@ -82,6 +82,7 @@ std::vector<uint8_t> Thumb2RelativePatcher::CompileThunkCode() { arm::kLoadWord, arm::PC, arm::R0, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value()); assembler.bkpt(0); + assembler.FinalizeCode(); std::vector<uint8_t> thunk_code(assembler.CodeSize()); MemoryRegion code(thunk_code.data(), thunk_code.size()); assembler.FinalizeInstructions(code); diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc index ee48789ad2..29355d6968 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.cc +++ b/compiler/linker/arm64/relative_patcher_arm64.cc @@ -233,7 +233,7 @@ std::vector<uint8_t> Arm64RelativePatcher::CompileThunkCode() { kArm64PointerSize).Int32Value()); assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0)); // Ensure we emit the literal pool. 
- assembler.EmitSlowPaths(); + assembler.FinalizeCode(); std::vector<uint8_t> thunk_code(assembler.CodeSize()); MemoryRegion code(thunk_code.data(), thunk_code.size()); assembler.FinalizeInstructions(code); diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index b2b54965b5..97b3725da1 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -126,11 +126,14 @@ class ValueBound : public ValueObject { return instruction_ == bound.instruction_ && constant_ == bound.constant_; } - static HInstruction* FromArrayLengthToNewArrayIfPossible(HInstruction* instruction) { - // Null check on the NewArray should have been eliminated by instruction - // simplifier already. - if (instruction->IsArrayLength() && instruction->InputAt(0)->IsNewArray()) { - return instruction->InputAt(0)->AsNewArray(); + static HInstruction* FromArrayLengthToArray(HInstruction* instruction) { + DCHECK(instruction->IsArrayLength() || instruction->IsNewArray()); + if (instruction->IsArrayLength()) { + HInstruction* input = instruction->InputAt(0); + if (input->IsNullCheck()) { + input = input->AsNullCheck()->InputAt(0); + } + return input; } return instruction; } @@ -146,8 +149,9 @@ class ValueBound : public ValueObject { // Some bounds are created with HNewArray* as the instruction instead // of HArrayLength*. They are treated the same. - instruction1 = FromArrayLengthToNewArrayIfPossible(instruction1); - instruction2 = FromArrayLengthToNewArrayIfPossible(instruction2); + // HArrayLength with the same array input are considered equal also. + instruction1 = FromArrayLengthToArray(instruction1); + instruction2 = FromArrayLengthToArray(instruction2); return instruction1 == instruction2; } @@ -271,7 +275,7 @@ class ArrayAccessInsideLoopFinder : public ValueObject { // Loop header of loop_info. Exiting loop is normal. return false; } - const GrowableArray<HBasicBlock*> successors = block->GetSuccessors(); + const GrowableArray<HBasicBlock*>& successors = block->GetSuccessors(); for (size_t i = 0; i < successors.Size(); i++) { if (!loop_info->Contains(*successors.Get(i))) { // One of the successors exits the loop. @@ -293,8 +297,14 @@ class ArrayAccessInsideLoopFinder : public ValueObject { void Run() { HLoopInformation* loop_info = induction_variable_->GetBlock()->GetLoopInformation(); - for (HBlocksInLoopIterator it_loop(*loop_info); !it_loop.Done(); it_loop.Advance()) { - HBasicBlock* block = it_loop.Current(); + HBlocksInLoopReversePostOrderIterator it_loop(*loop_info); + HBasicBlock* block = it_loop.Current(); + DCHECK(block == induction_variable_->GetBlock()); + // Skip loop header. Since narrowed value range of a MonotonicValueRange only + // applies to the loop body (after the test at the end of the loop header). + it_loop.Advance(); + for (; !it_loop.Done(); it_loop.Advance()) { + block = it_loop.Current(); DCHECK(block->IsInLoop()); if (!DominatesAllBackEdges(block, loop_info)) { // In order not to trigger deoptimization unnecessarily, make sure @@ -308,30 +318,35 @@ class ArrayAccessInsideLoopFinder : public ValueObject { // that the loop will loop through the full monotonic value range from // initial_ to end_. So adding deoptimization might be too aggressive and can // trigger deoptimization unnecessarily even if the loop won't actually throw - // AIOOBE. 
Otherwise, the loop induction variable is going to cover the full - // monotonic value range from initial_ to end_, and deoptimizations are added - // iff the loop will throw AIOOBE. + // AIOOBE. found_array_length_ = nullptr; return; } for (HInstruction* instruction = block->GetFirstInstruction(); instruction != nullptr; instruction = instruction->GetNext()) { - if (!instruction->IsArrayGet() && !instruction->IsArraySet()) { + if (!instruction->IsBoundsCheck()) { continue; } - HInstruction* index = instruction->InputAt(1); - if (!index->IsBoundsCheck()) { + + HInstruction* length_value = instruction->InputAt(1); + if (length_value->IsIntConstant()) { + // TODO: may optimize for constant case. continue; } - HArrayLength* array_length = index->InputAt(1)->AsArrayLength(); - if (array_length == nullptr) { - DCHECK(index->InputAt(1)->IsIntConstant()); - // TODO: may optimize for constant case. + if (length_value->IsPhi()) { + // When adding deoptimizations in outer loops, we might create + // a phi for the array length, and update all uses of the + // length in the loop to that phi. Therefore, inner loops having + // bounds checks on the same array will use that phi. + // TODO: handle these cases. continue; } + DCHECK(length_value->IsArrayLength()); + HArrayLength* array_length = length_value->AsArrayLength(); + HInstruction* array = array_length->InputAt(0); if (array->IsNullCheck()) { array = array->AsNullCheck()->InputAt(0); @@ -347,7 +362,7 @@ class ArrayAccessInsideLoopFinder : public ValueObject { continue; } - index = index->AsBoundsCheck()->InputAt(0); + HInstruction* index = instruction->AsBoundsCheck()->InputAt(0); HInstruction* left = index; int32_t right = 0; if (left == induction_variable_ || @@ -375,7 +390,7 @@ class ArrayAccessInsideLoopFinder : public ValueObject { // The instruction that corresponds to a MonotonicValueRange. HInstruction* induction_variable_; - // The array length of the array that's accessed inside the loop. + // The array length of the array that's accessed inside the loop body. HArrayLength* found_array_length_; // The lowest and highest constant offsets relative to induction variable @@ -411,6 +426,8 @@ class ValueRange : public ArenaObject<kArenaAllocMisc> { ValueBound GetLower() const { return lower_; } ValueBound GetUpper() const { return upper_; } + bool IsConstantValueRange() { return lower_.IsConstant() && upper_.IsConstant(); } + // If it's certain that this value range fits in other_range. virtual bool FitsIn(ValueRange* other_range) const { if (other_range == nullptr) { @@ -495,13 +512,30 @@ class MonotonicValueRange : public ValueRange { ValueBound GetBound() const { return bound_; } void SetEnd(HInstruction* end) { end_ = end; } void SetInclusive(bool inclusive) { inclusive_ = inclusive; } - HBasicBlock* GetLoopHead() const { + HBasicBlock* GetLoopHeader() const { DCHECK(induction_variable_->GetBlock()->IsLoopHeader()); return induction_variable_->GetBlock(); } MonotonicValueRange* AsMonotonicValueRange() OVERRIDE { return this; } + HBasicBlock* GetLoopHeaderSuccesorInLoop() { + HBasicBlock* header = GetLoopHeader(); + HInstruction* instruction = header->GetLastInstruction(); + DCHECK(instruction->IsIf()); + HIf* h_if = instruction->AsIf(); + HLoopInformation* loop_info = header->GetLoopInformation(); + bool true_successor_in_loop = loop_info->Contains(*h_if->IfTrueSuccessor()); + bool false_successor_in_loop = loop_info->Contains(*h_if->IfFalseSuccessor()); + + // Just in case it's some strange loop structure. 
+ if (true_successor_in_loop && false_successor_in_loop) { + return nullptr; + } + DCHECK(true_successor_in_loop || false_successor_in_loop); + return false_successor_in_loop ? h_if->IfFalseSuccessor() : h_if->IfTrueSuccessor(); + } + // If it's certain that this value range fits in other_range. bool FitsIn(ValueRange* other_range) const OVERRIDE { if (other_range == nullptr) { @@ -593,12 +627,114 @@ class MonotonicValueRange : public ValueRange { } } + // Try to add HDeoptimize's in the loop pre-header first to narrow this range. + // For example, this loop: + // + // for (int i = start; i < end; i++) { + // array[i - 1] = array[i] + array[i + 1]; + // } + // + // will be transformed to: + // + // int array_length_in_loop_body_if_needed; + // if (start >= end) { + // array_length_in_loop_body_if_needed = 0; + // } else { + // if (start < 1) deoptimize(); + // if (array == null) deoptimize(); + // array_length = array.length; + // if (end > array_length - 1) deoptimize; + // array_length_in_loop_body_if_needed = array_length; + // } + // for (int i = start; i < end; i++) { + // // No more null check and bounds check. + // // array.length value is replaced with array_length_in_loop_body_if_needed + // // in the loop body. + // array[i - 1] = array[i] + array[i + 1]; + // } + // + // We basically first go through the loop body and find those array accesses whose + // index is at a constant offset from the induction variable ('i' in the above example), + // and update offset_low and offset_high along the way. We then add the following + // deoptimizations in the loop pre-header (suppose end is not inclusive). + // if (start < -offset_low) deoptimize(); + // if (end >= array.length - offset_high) deoptimize(); + // It might be necessary to first hoist array.length (and the null check on it) out of + // the loop with another deoptimization. + // + // In order not to trigger deoptimization unnecessarily, we want to make a strong + // guarantee that no deoptimization is triggered if the loop body itself doesn't + // throw AIOOBE. (It's the same as saying if deoptimization is triggered, the loop + // body must throw AIOOBE). + // This is achieved by the following: + // 1) We only process loops that iterate through the full monotonic range from + // initial_ to end_. We do the following checks to make sure that's the case: + // a) The loop doesn't have early exit (via break, return, etc.) + // b) The increment_ is 1/-1. An increment of 2, for example, may skip end_. + // 2) We only collect array accesses of blocks in the loop body that dominate + // all loop back edges, these array accesses are guaranteed to happen + // at each loop iteration. + // With 1) and 2), if the loop body doesn't throw AIOOBE, collected array accesses + // when the induction variable is at initial_ and end_ must be in a legal range. + // Since the added deoptimizations are basically checking the induction variable + // at initial_ and end_ values, no deoptimization will be triggered either. + // + // A special case is the loop body isn't entered at all. In that case, we may still + // add deoptimization due to the analysis described above. In order not to trigger + // deoptimization, we do a test between initial_ and end_ first and skip over + // the added deoptimization. 
+ ValueRange* NarrowWithDeoptimization() { + if (increment_ != 1 && increment_ != -1) { + // In order not to trigger deoptimization unnecessarily, we want to + // make sure the loop iterates through the full range from initial_ to + // end_ so that boundaries are covered by the loop. An increment of 2, + // for example, may skip end_. + return this; + } + + if (end_ == nullptr) { + // No full info to add deoptimization. + return this; + } + + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader(); + if (!initial_->GetBlock()->Dominates(pre_header) || + !end_->GetBlock()->Dominates(pre_header)) { + // Can't add a check in loop pre-header if the value isn't available there. + return this; + } + + ArrayAccessInsideLoopFinder finder(induction_variable_); + + if (!finder.HasFoundArrayLength()) { + // No array access was found inside the loop that can benefit + // from deoptimization. + return this; + } + + if (!AddDeoptimization(finder)) { + return this; + } + + // After added deoptimizations, induction variable fits in + // [-offset_low, array.length-1-offset_high], adjusted with collected offsets. + ValueBound lower = ValueBound(0, -finder.GetOffsetLow()); + ValueBound upper = ValueBound(finder.GetFoundArrayLength(), -1 - finder.GetOffsetHigh()); + // We've narrowed the range after added deoptimizations. + return new (GetAllocator()) ValueRange(GetAllocator(), lower, upper); + } + // Returns true if adding a (constant >= value) check for deoptimization // is allowed and will benefit compiled code. - bool CanAddDeoptimizationConstant(HInstruction* value, - int32_t constant, - bool* is_proven) { + bool CanAddDeoptimizationConstant(HInstruction* value, int32_t constant, bool* is_proven) { *is_proven = false; + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader(); + DCHECK(value->GetBlock()->Dominates(pre_header)); + // See if we can prove the relationship first. if (value->IsIntConstant()) { if (value->AsIntConstant()->GetValue() >= constant) { @@ -615,22 +751,118 @@ class MonotonicValueRange : public ValueRange { return true; } + // Try to filter out cases that the loop entry test will never be true. + bool LoopEntryTestUseful() { + if (initial_->IsIntConstant() && end_->IsIntConstant()) { + int32_t initial_val = initial_->AsIntConstant()->GetValue(); + int32_t end_val = end_->AsIntConstant()->GetValue(); + if (increment_ == 1) { + if (inclusive_) { + return initial_val > end_val; + } else { + return initial_val >= end_val; + } + } else { + DCHECK_EQ(increment_, -1); + if (inclusive_) { + return initial_val < end_val; + } else { + return initial_val <= end_val; + } + } + } + return true; + } + + // Returns the block for adding deoptimization. + HBasicBlock* TransformLoopForDeoptimizationIfNeeded() { + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader(); + // Deoptimization is only added when both initial_ and end_ are defined + // before the loop. + DCHECK(initial_->GetBlock()->Dominates(pre_header)); + DCHECK(end_->GetBlock()->Dominates(pre_header)); + + // If it can be proven the loop body is definitely entered (unless exception + // is thrown in the loop header for which triggering deoptimization is fine), + // there is no need for tranforming the loop. 
In that case, deoptimization + // will just be added in the loop pre-header. + if (!LoopEntryTestUseful()) { + return pre_header; + } + + HGraph* graph = header->GetGraph(); + graph->TransformLoopHeaderForBCE(header); + HBasicBlock* new_pre_header = header->GetDominator(); + DCHECK(new_pre_header == header->GetLoopInformation()->GetPreHeader()); + HBasicBlock* if_block = new_pre_header->GetDominator(); + HBasicBlock* dummy_block = if_block->GetSuccessors().Get(0); // True successor. + HBasicBlock* deopt_block = if_block->GetSuccessors().Get(1); // False successor. + + dummy_block->AddInstruction(new (graph->GetArena()) HGoto()); + deopt_block->AddInstruction(new (graph->GetArena()) HGoto()); + new_pre_header->AddInstruction(new (graph->GetArena()) HGoto()); + return deopt_block; + } + + // Adds a test between initial_ and end_ to see if the loop body is entered. + // If the loop body isn't entered at all, it jumps to the loop pre-header (after + // transformation) to avoid any deoptimization. + void AddLoopBodyEntryTest() { + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader(); + HBasicBlock* if_block = pre_header->GetDominator(); + HGraph* graph = header->GetGraph(); + + HCondition* cond; + if (increment_ == 1) { + if (inclusive_) { + cond = new (graph->GetArena()) HGreaterThan(initial_, end_); + } else { + cond = new (graph->GetArena()) HGreaterThanOrEqual(initial_, end_); + } + } else { + DCHECK_EQ(increment_, -1); + if (inclusive_) { + cond = new (graph->GetArena()) HLessThan(initial_, end_); + } else { + cond = new (graph->GetArena()) HLessThanOrEqual(initial_, end_); + } + } + HIf* h_if = new (graph->GetArena()) HIf(cond); + if_block->AddInstruction(cond); + if_block->AddInstruction(h_if); + } + // Adds a check that (value >= constant), and HDeoptimize otherwise. 
void AddDeoptimizationConstant(HInstruction* value, - int32_t constant) { - HBasicBlock* block = induction_variable_->GetBlock(); - DCHECK(block->IsLoopHeader()); - HGraph* graph = block->GetGraph(); - HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader(); - HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck(); + int32_t constant, + HBasicBlock* deopt_block, + bool loop_entry_test_block_added) { + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetDominator(); + if (loop_entry_test_block_added) { + DCHECK(deopt_block->GetSuccessors().Get(0) == pre_header); + } else { + DCHECK(deopt_block == pre_header); + } + HGraph* graph = header->GetGraph(); + HSuspendCheck* suspend_check = header->GetLoopInformation()->GetSuspendCheck(); + if (loop_entry_test_block_added) { + DCHECK_EQ(deopt_block, header->GetDominator()->GetDominator()->GetSuccessors().Get(1)); + } + HIntConstant* const_instr = graph->GetIntConstant(constant); HCondition* cond = new (graph->GetArena()) HLessThan(value, const_instr); HDeoptimize* deoptimize = new (graph->GetArena()) HDeoptimize(cond, suspend_check->GetDexPc()); - pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction()); - pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction()); + deopt_block->InsertInstructionBefore(cond, deopt_block->GetLastInstruction()); + deopt_block->InsertInstructionBefore(deoptimize, deopt_block->GetLastInstruction()); deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( - suspend_check->GetEnvironment(), block); + suspend_check->GetEnvironment(), header); } // Returns true if adding a (value <= array_length + offset) check for deoptimization @@ -640,6 +872,26 @@ class MonotonicValueRange : public ValueRange { int32_t offset, bool* is_proven) { *is_proven = false; + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetLoopInformation()->GetPreHeader(); + DCHECK(value->GetBlock()->Dominates(pre_header)); + + if (array_length->GetBlock() == header) { + // array_length_in_loop_body_if_needed only has correct value when the loop + // body is entered. We bail out in this case. Usually array_length defined + // in the loop header is already hoisted by licm. + return false; + } else { + // array_length is defined either before the loop header already, or in + // the loop body since it's used in the loop body. If it's defined in the loop body, + // a phi array_length_in_loop_body_if_needed is used to replace it. In that case, + // all the uses of array_length must be dominated by its definition in the loop + // body. array_length_in_loop_body_if_needed is guaranteed to be the same as + // array_length once the loop body is entered so all the uses of the phi will + // use the correct value. + } + if (offset > 0) { // There might be overflow issue. // TODO: handle this, possibly with some distance relationship between @@ -667,56 +919,99 @@ class MonotonicValueRange : public ValueRange { // Adds a check that (value <= array_length + offset), and HDeoptimize otherwise. 
void AddDeoptimizationArrayLength(HInstruction* value, HArrayLength* array_length, - int32_t offset) { - HBasicBlock* block = induction_variable_->GetBlock(); - DCHECK(block->IsLoopHeader()); - HGraph* graph = block->GetGraph(); - HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader(); - HSuspendCheck* suspend_check = block->GetLoopInformation()->GetSuspendCheck(); + int32_t offset, + HBasicBlock* deopt_block, + bool loop_entry_test_block_added) { + HBasicBlock* header = induction_variable_->GetBlock(); + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetDominator(); + if (loop_entry_test_block_added) { + DCHECK(deopt_block->GetSuccessors().Get(0) == pre_header); + } else { + DCHECK(deopt_block == pre_header); + } + HGraph* graph = header->GetGraph(); + HSuspendCheck* suspend_check = header->GetLoopInformation()->GetSuspendCheck(); // We may need to hoist null-check and array_length out of loop first. - if (!array_length->GetBlock()->Dominates(pre_header)) { + if (!array_length->GetBlock()->Dominates(deopt_block)) { + // array_length must be defined in the loop body. + DCHECK(header->GetLoopInformation()->Contains(*array_length->GetBlock())); + DCHECK(array_length->GetBlock() != header); + HInstruction* array = array_length->InputAt(0); HNullCheck* null_check = array->AsNullCheck(); if (null_check != nullptr) { array = null_check->InputAt(0); } - // We've already made sure array is defined before the loop when collecting + // We've already made sure the array is defined before the loop when collecting // array accesses for the loop. - DCHECK(array->GetBlock()->Dominates(pre_header)); - if (null_check != nullptr && !null_check->GetBlock()->Dominates(pre_header)) { + DCHECK(array->GetBlock()->Dominates(deopt_block)); + if (null_check != nullptr && !null_check->GetBlock()->Dominates(deopt_block)) { // Hoist null check out of loop with a deoptimization. HNullConstant* null_constant = graph->GetNullConstant(); HCondition* null_check_cond = new (graph->GetArena()) HEqual(array, null_constant); // TODO: for one dex_pc, share the same deoptimization slow path. HDeoptimize* null_check_deoptimize = new (graph->GetArena()) HDeoptimize(null_check_cond, suspend_check->GetDexPc()); - pre_header->InsertInstructionBefore(null_check_cond, pre_header->GetLastInstruction()); - pre_header->InsertInstructionBefore( - null_check_deoptimize, pre_header->GetLastInstruction()); + deopt_block->InsertInstructionBefore( + null_check_cond, deopt_block->GetLastInstruction()); + deopt_block->InsertInstructionBefore( + null_check_deoptimize, deopt_block->GetLastInstruction()); // Eliminate null check in the loop. null_check->ReplaceWith(array); null_check->GetBlock()->RemoveInstruction(null_check); null_check_deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( - suspend_check->GetEnvironment(), block); + suspend_check->GetEnvironment(), header); } - // Hoist array_length out of loop. - array_length->MoveBefore(pre_header->GetLastInstruction()); + + HArrayLength* new_array_length = new (graph->GetArena()) HArrayLength(array); + deopt_block->InsertInstructionBefore(new_array_length, deopt_block->GetLastInstruction()); + + if (loop_entry_test_block_added) { + // Replace array_length defined inside the loop body with a phi + // array_length_in_loop_body_if_needed. This is a synthetic phi so there is + // no vreg number for it. + HPhi* phi = new (graph->GetArena()) HPhi( + graph->GetArena(), kNoRegNumber, 2, Primitive::kPrimInt); + // Set to 0 if the loop body isn't entered. 
+ phi->SetRawInputAt(0, graph->GetIntConstant(0)); + // Set to array.length if the loop body is entered. + phi->SetRawInputAt(1, new_array_length); + pre_header->AddPhi(phi); + array_length->ReplaceWith(phi); + // Make sure phi is only used after the loop body is entered. + if (kIsDebugBuild) { + for (HUseIterator<HInstruction*> it(phi->GetUses()); + !it.Done(); + it.Advance()) { + HInstruction* user = it.Current()->GetUser(); + DCHECK(GetLoopHeaderSuccesorInLoop()->Dominates(user->GetBlock())); + } + } + } else { + array_length->ReplaceWith(new_array_length); + } + + array_length->GetBlock()->RemoveInstruction(array_length); + // Use new_array_length for deopt. + array_length = new_array_length; } - HIntConstant* offset_instr = graph->GetIntConstant(offset); - HAdd* add = new (graph->GetArena()) HAdd(Primitive::kPrimInt, array_length, offset_instr); - HCondition* cond = new (graph->GetArena()) HGreaterThan(value, add); - HDeoptimize* deoptimize = new (graph->GetArena()) - HDeoptimize(cond, suspend_check->GetDexPc()); - pre_header->InsertInstructionBefore(add, pre_header->GetLastInstruction()); - pre_header->InsertInstructionBefore(cond, pre_header->GetLastInstruction()); - pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction()); - deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( - suspend_check->GetEnvironment(), block); + HInstruction* added = array_length; + if (offset != 0) { + HIntConstant* offset_instr = graph->GetIntConstant(offset); + added = new (graph->GetArena()) HAdd(Primitive::kPrimInt, array_length, offset_instr); + deopt_block->InsertInstructionBefore(added, deopt_block->GetLastInstruction()); + } + HCondition* cond = new (graph->GetArena()) HGreaterThan(value, added); + HDeoptimize* deopt = new (graph->GetArena()) HDeoptimize(cond, suspend_check->GetDexPc()); + deopt_block->InsertInstructionBefore(cond, deopt_block->GetLastInstruction()); + deopt_block->InsertInstructionBefore(deopt, deopt_block->GetLastInstruction()); + deopt->CopyEnvironmentFromWithLoopPhiAdjustment(suspend_check->GetEnvironment(), header); } - // Add deoptimizations in loop pre-header with the collected array access + // Adds deoptimizations in loop pre-header with the collected array access // data so that value ranges can be established in loop body. // Returns true if deoptimizations are successfully added, or if it's proven // it's not necessary. @@ -733,70 +1028,60 @@ class MonotonicValueRange : public ValueRange { return false; } + HBasicBlock* deopt_block; + bool loop_entry_test_block_added = false; bool is_constant_proven, is_length_proven; + + HInstruction* const_comparing_instruction; + int32_t const_compared_to; + HInstruction* array_length_comparing_instruction; + int32_t array_length_offset; if (increment_ == 1) { // Increasing from initial_ to end_. - int32_t offset = inclusive_ ? -offset_high - 1 : -offset_high; - if (CanAddDeoptimizationConstant(initial_, -offset_low, &is_constant_proven) && - CanAddDeoptimizationArrayLength(end_, array_length, offset, &is_length_proven)) { - if (!is_constant_proven) { - AddDeoptimizationConstant(initial_, -offset_low); - } - if (!is_length_proven) { - AddDeoptimizationArrayLength(end_, array_length, offset); + const_comparing_instruction = initial_; + const_compared_to = -offset_low; + array_length_comparing_instruction = end_; + array_length_offset = inclusive_ ? -offset_high - 1 : -offset_high; + } else { + const_comparing_instruction = end_; + const_compared_to = inclusive_ ? 
-offset_low : -offset_low - 1; + array_length_comparing_instruction = initial_; + array_length_offset = -offset_high - 1; + } + + if (CanAddDeoptimizationConstant(const_comparing_instruction, + const_compared_to, + &is_constant_proven) && + CanAddDeoptimizationArrayLength(array_length_comparing_instruction, + array_length, + array_length_offset, + &is_length_proven)) { + if (!is_constant_proven || !is_length_proven) { + deopt_block = TransformLoopForDeoptimizationIfNeeded(); + loop_entry_test_block_added = (deopt_block != pre_header); + if (loop_entry_test_block_added) { + // Loop body may be entered. + AddLoopBodyEntryTest(); } - return true; } - } else if (increment_ == -1) { - // Decreasing from initial_ to end_. - int32_t constant = inclusive_ ? -offset_low : -offset_low - 1; - if (CanAddDeoptimizationConstant(end_, constant, &is_constant_proven) && - CanAddDeoptimizationArrayLength( - initial_, array_length, -offset_high - 1, &is_length_proven)) { - if (!is_constant_proven) { - AddDeoptimizationConstant(end_, constant); - } - if (!is_length_proven) { - AddDeoptimizationArrayLength(initial_, array_length, -offset_high - 1); - } - return true; + if (!is_constant_proven) { + AddDeoptimizationConstant(const_comparing_instruction, + const_compared_to, + deopt_block, + loop_entry_test_block_added); + } + if (!is_length_proven) { + AddDeoptimizationArrayLength(array_length_comparing_instruction, + array_length, + array_length_offset, + deopt_block, + loop_entry_test_block_added); } + return true; } return false; } - // Try to add HDeoptimize's in the loop pre-header first to narrow this range. - ValueRange* NarrowWithDeoptimization() { - if (increment_ != 1 && increment_ != -1) { - // TODO: possibly handle overflow/underflow issues with deoptimization. - return this; - } - - if (end_ == nullptr) { - // No full info to add deoptimization. - return this; - } - - ArrayAccessInsideLoopFinder finder(induction_variable_); - - if (!finder.HasFoundArrayLength()) { - // No array access was found inside the loop that can benefit - // from deoptimization. - return this; - } - - if (!AddDeoptimization(finder)) { - return this; - } - - // After added deoptimizations, induction variable fits in - // [-offset_low, array.length-1-offset_high], adjusted with collected offsets. - ValueBound lower = ValueBound(0, -finder.GetOffsetLow()); - ValueBound upper = ValueBound(finder.GetFoundArrayLength(), -1 - finder.GetOffsetHigh()); - // We've narrowed the range after added deoptimizations. - return new (GetAllocator()) ValueRange(GetAllocator(), lower, upper); - } - private: HPhi* const induction_variable_; // Induction variable for this monotonic value range. HInstruction* const initial_; // Initial value. @@ -819,12 +1104,17 @@ class BCEVisitor : public HGraphVisitor { // it's likely some AIOOBE will be thrown. static constexpr int32_t kMaxConstantForAddingDeoptimize = INT_MAX - 1024 * 1024; + // Added blocks for loop body entry test. 
+ bool IsAddedBlock(HBasicBlock* block) const { + return block->GetBlockId() >= initial_block_size_; + } + explicit BCEVisitor(HGraph* graph) - : HGraphVisitor(graph), - maps_(graph->GetBlocks().Size()), - need_to_revisit_block_(false) {} + : HGraphVisitor(graph), maps_(graph->GetBlocks().Size()), + need_to_revisit_block_(false), initial_block_size_(graph->GetBlocks().Size()) {} void VisitBasicBlock(HBasicBlock* block) OVERRIDE { + DCHECK(!IsAddedBlock(block)); first_constant_index_bounds_check_map_.clear(); HGraphVisitor::VisitBasicBlock(block); if (need_to_revisit_block_) { @@ -839,6 +1129,10 @@ class BCEVisitor : public HGraphVisitor { private: // Return the map of proven value ranges at the beginning of a basic block. ArenaSafeMap<int, ValueRange*>* GetValueRangeMap(HBasicBlock* basic_block) { + if (IsAddedBlock(basic_block)) { + // Added blocks don't keep value ranges. + return nullptr; + } int block_id = basic_block->GetBlockId(); if (maps_.at(block_id) == nullptr) { std::unique_ptr<ArenaSafeMap<int, ValueRange*>> map( @@ -853,8 +1147,12 @@ class BCEVisitor : public HGraphVisitor { ValueRange* LookupValueRange(HInstruction* instruction, HBasicBlock* basic_block) { while (basic_block != nullptr) { ArenaSafeMap<int, ValueRange*>* map = GetValueRangeMap(basic_block); - if (map->find(instruction->GetId()) != map->end()) { - return map->Get(instruction->GetId()); + if (map != nullptr) { + if (map->find(instruction->GetId()) != map->end()) { + return map->Get(instruction->GetId()); + } + } else { + DCHECK(IsAddedBlock(basic_block)); } basic_block = basic_block->GetDominator(); } @@ -971,7 +1269,7 @@ class BCEVisitor : public HGraphVisitor { if (left_range != nullptr) { left_monotonic_range = left_range->AsMonotonicValueRange(); if (left_monotonic_range != nullptr) { - HBasicBlock* loop_head = left_monotonic_range->GetLoopHead(); + HBasicBlock* loop_head = left_monotonic_range->GetLoopHeader(); if (instruction->GetBlock() != loop_head) { // For monotonic value range, don't handle `instruction` // if it's not defined in the loop header. @@ -1013,7 +1311,7 @@ class BCEVisitor : public HGraphVisitor { // Update the info for monotonic value range. if (left_monotonic_range->GetInductionVariable() == left && left_monotonic_range->GetIncrement() < 0 && - block == left_monotonic_range->GetLoopHead() && + block == left_monotonic_range->GetLoopHeader() && instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) { left_monotonic_range->SetEnd(right); left_monotonic_range->SetInclusive(cond == kCondLT); @@ -1047,7 +1345,7 @@ class BCEVisitor : public HGraphVisitor { // Update the info for monotonic value range. 
if (left_monotonic_range->GetInductionVariable() == left && left_monotonic_range->GetIncrement() > 0 && - block == left_monotonic_range->GetLoopHead() && + block == left_monotonic_range->GetLoopHeader() && instruction->IfFalseSuccessor()->GetLoopInformation() == block->GetLoopInformation()) { left_monotonic_range->SetEnd(right); left_monotonic_range->SetInclusive(cond == kCondGT); @@ -1083,7 +1381,16 @@ class BCEVisitor : public HGraphVisitor { HBasicBlock* block = bounds_check->GetBlock(); HInstruction* index = bounds_check->InputAt(0); HInstruction* array_length = bounds_check->InputAt(1); - DCHECK(array_length->IsIntConstant() || array_length->IsArrayLength()); + DCHECK(array_length->IsIntConstant() || + array_length->IsArrayLength() || + array_length->IsPhi()); + + if (array_length->IsPhi()) { + // Input 1 of the phi contains the real array.length once the loop body is + // entered. That value will be used for bound analysis. The graph is still + // strictly in SSA form. + array_length = array_length->AsPhi()->InputAt(1)->AsArrayLength(); + } if (!index->IsIntConstant()) { ValueRange* index_range = LookupValueRange(index, block); @@ -1238,25 +1545,26 @@ class BCEVisitor : public HGraphVisitor { } if (left_range->IsMonotonicValueRange() && - block == left_range->AsMonotonicValueRange()->GetLoopHead()) { + block == left_range->AsMonotonicValueRange()->GetLoopHeader()) { // The comparison is for an induction variable in the loop header. DCHECK(left == left_range->AsMonotonicValueRange()->GetInductionVariable()); - HBasicBlock* loop_body_successor; - if (LIKELY(block->GetLoopInformation()-> - Contains(*instruction->IfFalseSuccessor()))) { - loop_body_successor = instruction->IfFalseSuccessor(); - } else { - loop_body_successor = instruction->IfTrueSuccessor(); + HBasicBlock* loop_body_successor = + left_range->AsMonotonicValueRange()->GetLoopHeaderSuccesorInLoop(); + if (loop_body_successor == nullptr) { + // In case it's some strange loop structure. + return; + } ValueRange* new_left_range = LookupValueRange(left, loop_body_successor); - if (new_left_range == left_range) { + if ((new_left_range == left_range) || + // Range narrowed with deoptimization is usually more useful than + // a constant range. + new_left_range->IsConstantValueRange()) { // We are not successful in narrowing the monotonic value range to // a regular value range. Try using deoptimization. new_left_range = left_range->AsMonotonicValueRange()-> NarrowWithDeoptimization(); if (new_left_range != left_range) { - GetValueRangeMap(instruction->IfFalseSuccessor())-> - Overwrite(left->GetId(), new_left_range); + GetValueRangeMap(loop_body_successor)->Overwrite(left->GetId(), new_left_range); } } } @@ -1511,6 +1819,9 @@ class BCEVisitor : public HGraphVisitor { // eliminate those bounds checks. bool need_to_revisit_block_; + // Initial number of blocks. + int32_t initial_block_size_; + DISALLOW_COPY_AND_ASSIGN(BCEVisitor); }; @@ -1527,7 +1838,22 @@ void BoundsCheckElimination::Run() { // value can be narrowed further down in the dominator tree. // // TODO: only visit blocks that dominate some array accesses. - visitor.VisitReversePostOrder(); + HBasicBlock* last_visited_block = nullptr; + for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) { + HBasicBlock* current = it.Current(); + if (current == last_visited_block) { + // We may insert blocks into the reverse post order list when processing + // a loop header. Don't process it again.
+ DCHECK(current->IsLoopHeader()); + continue; + } + if (visitor.IsAddedBlock(current)) { + // Skip added blocks. Their effects are already taken care of. + continue; + } + visitor.VisitBasicBlock(current); + last_visited_block = current; + } } } // namespace art diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc index e383ec664b..4701bddd48 100644 --- a/compiler/optimizing/bounds_check_elimination_test.cc +++ b/compiler/optimizing/bounds_check_elimination_test.cc @@ -440,22 +440,16 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination1) { HInstruction* bounds_check = nullptr; HGraph* graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); + RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination(graph); bounds_check_elimination.Run(); - ASSERT_FALSE(IsRemoved(bounds_check)); - - // This time add gvn. Need gvn to eliminate the second - // HArrayLength which uses the null check as its input. - graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1); - graph->BuildDominatorTree(); - RunSimplifierAndGvn(graph); - BoundsCheckElimination bounds_check_elimination_after_gvn(graph); - bounds_check_elimination_after_gvn.Run(); ASSERT_TRUE(IsRemoved(bounds_check)); // for (int i=1; i<array.length; i++) { array[i] = 10; // Can eliminate. } graph = BuildSSAGraph1(&allocator, &bounds_check, 1, 1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_1(graph); bounds_check_elimination_with_initial_1.Run(); @@ -464,6 +458,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination1) { // for (int i=-1; i<array.length; i++) { array[i] = 10; // Can't eliminate. } graph = BuildSSAGraph1(&allocator, &bounds_check, -1, 1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_minus_1(graph); bounds_check_elimination_with_initial_minus_1.Run(); @@ -472,6 +467,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination1) { // for (int i=0; i<=array.length; i++) { array[i] = 10; // Can't eliminate. } graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 1, kCondGT); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_greater_than(graph); bounds_check_elimination_with_greater_than.Run(); @@ -481,6 +477,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination1) { // array[i] = 10; // Can't eliminate due to overflow concern. } graph = BuildSSAGraph1(&allocator, &bounds_check, 0, 2); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_increment_2(graph); bounds_check_elimination_with_increment_2.Run(); @@ -489,6 +486,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination1) { // for (int i=1; i<array.length; i += 2) { array[i] = 10; // Can eliminate. 
} graph = BuildSSAGraph1(&allocator, &bounds_check, 1, 2); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_increment_2_from_1(graph); bounds_check_elimination_with_increment_2_from_1.Run(); @@ -579,22 +577,16 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination2) { HInstruction* bounds_check = nullptr; HGraph* graph = BuildSSAGraph2(&allocator, &bounds_check, 0); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); + RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination(graph); bounds_check_elimination.Run(); - ASSERT_FALSE(IsRemoved(bounds_check)); - - // This time add gvn. Need gvn to eliminate the second - // HArrayLength which uses the null check as its input. - graph = BuildSSAGraph2(&allocator, &bounds_check, 0); - graph->BuildDominatorTree(); - RunSimplifierAndGvn(graph); - BoundsCheckElimination bounds_check_elimination_after_gvn(graph); - bounds_check_elimination_after_gvn.Run(); ASSERT_TRUE(IsRemoved(bounds_check)); // for (int i=array.length; i>1; i--) { array[i-1] = 10; // Can eliminate. } graph = BuildSSAGraph2(&allocator, &bounds_check, 1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_1(graph); bounds_check_elimination_with_initial_1.Run(); @@ -603,6 +595,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination2) { // for (int i=array.length; i>-1; i--) { array[i-1] = 10; // Can't eliminate. } graph = BuildSSAGraph2(&allocator, &bounds_check, -1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_minus_1(graph); bounds_check_elimination_with_initial_minus_1.Run(); @@ -611,6 +604,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination2) { // for (int i=array.length; i>=0; i--) { array[i-1] = 10; // Can't eliminate. } graph = BuildSSAGraph2(&allocator, &bounds_check, 0, -1, kCondLT); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_less_than(graph); bounds_check_elimination_with_less_than.Run(); @@ -619,6 +613,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination2) { // for (int i=array.length; i>0; i-=2) { array[i-1] = 10; // Can eliminate. } graph = BuildSSAGraph2(&allocator, &bounds_check, 0, -2); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_increment_minus_2(graph); bounds_check_elimination_increment_minus_2.Run(); @@ -710,15 +705,17 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination3) { HInstruction* bounds_check = nullptr; HGraph* graph = BuildSSAGraph3(&allocator, &bounds_check, 0, 1, kCondGE); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); - BoundsCheckElimination bounds_check_elimination_after_gvn(graph); - bounds_check_elimination_after_gvn.Run(); + BoundsCheckElimination bounds_check_elimination(graph); + bounds_check_elimination.Run(); ASSERT_TRUE(IsRemoved(bounds_check)); // int[] array = new int[10]; // for (int i=1; i<10; i++) { array[i] = 10; // Can eliminate. 
} graph = BuildSSAGraph3(&allocator, &bounds_check, 1, 1, kCondGE); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_1(graph); bounds_check_elimination_with_initial_1.Run(); @@ -728,6 +725,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination3) { // for (int i=0; i<=10; i++) { array[i] = 10; // Can't eliminate. } graph = BuildSSAGraph3(&allocator, &bounds_check, 0, 1, kCondGT); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_greater_than(graph); bounds_check_elimination_with_greater_than.Run(); @@ -737,6 +735,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination3) { // for (int i=1; i<10; i+=8) { array[i] = 10; // Can eliminate. } graph = BuildSSAGraph3(&allocator, &bounds_check, 1, 8, kCondGE); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_increment_8(graph); bounds_check_elimination_increment_8.Run(); @@ -828,22 +827,16 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination4) { HInstruction* bounds_check = nullptr; HGraph* graph = BuildSSAGraph4(&allocator, &bounds_check, 0); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); + RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination(graph); bounds_check_elimination.Run(); - ASSERT_FALSE(IsRemoved(bounds_check)); - - // This time add gvn. Need gvn to eliminate the second - // HArrayLength which uses the null check as its input. - graph = BuildSSAGraph4(&allocator, &bounds_check, 0); - graph->BuildDominatorTree(); - RunSimplifierAndGvn(graph); - BoundsCheckElimination bounds_check_elimination_after_gvn(graph); - bounds_check_elimination_after_gvn.Run(); ASSERT_TRUE(IsRemoved(bounds_check)); // for (int i=1; i<array.length; i++) { array[array.length-i-1] = 10; // Can eliminate. } graph = BuildSSAGraph4(&allocator, &bounds_check, 1); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_initial_1(graph); bounds_check_elimination_with_initial_1.Run(); @@ -852,6 +845,7 @@ TEST(BoundsCheckEliminationTest, LoopArrayBoundsElimination4) { // for (int i=0; i<=array.length; i++) { array[array.length-i] = 10; // Can't eliminate. } graph = BuildSSAGraph4(&allocator, &bounds_check, 0, kCondGT); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); BoundsCheckElimination bounds_check_elimination_with_greater_than(graph); bounds_check_elimination_with_greater_than.Run(); @@ -1027,6 +1021,7 @@ TEST(BoundsCheckEliminationTest, BubbleSortArrayBoundsElimination) { outer_body_add->AddSuccessor(outer_header); graph->BuildDominatorTree(); + graph->AnalyzeNaturalLoops(); RunSimplifierAndGvn(graph); // gvn should remove the same bounds check. ASSERT_FALSE(IsRemoved(bounds_check1)); diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc index cbd042901d..946c0602cf 100644 --- a/compiler/optimizing/builder.cc +++ b/compiler/optimizing/builder.cc @@ -603,7 +603,12 @@ bool HGraphBuilder::BuildInvoke(const Instruction& instruction, const char* descriptor = dex_file_->StringDataByIdx(proto_id.shorty_idx_); Primitive::Type return_type = Primitive::GetType(descriptor[0]); bool is_instance_call = invoke_type != kStatic; - size_t number_of_arguments = strlen(descriptor) - (is_instance_call ? 
0 : 1); + // Remove the return type from the 'proto'. + size_t number_of_arguments = strlen(descriptor) - 1; + if (is_instance_call) { + // One extra argument for 'this'. + ++number_of_arguments; + } MethodReference target_method(dex_file_, method_idx); uintptr_t direct_code; @@ -614,7 +619,8 @@ bool HGraphBuilder::BuildInvoke(const Instruction& instruction, if (!compiler_driver_->ComputeInvokeInfo(dex_compilation_unit_, dex_pc, true, true, &optimized_invoke_type, &target_method, &table_index, &direct_code, &direct_method)) { - VLOG(compiler) << "Did not compile " << PrettyMethod(method_idx, *dex_file_) + VLOG(compiler) << "Did not compile " + << PrettyMethod(dex_compilation_unit_->GetDexMethodIndex(), *dex_file_) << " because a method call could not be resolved"; MaybeRecordStat(MethodCompilationStat::kNotCompiledUnresolvedMethod); return false; @@ -723,10 +729,16 @@ bool HGraphBuilder::BuildInvoke(const Instruction& instruction, } } - invoke = new (arena_) HInvokeStaticOrDirect( - arena_, number_of_arguments, return_type, dex_pc, target_method.dex_method_index, - is_recursive, string_init_offset, invoke_type, optimized_invoke_type, - clinit_check_requirement); + invoke = new (arena_) HInvokeStaticOrDirect(arena_, + number_of_arguments, + return_type, + dex_pc, + target_method.dex_method_index, + is_recursive, + string_init_offset, + invoke_type, + optimized_invoke_type, + clinit_check_requirement); } size_t start_index = 0; @@ -740,19 +752,29 @@ bool HGraphBuilder::BuildInvoke(const Instruction& instruction, start_index = 1; } - uint32_t descriptor_index = 1; + uint32_t descriptor_index = 1; // Skip the return type. uint32_t argument_index = start_index; if (is_string_init) { start_index = 1; } - for (size_t i = start_index; i < number_of_vreg_arguments; i++, argument_index++) { + for (size_t i = start_index; + // Make sure we don't go over the expected arguments or over the number of + // dex registers given. If the instruction was seen as dead by the verifier, + // it hasn't been properly checked. + (i < number_of_vreg_arguments) && (argument_index < number_of_arguments); + i++, argument_index++) { Primitive::Type type = Primitive::GetType(descriptor[descriptor_index++]); bool is_wide = (type == Primitive::kPrimLong) || (type == Primitive::kPrimDouble); - if (!is_range && is_wide && args[i] + 1 != args[i + 1]) { - LOG(WARNING) << "Non sequential register pair in " << dex_compilation_unit_->GetSymbol() - << " at " << dex_pc; - // We do not implement non sequential register pair. - MaybeRecordStat(MethodCompilationStat::kNotCompiledNonSequentialRegPair); + if (!is_range + && is_wide + && ((i + 1 == number_of_vreg_arguments) || (args[i] + 1 != args[i + 1]))) { + // Longs and doubles should be in pairs, that is, sequential registers. The verifier should + // reject any class where this is violated. However, the verifier only does these checks + // on non trivially dead instructions, so we just bailout the compilation. + VLOG(compiler) << "Did not compile " + << PrettyMethod(dex_compilation_unit_->GetDexMethodIndex(), *dex_file_) + << " because of non-sequential dex register pair in wide argument"; + MaybeRecordStat(MethodCompilationStat::kNotCompiledMalformedOpcode); return false; } HInstruction* arg = LoadLocal(is_range ? 
register_index + i : args[i], type); @@ -761,7 +783,14 @@ bool HGraphBuilder::BuildInvoke(const Instruction& instruction, i++; } } - DCHECK_EQ(argument_index, number_of_arguments); + + if (argument_index != number_of_arguments) { + VLOG(compiler) << "Did not compile " + << PrettyMethod(dex_compilation_unit_->GetDexMethodIndex(), *dex_file_) + << " because of wrong number of arguments in invoke instruction"; + MaybeRecordStat(MethodCompilationStat::kNotCompiledMalformedOpcode); + return false; + } if (invoke->IsInvokeStaticOrDirect()) { invoke->SetArgumentAt(argument_index, graph_->GetCurrentMethod()); @@ -1206,14 +1235,20 @@ bool HGraphBuilder::NeedsAccessCheck(uint32_t type_index) const { } void HGraphBuilder::BuildPackedSwitch(const Instruction& instruction, uint32_t dex_pc) { + // Verifier guarantees that the payload for PackedSwitch contains: + // (a) number of entries (may be zero) + // (b) first and lowest switch case value (entry 0, always present) + // (c) list of target pcs (entries 1 <= i <= N) SwitchTable table(instruction, dex_pc, false); // Value to test against. HInstruction* value = LoadLocal(instruction.VRegA(), Primitive::kPrimInt); + // Retrieve number of entries. uint16_t num_entries = table.GetNumEntries(); - // There should be at least one entry here. - DCHECK_GT(num_entries, 0U); + if (num_entries == 0) { + return; + } // Chained cmp-and-branch, starting from starting_key. int32_t starting_key = table.GetEntryAt(0); @@ -1225,6 +1260,10 @@ void HGraphBuilder::BuildPackedSwitch(const Instruction& instruction, uint32_t d } void HGraphBuilder::BuildSparseSwitch(const Instruction& instruction, uint32_t dex_pc) { + // Verifier guarantees that the payload for SparseSwitch contains: + // (a) number of entries (may be zero) + // (b) sorted key values (entries 0 <= i < N) + // (c) target pcs corresponding to the switch values (entries N <= i < 2*N) SwitchTable table(instruction, dex_pc, true); // Value to test against. 
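For reference, a minimal standalone sketch of the chained cmp-and-branch lowering that the new BuildPackedSwitch/BuildSparseSwitch comments describe. PackedSwitchPayload and LowerPackedSwitch are hypothetical illustration names, not ART's SwitchTable/HEqual machinery; the empty-payload early return mirrors the num_entries == 0 check added above.

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical mirror of the packed-switch payload layout described above:
// an entry count (possibly zero), the lowest case key, and one target per entry.
struct PackedSwitchPayload {
  uint16_t num_entries;
  int32_t first_key;
  std::vector<int32_t> targets;  // targets[i] is the target pc for key first_key + i.
};

// Produces the (key, target) pairs that a builder would lower into chained
// compare-and-branch form: if (value == first_key + i) goto targets[i].
std::vector<std::pair<int32_t, int32_t>> LowerPackedSwitch(const PackedSwitchPayload& payload) {
  std::vector<std::pair<int32_t, int32_t>> cases;
  if (payload.num_entries == 0) {
    return cases;  // Empty switch: nothing to test against, control falls through.
  }
  int32_t key = payload.first_key;
  for (uint16_t i = 0; i < payload.num_entries; ++i, ++key) {
    cases.emplace_back(key, payload.targets[i]);
  }
  return cases;
}

A sparse switch differs only in that each case key is read from the sorted key array in the payload instead of being derived from first_key.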
@@ -1424,21 +1463,16 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, uint32 } case Instruction::RETURN: { - DCHECK_NE(return_type_, Primitive::kPrimNot); - DCHECK_NE(return_type_, Primitive::kPrimLong); - DCHECK_NE(return_type_, Primitive::kPrimDouble); BuildReturn(instruction, return_type_); break; } case Instruction::RETURN_OBJECT: { - DCHECK(return_type_ == Primitive::kPrimNot); BuildReturn(instruction, return_type_); break; } case Instruction::RETURN_WIDE: { - DCHECK(return_type_ == Primitive::kPrimDouble || return_type_ == Primitive::kPrimLong); BuildReturn(instruction, return_type_); break; } diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 049b3e3a40..09f7d86605 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -236,7 +236,6 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, const GrowableArray<HBasicBlock*>& block_order) { block_order_ = &block_order; DCHECK(block_order_->Get(0) == GetGraph()->GetEntryBlock()); - DCHECK(GoesToNextBlock(GetGraph()->GetEntryBlock(), block_order_->Get(1))); ComputeSpillMask(); first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize; @@ -508,19 +507,14 @@ void CodeGenerator::BuildNativeGCMap( dex_compilation_unit.GetVerifiedMethod()->GetDexGcMap(); verifier::DexPcToReferenceMap dex_gc_map(&(gc_map_raw)[0]); - uint32_t max_native_offset = 0; - for (size_t i = 0; i < pc_infos_.Size(); i++) { - uint32_t native_offset = pc_infos_.Get(i).native_pc; - if (native_offset > max_native_offset) { - max_native_offset = native_offset; - } - } + uint32_t max_native_offset = stack_map_stream_.ComputeMaxNativePcOffset(); - GcMapBuilder builder(data, pc_infos_.Size(), max_native_offset, dex_gc_map.RegWidth()); - for (size_t i = 0; i < pc_infos_.Size(); i++) { - struct PcInfo pc_info = pc_infos_.Get(i); - uint32_t native_offset = pc_info.native_pc; - uint32_t dex_pc = pc_info.dex_pc; + size_t num_stack_maps = stack_map_stream_.GetNumberOfStackMaps(); + GcMapBuilder builder(data, num_stack_maps, max_native_offset, dex_gc_map.RegWidth()); + for (size_t i = 0; i != num_stack_maps; ++i) { + const StackMapStream::StackMapEntry& stack_map_entry = stack_map_stream_.GetStackMap(i); + uint32_t native_offset = stack_map_entry.native_pc_offset; + uint32_t dex_pc = stack_map_entry.dex_pc; const uint8_t* references = dex_gc_map.FindBitMap(dex_pc, false); CHECK(references != nullptr) << "Missing ref for dex pc 0x" << std::hex << dex_pc; builder.AddEntry(native_offset, references); @@ -528,17 +522,17 @@ void CodeGenerator::BuildNativeGCMap( } void CodeGenerator::BuildSourceMap(DefaultSrcMap* src_map) const { - for (size_t i = 0; i < pc_infos_.Size(); i++) { - struct PcInfo pc_info = pc_infos_.Get(i); - uint32_t pc2dex_offset = pc_info.native_pc; - int32_t pc2dex_dalvik_offset = pc_info.dex_pc; + for (size_t i = 0, num = stack_map_stream_.GetNumberOfStackMaps(); i != num; ++i) { + const StackMapStream::StackMapEntry& stack_map_entry = stack_map_stream_.GetStackMap(i); + uint32_t pc2dex_offset = stack_map_entry.native_pc_offset; + int32_t pc2dex_dalvik_offset = stack_map_entry.dex_pc; src_map->push_back(SrcMapElem({pc2dex_offset, pc2dex_dalvik_offset})); } } void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { uint32_t pc2dex_data_size = 0u; - uint32_t pc2dex_entries = pc_infos_.Size(); + uint32_t pc2dex_entries = stack_map_stream_.GetNumberOfStackMaps(); uint32_t pc2dex_offset = 
0u; int32_t pc2dex_dalvik_offset = 0; uint32_t dex2pc_data_size = 0u; @@ -547,11 +541,11 @@ void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { int32_t dex2pc_dalvik_offset = 0; for (size_t i = 0; i < pc2dex_entries; i++) { - struct PcInfo pc_info = pc_infos_.Get(i); - pc2dex_data_size += UnsignedLeb128Size(pc_info.native_pc - pc2dex_offset); - pc2dex_data_size += SignedLeb128Size(pc_info.dex_pc - pc2dex_dalvik_offset); - pc2dex_offset = pc_info.native_pc; - pc2dex_dalvik_offset = pc_info.dex_pc; + const StackMapStream::StackMapEntry& stack_map_entry = stack_map_stream_.GetStackMap(i); + pc2dex_data_size += UnsignedLeb128Size(stack_map_entry.native_pc_offset - pc2dex_offset); + pc2dex_data_size += SignedLeb128Size(stack_map_entry.dex_pc - pc2dex_dalvik_offset); + pc2dex_offset = stack_map_entry.native_pc_offset; + pc2dex_dalvik_offset = stack_map_entry.dex_pc; } // Walk over the blocks and find which ones correspond to catch block entries. @@ -586,12 +580,12 @@ void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { dex2pc_dalvik_offset = 0u; for (size_t i = 0; i < pc2dex_entries; i++) { - struct PcInfo pc_info = pc_infos_.Get(i); - DCHECK(pc2dex_offset <= pc_info.native_pc); - write_pos = EncodeUnsignedLeb128(write_pos, pc_info.native_pc - pc2dex_offset); - write_pos = EncodeSignedLeb128(write_pos, pc_info.dex_pc - pc2dex_dalvik_offset); - pc2dex_offset = pc_info.native_pc; - pc2dex_dalvik_offset = pc_info.dex_pc; + const StackMapStream::StackMapEntry& stack_map_entry = stack_map_stream_.GetStackMap(i); + DCHECK(pc2dex_offset <= stack_map_entry.native_pc_offset); + write_pos = EncodeUnsignedLeb128(write_pos, stack_map_entry.native_pc_offset - pc2dex_offset); + write_pos = EncodeSignedLeb128(write_pos, stack_map_entry.dex_pc - pc2dex_dalvik_offset); + pc2dex_offset = stack_map_entry.native_pc_offset; + pc2dex_dalvik_offset = stack_map_entry.dex_pc; } for (size_t i = 0; i < graph_->GetBlocks().Size(); ++i) { @@ -617,9 +611,9 @@ void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { auto it = table.PcToDexBegin(); auto it2 = table.DexToPcBegin(); for (size_t i = 0; i < pc2dex_entries; i++) { - struct PcInfo pc_info = pc_infos_.Get(i); - CHECK_EQ(pc_info.native_pc, it.NativePcOffset()); - CHECK_EQ(pc_info.dex_pc, it.DexPc()); + const StackMapStream::StackMapEntry& stack_map_entry = stack_map_stream_.GetStackMap(i); + CHECK_EQ(stack_map_entry.native_pc_offset, it.NativePcOffset()); + CHECK_EQ(stack_map_entry.dex_pc, it.DexPc()); ++it; } for (size_t i = 0; i < graph_->GetBlocks().Size(); ++i) { @@ -695,14 +689,11 @@ void CodeGenerator::RecordPcInfo(HInstruction* instruction, } // Collect PC infos for the mapping table. - struct PcInfo pc_info; - pc_info.dex_pc = outer_dex_pc; - pc_info.native_pc = GetAssembler()->CodeSize(); - pc_infos_.Add(pc_info); + uint32_t native_pc = GetAssembler()->CodeSize(); if (instruction == nullptr) { // For stack overflow checks. - stack_map_stream_.BeginStackMapEntry(pc_info.dex_pc, pc_info.native_pc, 0, 0, 0, 0); + stack_map_stream_.BeginStackMapEntry(outer_dex_pc, native_pc, 0, 0, 0, 0); stack_map_stream_.EndStackMapEntry(); return; } @@ -719,8 +710,8 @@ void CodeGenerator::RecordPcInfo(HInstruction* instruction, } // The register mask must be a subset of callee-save registers. 
DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask); - stack_map_stream_.BeginStackMapEntry(pc_info.dex_pc, - pc_info.native_pc, + stack_map_stream_.BeginStackMapEntry(outer_dex_pc, + native_pc, register_mask, locations->GetStackMask(), outer_environment_size, diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index c6ebf6dbd8..5b0abd76b3 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -64,11 +64,6 @@ class CodeAllocator { DISALLOW_COPY_AND_ASSIGN(CodeAllocator); }; -struct PcInfo { - uint32_t dex_pc; - uintptr_t native_pc; -}; - class SlowPathCode : public ArenaObject<kArenaAllocSlowPaths> { public: SlowPathCode() { @@ -363,16 +358,15 @@ class CodeGenerator { number_of_register_pairs_(number_of_register_pairs), core_callee_save_mask_(core_callee_save_mask), fpu_callee_save_mask_(fpu_callee_save_mask), + stack_map_stream_(graph->GetArena()), + block_order_(nullptr), is_baseline_(false), graph_(graph), compiler_options_(compiler_options), - pc_infos_(graph->GetArena(), 32), slow_paths_(graph->GetArena(), 8), - block_order_(nullptr), current_block_index_(0), is_leaf_(true), - requires_current_method_(false), - stack_map_stream_(graph->GetArena()) {} + requires_current_method_(false) {} // Register allocation logic. void AllocateRegistersLocally(HInstruction* instruction) const; @@ -442,6 +436,11 @@ class CodeGenerator { const uint32_t core_callee_save_mask_; const uint32_t fpu_callee_save_mask_; + StackMapStream stack_map_stream_; + + // The order to use for code generation. + const GrowableArray<HBasicBlock*>* block_order_; + // Whether we are using baseline. bool is_baseline_; @@ -455,12 +454,8 @@ class CodeGenerator { HGraph* const graph_; const CompilerOptions& compiler_options_; - GrowableArray<PcInfo> pc_infos_; GrowableArray<SlowPathCode*> slow_paths_; - // The order to use for code generation. - const GrowableArray<HBasicBlock*>* block_order_; - // The current block index in `block_order_` of the block // we are generating code for. size_t current_block_index_; @@ -471,8 +466,6 @@ class CodeGenerator { // Whether an instruction in the graph accesses the current method. bool requires_current_method_; - StackMapStream stack_map_stream_; - friend class OptimizingCFITest; DISALLOW_COPY_AND_ASSIGN(CodeGenerator); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 3d3e35d0fc..f6ae45238c 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -392,12 +392,38 @@ CodeGeneratorARM::CodeGeneratorARM(HGraph* graph, location_builder_(graph, this), instruction_visitor_(graph, this), move_resolver_(graph->GetArena(), this), - assembler_(false /* can_relocate_branches */), + assembler_(), isa_features_(isa_features) { // Save the PC register to mimic Quick. AddAllocatedRegister(Location::RegisterLocation(PC)); } +void CodeGeneratorARM::Finalize(CodeAllocator* allocator) { + // Ensure that we fix up branches and literal loads and emit the literal pool. + __ FinalizeCode(); + + // Adjust native pc offsets in stack maps. + for (size_t i = 0, num = stack_map_stream_.GetNumberOfStackMaps(); i != num; ++i) { + uint32_t old_position = stack_map_stream_.GetStackMap(i).native_pc_offset; + uint32_t new_position = __ GetAdjustedPosition(old_position); + stack_map_stream_.SetStackMapNativePcOffset(i, new_position); + } + // Adjust native pc offsets of block labels. 
+ for (size_t block_idx = 0u, end = block_order_->Size(); block_idx != end; ++block_idx) { + HBasicBlock* block = block_order_->Get(block_idx); + // Get the label directly from block_labels_ rather than through GetLabelOf() to avoid + // FirstNonEmptyBlock() which could lead to adjusting a label more than once. + DCHECK_LT(static_cast<size_t>(block->GetBlockId()), block_labels_.Size()); + Label* block_label = &block_labels_.GetRawStorage()[block->GetBlockId()]; + DCHECK_EQ(block_label->IsBound(), !block->IsSingleGoto()); + if (block_label->IsBound()) { + __ AdjustLabelPosition(block_label); + } + } + + CodeGenerator::Finalize(allocator); +} + Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const { switch (type) { case Primitive::kPrimLong: { @@ -2831,7 +2857,7 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { Location left = locations->InAt(0); Location right = locations->InAt(1); - NearLabel less, greater, done; + Label less, greater, done; Primitive::Type type = compare->InputAt(0)->GetType(); switch (type) { case Primitive::kPrimLong: { @@ -2927,7 +2953,7 @@ void InstructionCodeGeneratorARM::GenerateWideAtomicStore(Register addr, Register temp1, Register temp2, HInstruction* instruction) { - NearLabel fail; + Label fail; if (offset != 0) { __ LoadImmediate(temp1, offset); __ add(IP, addr, ShifterOperand(temp1)); @@ -3607,7 +3633,7 @@ void CodeGeneratorARM::MarkGCCard(Register temp, Register object, Register value, bool can_be_null) { - NearLabel is_null; + Label is_null; if (can_be_null) { __ CompareAndBranchIfZero(value, &is_null); } @@ -4036,7 +4062,7 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { Register cls = locations->InAt(1).AsRegister<Register>(); Register out = locations->Out().AsRegister<Register>(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - NearLabel done, zero; + Label done, zero; SlowPathCodeARM* slow_path = nullptr; // Return 0 if `obj` is null. @@ -4093,19 +4119,15 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { instruction, locations->InAt(1), locations->GetTemp(0), instruction->GetDexPc()); codegen_->AddSlowPath(slow_path); - NearLabel done; // avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { - __ CompareAndBranchIfZero(obj, &done); + __ CompareAndBranchIfZero(obj, slow_path->GetExitLabel()); } // Compare the class of `obj` with `cls`. 
__ LoadFromOffset(kLoadWord, temp, obj, class_offset); __ cmp(temp, ShifterOperand(cls)); __ b(slow_path->GetEntryLabel(), NE); __ Bind(slow_path->GetExitLabel()); - if (instruction->MustDoNullCheck()) { - __ Bind(&done); - } } void LocationsBuilderARM::VisitMonitorOperation(HMonitorOperation* instruction) { diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index af2481661a..1599a23568 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -139,10 +139,16 @@ class LocationsBuilderARM : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr); - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + private: void HandleInvoke(HInvoke* invoke); void HandleBitwiseOperation(HBinaryOperation* operation); @@ -163,10 +169,16 @@ class InstructionCodeGeneratorARM : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr); - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + ArmAssembler* GetAssembler() const { return assembler_; } private: @@ -286,6 +298,8 @@ class CodeGeneratorARM : public CodeGenerator { block_labels_.SetSize(GetGraph()->GetBlocks().Size()); } + void Finalize(CodeAllocator* allocator) OVERRIDE; + const ArmInstructionSetFeatures& GetInstructionSetFeatures() const { return isa_features_; } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 2d2419a284..f96810ff80 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -147,9 +147,17 @@ class InstructionCodeGeneratorARM64 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) + #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + Arm64Assembler* GetAssembler() const { return assembler_; } vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; } @@ -188,9 +196,17 @@ class LocationsBuilderARM64 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) + #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable 
instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + private: void HandleBinaryOp(HBinaryOperation* instr); void HandleFieldSet(HInstruction* instruction); diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index faf3cf9ffa..696d8d549e 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -124,10 +124,16 @@ class LocationsBuilderX86 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_X86(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + private: void HandleBitwiseOperation(HBinaryOperation* instruction); void HandleInvoke(HInvoke* invoke); @@ -148,10 +154,16 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_X86(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + X86Assembler* GetAssembler() const { return assembler_; } private: diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index e46994c79e..215754cd46 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -134,10 +134,16 @@ class LocationsBuilderX86_64 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_X86_64(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + private: void HandleInvoke(HInvoke* invoke); void HandleBitwiseOperation(HBinaryOperation* operation); @@ -158,10 +164,16 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) OVERRIDE; - FOR_EACH_CONCRETE_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_X86_64(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION + void VisitInstruction(HInstruction* instruction) OVERRIDE { + LOG(FATAL) << "Unreachable instruction " << instruction->DebugName() + << " (id " << instruction->GetId() << ")"; + } + X86_64Assembler* GetAssembler() const { return assembler_; } private: diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index 17a006cc3a..fdfe518e95 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ 
b/compiler/optimizing/dead_code_elimination.cc @@ -122,10 +122,6 @@ void HDeadCodeElimination::RemoveDeadInstructions() { if (!inst->HasSideEffects() && !inst->CanThrow() && !inst->IsSuspendCheck() - // The current method needs to stay in the graph in case of inlining. - // It is always passed anyway, and keeping it in the graph does not - // affect the generated code. - && !inst->IsCurrentMethod() // If we added an explicit barrier then we should keep it. && !inst->IsMemoryBarrier() && !inst->HasUses()) { diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index fd2e4e81df..b64791788d 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -21,6 +21,7 @@ #include "licm.h" #include "nodes.h" #include "optimization.h" +#include "reference_type_propagation.h" #include "register_allocator.h" #include "ssa_liveness_analysis.h" @@ -354,6 +355,24 @@ class HGraphVisualizerPrinter : public HGraphVisitor { } else { StartAttributeStream("loop") << "B" << info->GetHeader()->GetBlockId(); } + } else if (IsPass(ReferenceTypePropagation::kReferenceTypePropagationPassName) + && is_after_pass_) { + if (instruction->GetType() == Primitive::kPrimNot) { + if (instruction->IsLoadClass()) { + ScopedObjectAccess soa(Thread::Current()); + StartAttributeStream("klass") + << PrettyClass(instruction->AsLoadClass()->GetLoadedClassRTI().GetTypeHandle().Get()); + } else { + ReferenceTypeInfo info = instruction->GetReferenceTypeInfo(); + if (info.IsTop()) { + StartAttributeStream("klass") << "java.lang.Object"; + } else { + ScopedObjectAccess soa(Thread::Current()); + StartAttributeStream("klass") << PrettyClass(info.GetTypeHandle().Get()); + } + StartAttributeStream("exact") << std::boolalpha << info.IsExact() << std::noboolalpha; + } + } } } diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index c3fc33735a..92ebf060eb 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -27,6 +27,7 @@ #include "mirror/class_loader.h" #include "mirror/dex_cache.h" #include "nodes.h" +#include "reference_type_propagation.h" #include "register_allocator.h" #include "ssa_phi_elimination.h" #include "scoped_thread_state_change.h" @@ -57,7 +58,7 @@ void HInliner::Run() { next_block = (i == blocks.Size() - 1) ? nullptr : blocks.Get(i + 1); for (HInstruction* instruction = block->GetFirstInstruction(); instruction != nullptr;) { HInstruction* next = instruction->GetNext(); - HInvokeStaticOrDirect* call = instruction->AsInvokeStaticOrDirect(); + HInvoke* call = instruction->AsInvoke(); // As long as the call is not intrinsified, it is worth trying to inline. if (call != nullptr && call->GetIntrinsic() == Intrinsics::kNone) { // We use the original invoke type to ensure the resolution of the called method @@ -83,6 +84,93 @@ void HInliner::Run() { } } +static bool IsMethodOrDeclaringClassFinal(ArtMethod* method) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return method->IsFinal() || method->GetDeclaringClass()->IsFinal(); +} + +/** + * Given the `resolved_method` looked up in the dex cache, try to find + * the actual runtime target of an interface or virtual call. + * Return nullptr if the runtime target cannot be proven. + */ +static ArtMethod* FindVirtualOrInterfaceTarget(HInvoke* invoke, ArtMethod* resolved_method) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + if (IsMethodOrDeclaringClassFinal(resolved_method)) { + // No need to lookup further, the resolved method will be the target. 
+ return resolved_method; + } + + HInstruction* receiver = invoke->InputAt(0); + if (receiver->IsNullCheck()) { + // Due to multiple levels of inlining within the same pass, it might be that + // null check does not have the reference type of the actual receiver. + receiver = receiver->InputAt(0); + } + ReferenceTypeInfo info = receiver->GetReferenceTypeInfo(); + if (info.IsTop()) { + // We have no information on the receiver. + return nullptr; + } else if (!info.IsExact()) { + // We currently only support inlining with known receivers. + // TODO: Remove this check, we should be able to inline final methods + // on unknown receivers. + return nullptr; + } else if (info.GetTypeHandle()->IsInterface()) { + // Statically knowing that the receiver has an interface type cannot + // help us find what is the target method. + return nullptr; + } else if (!resolved_method->GetDeclaringClass()->IsAssignableFrom(info.GetTypeHandle().Get())) { + // The method that we're trying to call is not in the receiver's class or super classes. + return nullptr; + } + + ClassLinker* cl = Runtime::Current()->GetClassLinker(); + size_t pointer_size = cl->GetImagePointerSize(); + if (invoke->IsInvokeInterface()) { + resolved_method = info.GetTypeHandle()->FindVirtualMethodForInterface( + resolved_method, pointer_size); + } else { + DCHECK(invoke->IsInvokeVirtual()); + resolved_method = info.GetTypeHandle()->FindVirtualMethodForVirtual( + resolved_method, pointer_size); + } + + if (resolved_method == nullptr) { + // The information we had on the receiver was not enough to find + // the target method. Since we check above the exact type of the receiver, + // the only reason this can happen is an IncompatibleClassChangeError. + return nullptr; + } else if (resolved_method->IsAbstract()) { + // The information we had on the receiver was not enough to find + // the target method. Since we check above the exact type of the receiver, + // the only reason this can happen is an IncompatibleClassChangeError. + return nullptr; + } else if (IsMethodOrDeclaringClassFinal(resolved_method)) { + // A final method has to be the target method. + return resolved_method; + } else if (info.IsExact()) { + // If we found a method and the receiver's concrete type is statically + // known, we know for sure the target. + return resolved_method; + } else { + // Even if we did find a method, the receiver type was not enough to + // statically find the runtime target. 
+ return nullptr; + } +} + +static uint32_t FindMethodIndexIn(ArtMethod* method, + const DexFile& dex_file, + uint32_t referrer_index) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + if (method->GetDexFile()->GetLocation().compare(dex_file.GetLocation()) == 0) { + return method->GetDexMethodIndex(); + } else { + return method->FindDexMethodIndexInOtherDexFile(dex_file, referrer_index); + } +} + bool HInliner::TryInline(HInvoke* invoke_instruction, uint32_t method_index) const { ScopedObjectAccess soa(Thread::Current()); const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); @@ -99,6 +187,25 @@ bool HInliner::TryInline(HInvoke* invoke_instruction, uint32_t method_index) con return false; } + if (!invoke_instruction->IsInvokeStaticOrDirect()) { + resolved_method = FindVirtualOrInterfaceTarget(invoke_instruction, resolved_method); + if (resolved_method == nullptr) { + VLOG(compiler) << "Interface or virtual call to " + << PrettyMethod(method_index, caller_dex_file) + << " could not be statically determined"; + return false; + } + // We have found a method, but we need to find where that method is for the caller's + // dex file. + method_index = FindMethodIndexIn(resolved_method, caller_dex_file, method_index); + if (method_index == DexFile::kDexNoIndex) { + VLOG(compiler) << "Interface or virtual call to " + << PrettyMethod(resolved_method) + << " cannot be inlined because unaccessible to caller"; + return false; + } + } + bool same_dex_file = true; const DexFile& outer_dex_file = *outer_compilation_unit_.GetDexFile(); if (resolved_method->GetDexFile()->GetLocation().compare(outer_dex_file.GetLocation()) != 0) { @@ -149,7 +256,7 @@ bool HInliner::TryInline(HInvoke* invoke_instruction, uint32_t method_index) con return false; } - if (!TryBuildAndInline(resolved_method, invoke_instruction, method_index, same_dex_file)) { + if (!TryBuildAndInline(resolved_method, invoke_instruction, same_dex_file)) { return false; } @@ -160,11 +267,11 @@ bool HInliner::TryInline(HInvoke* invoke_instruction, uint32_t method_index) con bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, HInvoke* invoke_instruction, - uint32_t method_index, bool same_dex_file) const { ScopedObjectAccess soa(Thread::Current()); const DexFile::CodeItem* code_item = resolved_method->GetCodeItem(); - const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); + const DexFile& callee_dex_file = *resolved_method->GetDexFile(); + uint32_t method_index = resolved_method->GetDexMethodIndex(); DexCompilationUnit dex_compilation_unit( nullptr, @@ -204,7 +311,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, } HGraph* callee_graph = new (graph_->GetArena()) HGraph( graph_->GetArena(), - caller_dex_file, + callee_dex_file, method_index, requires_ctor_barrier, compiler_driver_->GetInstructionSet(), @@ -221,7 +328,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, &inline_stats); if (!builder.BuildGraph(*code_item)) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be built, so cannot be inlined"; // There could be multiple reasons why the graph could not be built, including // unaccessible methods/fields due to using a different dex cache. 
We do not mark @@ -231,14 +338,14 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, if (!RegisterAllocator::CanAllocateRegistersFor(*callee_graph, compiler_driver_->GetInstructionSet())) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " cannot be inlined because of the register allocator"; resolved_method->SetShouldNotInline(); return false; } if (!callee_graph->TryBuildingSsa()) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be transformed to SSA"; resolved_method->SetShouldNotInline(); return false; @@ -247,11 +354,13 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, // Run simple optimizations on the graph. HDeadCodeElimination dce(callee_graph, stats_); HConstantFolding fold(callee_graph); + ReferenceTypePropagation type_propagation(callee_graph, handles_); InstructionSimplifier simplify(callee_graph, stats_); HOptimization* optimizations[] = { &dce, &fold, + &type_propagation, &simplify, }; @@ -265,6 +374,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, outer_compilation_unit_, dex_compilation_unit, compiler_driver_, + handles_, stats_, depth_ + 1); inliner.Run(); @@ -275,7 +385,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, // a throw predecessor. HBasicBlock* exit_block = callee_graph->GetExitBlock(); if (exit_block == nullptr) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because it has an infinite loop"; resolved_method->SetShouldNotInline(); return false; @@ -289,7 +399,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, } } if (has_throw_predecessor) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because one branch always throws"; resolved_method->SetShouldNotInline(); return false; @@ -300,7 +410,7 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, for (; !it.Done(); it.Advance()) { HBasicBlock* block = it.Current(); if (block->IsLoopHeader()) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because it contains a loop"; resolved_method->SetShouldNotInline(); return false; @@ -314,21 +424,21 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, if (current->IsInvokeInterface()) { // Disable inlining of interface calls. The cost in case of entering the // resolution conflict is currently too high. 
- VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because it has an interface call."; resolved_method->SetShouldNotInline(); return false; } if (!same_dex_file && current->NeedsEnvironment()) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because " << current->DebugName() << " needs an environment and is in a different dex file"; return false; } if (!same_dex_file && current->NeedsDexCache()) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) << " could not be inlined because " << current->DebugName() << " it is in a different dex file and requires access to the dex cache"; // Do not flag the method as not-inlineable. A caller within the same diff --git a/compiler/optimizing/inliner.h b/compiler/optimizing/inliner.h index f7d8cf8715..24044b73a1 100644 --- a/compiler/optimizing/inliner.h +++ b/compiler/optimizing/inliner.h @@ -34,13 +34,15 @@ class HInliner : public HOptimization { const DexCompilationUnit& outer_compilation_unit, const DexCompilationUnit& caller_compilation_unit, CompilerDriver* compiler_driver, + StackHandleScopeCollection* handles, OptimizingCompilerStats* stats, size_t depth = 0) : HOptimization(outer_graph, true, kInlinerPassName, stats), outer_compilation_unit_(outer_compilation_unit), caller_compilation_unit_(caller_compilation_unit), compiler_driver_(compiler_driver), - depth_(depth) {} + depth_(depth), + handles_(handles) {} void Run() OVERRIDE; @@ -50,13 +52,13 @@ class HInliner : public HOptimization { bool TryInline(HInvoke* invoke_instruction, uint32_t method_index) const; bool TryBuildAndInline(ArtMethod* resolved_method, HInvoke* invoke_instruction, - uint32_t method_index, bool same_dex_file) const; const DexCompilationUnit& outer_compilation_unit_; const DexCompilationUnit& caller_compilation_unit_; CompilerDriver* const compiler_driver_; const size_t depth_; + StackHandleScopeCollection* const handles_; DISALLOW_COPY_AND_ASSIGN(HInliner); }; diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index fcb3471821..2daeeb3c0c 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -186,33 +186,94 @@ bool InstructionSimplifierVisitor::IsDominatedByInputNullCheck(HInstruction* ins return false; } -void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) { - HLoadClass* load_class = check_cast->InputAt(1)->AsLoadClass(); - if (!check_cast->InputAt(0)->CanBeNull() || IsDominatedByInputNullCheck(check_cast)) { - check_cast->ClearMustDoNullCheck(); - } - - if (!load_class->IsResolved()) { +// Returns whether doing a type test between the class of `object` against `klass` has +// a statically known outcome. The result of the test is stored in `outcome`. +static bool TypeCheckHasKnownOutcome(HLoadClass* klass, HInstruction* object, bool* outcome) { + if (!klass->IsResolved()) { // If the class couldn't be resolve it's not safe to compare against it. It's // default type would be Top which might be wider that the actual class type // and thus producing wrong results. 
- return; + return false; } - ReferenceTypeInfo obj_rti = check_cast->InputAt(0)->GetReferenceTypeInfo(); - ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI(); + + ReferenceTypeInfo obj_rti = object->GetReferenceTypeInfo(); + ReferenceTypeInfo class_rti = klass->GetLoadedClassRTI(); ScopedObjectAccess soa(Thread::Current()); if (class_rti.IsSupertypeOf(obj_rti)) { + *outcome = true; + return true; + } else if (obj_rti.IsExact()) { + // The test failed at compile time so will also fail at runtime. + *outcome = false; + return true; + } else if (!class_rti.IsInterface() + && !obj_rti.IsInterface() + && !obj_rti.IsSupertypeOf(class_rti)) { + // Different type hierarchy. The test will fail. + *outcome = false; + return true; + } + return false; +} + +void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) { + HInstruction* object = check_cast->InputAt(0); + if (!object->CanBeNull() || IsDominatedByInputNullCheck(check_cast)) { + check_cast->ClearMustDoNullCheck(); + } + + if (object->IsNullConstant()) { check_cast->GetBlock()->RemoveInstruction(check_cast); if (stats_ != nullptr) { stats_->RecordStat(MethodCompilationStat::kRemovedCheckedCast); } + return; + } + + bool outcome; + if (TypeCheckHasKnownOutcome(check_cast->InputAt(1)->AsLoadClass(), object, &outcome)) { + if (outcome) { + check_cast->GetBlock()->RemoveInstruction(check_cast); + if (stats_ != nullptr) { + stats_->RecordStat(MethodCompilationStat::kRemovedCheckedCast); + } + } else { + // Don't do anything for exceptional cases for now. Ideally we should remove + // all instructions and blocks this instruction dominates. + } } } void InstructionSimplifierVisitor::VisitInstanceOf(HInstanceOf* instruction) { - if (!instruction->InputAt(0)->CanBeNull() || IsDominatedByInputNullCheck(instruction)) { + HInstruction* object = instruction->InputAt(0); + bool can_be_null = true; + if (!object->CanBeNull() || IsDominatedByInputNullCheck(instruction)) { + can_be_null = false; instruction->ClearMustDoNullCheck(); } + + HGraph* graph = GetGraph(); + if (object->IsNullConstant()) { + instruction->ReplaceWith(graph->GetIntConstant(0)); + instruction->GetBlock()->RemoveInstruction(instruction); + RecordSimplification(); + return; + } + + bool outcome; + if (TypeCheckHasKnownOutcome(instruction->InputAt(1)->AsLoadClass(), object, &outcome)) { + if (outcome && can_be_null) { + // Type test will succeed, we just need a null test. + HNotEqual* test = new (graph->GetArena()) HNotEqual(graph->GetNullConstant(), object); + instruction->GetBlock()->InsertInstructionBefore(test, instruction); + instruction->ReplaceWith(test); + } else { + // We've statically determined the result of the instanceof. 
+ instruction->ReplaceWith(graph->GetIntConstant(outcome)); + } + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + } } void InstructionSimplifierVisitor::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h index 024462081f..668956a614 100644 --- a/compiler/optimizing/instruction_simplifier.h +++ b/compiler/optimizing/instruction_simplifier.h @@ -36,6 +36,9 @@ class InstructionSimplifier : public HOptimization { static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier"; void Run() OVERRIDE; + + private: + DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier); }; } // namespace art diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index cd91d2c87b..68c197e607 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -288,7 +288,10 @@ void HGraph::InsertConstant(HConstant* constant) { } HNullConstant* HGraph::GetNullConstant() { - if (cached_null_constant_ == nullptr) { + // For simplicity, don't bother reviving the cached null constant if it is + // not null and not in a block. Otherwise, we need to clear the instruction + // id and/or any invariants the graph is assuming when adding new instructions. + if ((cached_null_constant_ == nullptr) || (cached_null_constant_->GetBlock() == nullptr)) { cached_null_constant_ = new (arena_) HNullConstant(); InsertConstant(cached_null_constant_); } @@ -296,7 +299,10 @@ HNullConstant* HGraph::GetNullConstant() { } HCurrentMethod* HGraph::GetCurrentMethod() { - if (cached_current_method_ == nullptr) { + // For simplicity, don't bother reviving the cached current method if it is + // not null and not in a block. Otherwise, we need to clear the instruction + // id and/or any invariants the graph is assuming when adding new instructions. + if ((cached_current_method_ == nullptr) || (cached_current_method_->GetBlock() == nullptr)) { cached_current_method_ = new (arena_) HCurrentMethod( Is64BitInstructionSet(instruction_set_) ? Primitive::kPrimLong : Primitive::kPrimInt); if (entry_block_->GetFirstInstruction() == nullptr) { @@ -1510,6 +1516,81 @@ void HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { invoke->GetBlock()->RemoveInstruction(invoke); } +/* + * Loop will be transformed to: + * old_pre_header + * | + * if_block + * / \ + * dummy_block deopt_block + * \ / + * new_pre_header + * | + * header + */ +void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) { + DCHECK(header->IsLoopHeader()); + HBasicBlock* pre_header = header->GetDominator(); + + // Need this to avoid critical edge. + HBasicBlock* if_block = new (arena_) HBasicBlock(this, header->GetDexPc()); + // Need this to avoid critical edge. 
+ HBasicBlock* dummy_block = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* deopt_block = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* new_pre_header = new (arena_) HBasicBlock(this, header->GetDexPc()); + AddBlock(if_block); + AddBlock(dummy_block); + AddBlock(deopt_block); + AddBlock(new_pre_header); + + header->ReplacePredecessor(pre_header, new_pre_header); + pre_header->successors_.Reset(); + pre_header->dominated_blocks_.Reset(); + + pre_header->AddSuccessor(if_block); + if_block->AddSuccessor(dummy_block); // True successor + if_block->AddSuccessor(deopt_block); // False successor + dummy_block->AddSuccessor(new_pre_header); + deopt_block->AddSuccessor(new_pre_header); + + pre_header->dominated_blocks_.Add(if_block); + if_block->SetDominator(pre_header); + if_block->dominated_blocks_.Add(dummy_block); + dummy_block->SetDominator(if_block); + if_block->dominated_blocks_.Add(deopt_block); + deopt_block->SetDominator(if_block); + if_block->dominated_blocks_.Add(new_pre_header); + new_pre_header->SetDominator(if_block); + new_pre_header->dominated_blocks_.Add(header); + header->SetDominator(new_pre_header); + + size_t index_of_header = 0; + while (reverse_post_order_.Get(index_of_header) != header) { + index_of_header++; + } + MakeRoomFor(&reverse_post_order_, 4, index_of_header - 1); + reverse_post_order_.Put(index_of_header++, if_block); + reverse_post_order_.Put(index_of_header++, dummy_block); + reverse_post_order_.Put(index_of_header++, deopt_block); + reverse_post_order_.Put(index_of_header++, new_pre_header); + + HLoopInformation* info = pre_header->GetLoopInformation(); + if (info != nullptr) { + if_block->SetLoopInformation(info); + dummy_block->SetLoopInformation(info); + deopt_block->SetLoopInformation(info); + new_pre_header->SetLoopInformation(info); + for (HLoopInformationOutwardIterator loop_it(*pre_header); + !loop_it.Done(); + loop_it.Advance()) { + loop_it.Current()->Add(if_block); + loop_it.Current()->Add(dummy_block); + loop_it.Current()->Add(deopt_block); + loop_it.Current()->Add(new_pre_header); + } + } +} + std::ostream& operator<<(std::ostream& os, const ReferenceTypeInfo& rhs) { ScopedObjectAccess soa(Thread::Current()); os << "[" diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index f87775e195..9443653db7 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -195,6 +195,10 @@ class HGraph : public ArenaObject<kArenaAllocMisc> { // Inline this graph in `outer_graph`, replacing the given `invoke` instruction. void InlineInto(HGraph* outer_graph, HInvoke* invoke); + // Need to add a couple of blocks to test if the loop body is entered and + // put deoptimization instructions, etc. + void TransformLoopHeaderForBCE(HBasicBlock* header); + // Removes `block` from the graph. void DeleteDeadBlock(HBasicBlock* block); @@ -331,6 +335,7 @@ class HGraph : public ArenaObject<kArenaAllocMisc> { } // If not found or previously deleted, create and cache a new instruction. + // Don't bother reviving a previously deleted instruction, for simplicity. 
if (constant == nullptr || constant->GetBlock() == nullptr) { constant = new (arena_) InstructionType(value); cache->Overwrite(value, constant); @@ -824,7 +829,7 @@ class HLoopInformationOutwardIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HLoopInformationOutwardIterator); }; -#define FOR_EACH_CONCRETE_INSTRUCTION(M) \ +#define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ M(Add, BinaryOperation) \ M(And, BinaryOperation) \ M(ArrayGet, Instruction) \ @@ -894,6 +899,21 @@ class HLoopInformationOutwardIterator : public ValueObject { M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ +#define FOR_EACH_CONCRETE_INSTRUCTION_ARM(M) + +#define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) + +#define FOR_EACH_CONCRETE_INSTRUCTION_X86(M) + +#define FOR_EACH_CONCRETE_INSTRUCTION_X86_64(M) + +#define FOR_EACH_CONCRETE_INSTRUCTION(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_ARM(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_X86(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_X86_64(M) + #define FOR_EACH_INSTRUCTION(M) \ FOR_EACH_CONCRETE_INSTRUCTION(M) \ M(Constant, Instruction) \ @@ -1281,6 +1301,9 @@ class ReferenceTypeInfo : ValueObject { bool IsExact() const { return is_exact_; } bool IsTop() const { return is_top_; } + bool IsInterface() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + return !IsTop() && GetTypeHandle()->IsInterface(); + } Handle<mirror::Class> GetTypeHandle() const { return type_handle_; } @@ -2461,7 +2484,7 @@ class HInvoke : public HInstruction { intrinsic_ = intrinsic; } - bool IsInlined() const { + bool IsFromInlinedInvoke() const { return GetEnvironment()->GetParent() != nullptr; } @@ -3581,7 +3604,7 @@ class HLoadClass : public HExpression<1> { bool CanThrow() const OVERRIDE { // May call runtime and therefore can throw. // TODO: finer grain decision. - return !is_referrers_class_; + return CanCallRuntime(); } ReferenceTypeInfo GetLoadedClassRTI() { @@ -4246,6 +4269,39 @@ class HBlocksInLoopIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopIterator); }; +// Iterator over the blocks that are part of the loop. Includes blocks part +// of an inner loop. The order in which the blocks are iterated is reverse +// post order. +class HBlocksInLoopReversePostOrderIterator : public ValueObject { + public: + explicit HBlocksInLoopReversePostOrderIterator(const HLoopInformation& info) + : blocks_in_loop_(info.GetBlocks()), + blocks_(info.GetHeader()->GetGraph()->GetReversePostOrder()), + index_(0) { + if (!blocks_in_loop_.IsBitSet(blocks_.Get(index_)->GetBlockId())) { + Advance(); + } + } + + bool Done() const { return index_ == blocks_.Size(); } + HBasicBlock* Current() const { return blocks_.Get(index_); } + void Advance() { + ++index_; + for (size_t e = blocks_.Size(); index_ < e; ++index_) { + if (blocks_in_loop_.IsBitSet(blocks_.Get(index_)->GetBlockId())) { + break; + } + } + } + + private: + const BitVector& blocks_in_loop_; + const GrowableArray<HBasicBlock*>& blocks_; + size_t index_; + + DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator); +}; + inline int64_t Int64FromConstant(HConstant* constant) { DCHECK(constant->IsIntConstant() || constant->IsLongConstant()); return constant->IsIntConstant() ?
constant->AsIntConstant()->GetValue() diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index ccf8de9f6a..2d1c0ba9f9 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -17,6 +17,7 @@ #ifndef ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_ #define ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_ +#include "base/arena_object.h" #include "nodes.h" #include "optimizing_compiler_stats.h" @@ -25,7 +26,7 @@ namespace art { /** * Abstraction to implement an optimization pass. */ -class HOptimization : public ValueObject { +class HOptimization : public ArenaObject<kArenaAllocMisc> { public: HOptimization(HGraph* graph, bool is_in_ssa_form, diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc index b0d1433667..fe3bb1a2b4 100644 --- a/compiler/optimizing/optimizing_cfi_test.cc +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -71,6 +71,8 @@ class OptimizingCFITest : public CFITest { } } } + GrowableArray<HBasicBlock*> blocks(&allocator, 0); + code_gen->block_order_ = &blocks; code_gen->ComputeSpillMask(); code_gen->SetFrameSize(frame_size); code_gen->GenerateFrameEntry(); diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index f6ef2f7e82..8d43adaada 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -318,43 +318,61 @@ static void RunOptimizations(HGraph* graph, const DexCompilationUnit& dex_compilation_unit, PassInfoPrinter* pass_info_printer, StackHandleScopeCollection* handles) { - HDeadCodeElimination dce1(graph, stats, - HDeadCodeElimination::kInitialDeadCodeEliminationPassName); - HDeadCodeElimination dce2(graph, stats, - HDeadCodeElimination::kFinalDeadCodeEliminationPassName); - HConstantFolding fold1(graph); - InstructionSimplifier simplify1(graph, stats); - HBooleanSimplifier boolean_simplify(graph); - - HInliner inliner(graph, dex_compilation_unit, dex_compilation_unit, driver, stats); - - HConstantFolding fold2(graph, "constant_folding_after_inlining"); - SideEffectsAnalysis side_effects(graph); - GVNOptimization gvn(graph, side_effects); - LICM licm(graph, side_effects); - BoundsCheckElimination bce(graph); - ReferenceTypePropagation type_propagation(graph, handles); - InstructionSimplifier simplify2(graph, stats, "instruction_simplifier_after_types"); - - IntrinsicsRecognizer intrinsics(graph, driver); + ArenaAllocator* arena = graph->GetArena(); + HDeadCodeElimination* dce1 = new (arena) HDeadCodeElimination( + graph, stats, HDeadCodeElimination::kInitialDeadCodeEliminationPassName); + HDeadCodeElimination* dce2 = new (arena) HDeadCodeElimination( + graph, stats, HDeadCodeElimination::kFinalDeadCodeEliminationPassName); + HConstantFolding* fold1 = new (arena) HConstantFolding(graph); + InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats); + HBooleanSimplifier* boolean_simplify = new (arena) HBooleanSimplifier(graph); + + HInliner* inliner = new (arena) HInliner( + graph, dex_compilation_unit, dex_compilation_unit, driver, handles, stats); + + HConstantFolding* fold2 = new (arena) HConstantFolding(graph, "constant_folding_after_inlining"); + SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); + GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects); + LICM* licm = new (arena) LICM(graph, *side_effects); + BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph); + ReferenceTypePropagation* 
type_propagation = + new (arena) ReferenceTypePropagation(graph, handles); + InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier( + graph, stats, "instruction_simplifier_after_types"); + InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier( + graph, stats, "instruction_simplifier_after_bce"); + ReferenceTypePropagation* type_propagation2 = + new (arena) ReferenceTypePropagation(graph, handles); + InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier( + graph, stats, "instruction_simplifier_before_codegen"); + + IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, driver); HOptimization* optimizations[] = { - &intrinsics, - &dce1, - &fold1, - &simplify1, - &inliner, + intrinsics, + fold1, + simplify1, + type_propagation, + dce1, + simplify2, + inliner, + // Run another type propagation phase: inlining will open up more opportunities + // to remove checkcast/instanceof and null checks. + type_propagation2, // BooleanSimplifier depends on the InstructionSimplifier removing redundant // suspend checks to recognize empty blocks. - &boolean_simplify, - &fold2, - &side_effects, - &gvn, - &licm, - &bce, - &type_propagation, - &simplify2, - &dce2, + boolean_simplify, + fold2, + side_effects, + gvn, + licm, + bce, + simplify3, + dce2, + // The codegen has a few assumptions that only the instruction simplifier can + // satisfy. For example, the code generator does not expect to see a + // HTypeConversion from a type to the same type. + simplify4, }; RunOptimizations(optimizations, arraysize(optimizations), pass_info_printer); diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index b6b1bb1cad..53d052b2bc 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -19,6 +19,7 @@ #include <sstream> #include <string> +#include <type_traits> #include "atomic.h" @@ -37,8 +38,8 @@ enum MethodCompilationStat { kNotCompiledClassNotVerified, kNotCompiledHugeMethod, kNotCompiledLargeMethodNoBranches, + kNotCompiledMalformedOpcode, kNotCompiledNoCodegen, - kNotCompiledNonSequentialRegPair, kNotCompiledPathological, kNotCompiledSpaceFilter, kNotCompiledUnhandledInstruction, @@ -84,14 +85,15 @@ class OptimizingCompilerStats { for (int i = 0; i < kLastStat; i++) { if (compile_stats_[i] != 0) { - LOG(INFO) << PrintMethodCompilationStat(i) << ": " << compile_stats_[i]; + LOG(INFO) << PrintMethodCompilationStat(static_cast<MethodCompilationStat>(i)) << ": " + << compile_stats_[i]; } } } } private: - std::string PrintMethodCompilationStat(int stat) const { + std::string PrintMethodCompilationStat(MethodCompilationStat stat) const { switch (stat) { case kAttemptCompilation : return "kAttemptCompilation"; case kCompiledBaseline : return "kCompiledBaseline"; @@ -105,8 +107,8 @@ class OptimizingCompilerStats { case kNotCompiledClassNotVerified : return "kNotCompiledClassNotVerified"; case kNotCompiledHugeMethod : return "kNotCompiledHugeMethod"; case kNotCompiledLargeMethodNoBranches : return "kNotCompiledLargeMethodNoBranches"; + case kNotCompiledMalformedOpcode : return "kNotCompiledMalformedOpcode"; case kNotCompiledNoCodegen : return "kNotCompiledNoCodegen"; - case kNotCompiledNonSequentialRegPair : return "kNotCompiledNonSequentialRegPair"; case kNotCompiledPathological : return "kNotCompiledPathological"; case kNotCompiledSpaceFilter : return "kNotCompiledSpaceFilter"; case kNotCompiledUnhandledInstruction : return
"kNotCompiledUnhandledInstruction"; @@ -120,9 +122,12 @@ class OptimizingCompilerStats { case kRemovedCheckedCast: return "kRemovedCheckedCast"; case kRemovedDeadInstruction: return "kRemovedDeadInstruction"; case kRemovedNullCheck: return "kRemovedNullCheck"; - default: LOG(FATAL) << "invalid stat"; + + case kLastStat: break; // Invalid to print out. } - return ""; + LOG(FATAL) << "invalid stat " + << static_cast<std::underlying_type<MethodCompilationStat>::type>(stat); + UNREACHABLE(); } AtomicInteger compile_stats_[kLastStat]; diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc index a249aa9711..ca928ae0f2 100644 --- a/compiler/optimizing/prepare_for_register_allocation.cc +++ b/compiler/optimizing/prepare_for_register_allocation.cc @@ -86,16 +86,6 @@ void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDire DCHECK(last_input != nullptr) << "Last input is not HLoadClass. It is " << last_input->DebugName(); - // The static call will initialize the class so there's no need for a clinit check if - // it's the first user. - // There is one special case where we still need the clinit check, when inlining. Because - // currently the callee is responsible for reporting parameters to the GC, the code - // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC. - // Therefore we cannot allocate any object in that code, including loading a new class. - if (last_input == invoke->GetPrevious() && !invoke->IsInlined()) { - last_input->SetMustGenerateClinitCheck(false); - } - // Remove a load class instruction as last input of a static // invoke, which has been added (along with a clinit check, // removed by PrepareForRegisterAllocation::VisitClinitCheck @@ -104,10 +94,20 @@ void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDire // stage (i.e., after inlining has been performed). invoke->RemoveLoadClassAsLastInput(); - // If the load class instruction is no longer used, remove it from - // the graph. - if (!last_input->HasUses() && !(last_input->MustGenerateClinitCheck() && invoke->IsInlined())) { - last_input->GetBlock()->RemoveInstruction(last_input); + // The static call will initialize the class so there's no need for a clinit check if + // it's the first user. + // There is one special case where we still need the clinit check, when inlining. Because + // currently the callee is responsible for reporting parameters to the GC, the code + // that walks the stack during `artQuickResolutionTrampoline` cannot be interrupted for GC. + // Therefore we cannot allocate any object in that code, including loading a new class. + if (last_input == invoke->GetPrevious() && !invoke->IsFromInlinedInvoke()) { + last_input->SetMustGenerateClinitCheck(false); + + // If the load class instruction is no longer used, remove it from + // the graph. 
+ if (!last_input->HasUses()) { + last_input->GetBlock()->RemoveInstruction(last_input); + } } } } diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc index 4f1f45769d..a048c856c5 100644 --- a/compiler/optimizing/reference_type_propagation.cc +++ b/compiler/optimizing/reference_type_propagation.cc @@ -23,6 +23,30 @@ namespace art { +class RTPVisitor : public HGraphDelegateVisitor { + public: + RTPVisitor(HGraph* graph, StackHandleScopeCollection* handles) + : HGraphDelegateVisitor(graph), + handles_(handles) {} + + void VisitNewInstance(HNewInstance* new_instance) OVERRIDE; + void VisitLoadClass(HLoadClass* load_class) OVERRIDE; + void VisitNewArray(HNewArray* instr) OVERRIDE; + void UpdateFieldAccessTypeInfo(HInstruction* instr, const FieldInfo& info); + void SetClassAsTypeInfo(HInstruction* instr, mirror::Class* klass, bool is_exact); + void VisitInstanceFieldGet(HInstanceFieldGet* instr) OVERRIDE; + void VisitStaticFieldGet(HStaticFieldGet* instr) OVERRIDE; + void VisitInvoke(HInvoke* instr) OVERRIDE; + void VisitArrayGet(HArrayGet* instr) OVERRIDE; + void UpdateReferenceTypeInfo(HInstruction* instr, + uint16_t type_idx, + const DexFile& dex_file, + bool is_exact); + + private: + StackHandleScopeCollection* handles_; +}; + void ReferenceTypePropagation::Run() { // To properly propagate type info we need to visit in the dominator-based order. // Reverse post order guarantees a node's dominators are visited first. @@ -35,23 +59,13 @@ void ReferenceTypePropagation::Run() { void ReferenceTypePropagation::VisitBasicBlock(HBasicBlock* block) { // TODO: handle other instructions that give type info - // (Call/array accesses) + // (array accesses) + RTPVisitor visitor(graph_, handles_); // Initialize exact types first for faster convergence. for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { HInstruction* instr = it.Current(); - // TODO: Make ReferenceTypePropagation a visitor or create a new one. - if (instr->IsNewInstance()) { - VisitNewInstance(instr->AsNewInstance()); - } else if (instr->IsLoadClass()) { - VisitLoadClass(instr->AsLoadClass()); - } else if (instr->IsNewArray()) { - VisitNewArray(instr->AsNewArray()); - } else if (instr->IsInstanceFieldGet()) { - VisitInstanceFieldGet(instr->AsInstanceFieldGet()); - } else if (instr->IsStaticFieldGet()) { - VisitStaticFieldGet(instr->AsStaticFieldGet()); - } + instr->Accept(&visitor); } // Handle Phis. 
@@ -166,35 +180,39 @@ void ReferenceTypePropagation::BoundTypeForIfInstanceOf(HBasicBlock* block) { } } -void ReferenceTypePropagation::SetClassAsTypeInfo(HInstruction* instr, mirror::Class* klass) { +void RTPVisitor::SetClassAsTypeInfo(HInstruction* instr, + mirror::Class* klass, + bool is_exact) { if (klass != nullptr) { ScopedObjectAccess soa(Thread::Current()); MutableHandle<mirror::Class> handle = handles_->NewHandle(klass); - instr->SetReferenceTypeInfo(ReferenceTypeInfo::Create(handle, true)); + is_exact = is_exact || klass->IsFinal(); + instr->SetReferenceTypeInfo(ReferenceTypeInfo::Create(handle, is_exact)); } } -void ReferenceTypePropagation::UpdateReferenceTypeInfo(HInstruction* instr, - uint16_t type_idx, - const DexFile& dex_file) { +void RTPVisitor::UpdateReferenceTypeInfo(HInstruction* instr, + uint16_t type_idx, + const DexFile& dex_file, + bool is_exact) { DCHECK_EQ(instr->GetType(), Primitive::kPrimNot); ScopedObjectAccess soa(Thread::Current()); mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(dex_file); // Get type from dex cache assuming it was populated by the verifier. - SetClassAsTypeInfo(instr, dex_cache->GetResolvedType(type_idx)); + SetClassAsTypeInfo(instr, dex_cache->GetResolvedType(type_idx), is_exact); } -void ReferenceTypePropagation::VisitNewInstance(HNewInstance* instr) { - UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile()); +void RTPVisitor::VisitNewInstance(HNewInstance* instr) { + UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile(), /* is_exact */ true); } -void ReferenceTypePropagation::VisitNewArray(HNewArray* instr) { - UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile()); +void RTPVisitor::VisitNewArray(HNewArray* instr) { + UpdateReferenceTypeInfo(instr, instr->GetTypeIndex(), instr->GetDexFile(), /* is_exact */ true); } -void ReferenceTypePropagation::UpdateFieldAccessTypeInfo(HInstruction* instr, - const FieldInfo& info) { +void RTPVisitor::UpdateFieldAccessTypeInfo(HInstruction* instr, + const FieldInfo& info) { // The field index is unknown only during tests. 
if (instr->GetType() != Primitive::kPrimNot || info.GetFieldIndex() == kUnknownFieldIndex) { return; @@ -206,18 +224,18 @@ void ReferenceTypePropagation::UpdateFieldAccessTypeInfo(HInstruction* instr, ArtField* field = cl->GetResolvedField(info.GetFieldIndex(), dex_cache); DCHECK(field != nullptr); mirror::Class* klass = field->GetType<false>(); - SetClassAsTypeInfo(instr, klass); + SetClassAsTypeInfo(instr, klass, /* is_exact */ false); } -void ReferenceTypePropagation::VisitInstanceFieldGet(HInstanceFieldGet* instr) { +void RTPVisitor::VisitInstanceFieldGet(HInstanceFieldGet* instr) { UpdateFieldAccessTypeInfo(instr, instr->GetFieldInfo()); } -void ReferenceTypePropagation::VisitStaticFieldGet(HStaticFieldGet* instr) { +void RTPVisitor::VisitStaticFieldGet(HStaticFieldGet* instr) { UpdateFieldAccessTypeInfo(instr, instr->GetFieldInfo()); } -void ReferenceTypePropagation::VisitLoadClass(HLoadClass* instr) { +void RTPVisitor::VisitLoadClass(HLoadClass* instr) { ScopedObjectAccess soa(Thread::Current()); mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(instr->GetDexFile()); @@ -295,6 +313,34 @@ bool ReferenceTypePropagation::UpdateReferenceTypeInfo(HInstruction* instr) { return !previous_rti.IsEqual(instr->GetReferenceTypeInfo()); } +void RTPVisitor::VisitInvoke(HInvoke* instr) { + if (instr->GetType() != Primitive::kPrimNot) { + return; + } + + ScopedObjectAccess soa(Thread::Current()); + ClassLinker* cl = Runtime::Current()->GetClassLinker(); + mirror::DexCache* dex_cache = cl->FindDexCache(instr->GetDexFile()); + ArtMethod* method = dex_cache->GetResolvedMethod( + instr->GetDexMethodIndex(), cl->GetImagePointerSize()); + DCHECK(method != nullptr); + mirror::Class* klass = method->GetReturnType(false); + SetClassAsTypeInfo(instr, klass, /* is_exact */ false); +} + +void RTPVisitor::VisitArrayGet(HArrayGet* instr) { + if (instr->GetType() != Primitive::kPrimNot) { + return; + } + + HInstruction* parent = instr->InputAt(0); + ScopedObjectAccess soa(Thread::Current()); + Handle<mirror::Class> handle = parent->GetReferenceTypeInfo().GetTypeHandle(); + if (handle.GetReference() != nullptr && handle->IsObjectArrayClass()) { + SetClassAsTypeInfo(instr, handle->GetComponentType(), /* is_exact */ false); + } +} + void ReferenceTypePropagation::UpdateBoundType(HBoundType* instr) { ReferenceTypeInfo new_rti = instr->InputAt(0)->GetReferenceTypeInfo(); // Be sure that we don't go over the bounded type. 
diff --git a/compiler/optimizing/reference_type_propagation.h b/compiler/optimizing/reference_type_propagation.h index 74e425fb3e..0d687d25cb 100644 --- a/compiler/optimizing/reference_type_propagation.h +++ b/compiler/optimizing/reference_type_propagation.h @@ -40,23 +40,12 @@ class ReferenceTypePropagation : public HOptimization { static constexpr const char* kReferenceTypePropagationPassName = "reference_type_propagation"; private: - void VisitNewInstance(HNewInstance* new_instance); - void VisitLoadClass(HLoadClass* load_class); - void VisitNewArray(HNewArray* instr); void VisitPhi(HPhi* phi); void VisitBasicBlock(HBasicBlock* block); - void UpdateFieldAccessTypeInfo(HInstruction* instr, const FieldInfo& info); - void SetClassAsTypeInfo(HInstruction* instr, mirror::Class* klass); - void UpdateBoundType(HBoundType* bound_type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void UpdatePhi(HPhi* phi) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void BoundTypeForIfNotNull(HBasicBlock* block); void BoundTypeForIfInstanceOf(HBasicBlock* block); - void UpdateReferenceTypeInfo(HInstruction* instr, uint16_t type_idx, const DexFile& dex_file); - void VisitInstanceFieldGet(HInstanceFieldGet* instr); - void VisitStaticFieldGet(HStaticFieldGet* instr); - void ProcessWorklist(); void AddToWorklist(HInstruction* instr); void AddDependentInstructionsToWorklist(HInstruction* instr); diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc index c4612af393..2a86e60e14 100644 --- a/compiler/optimizing/ssa_builder.cc +++ b/compiler/optimizing/ssa_builder.cc @@ -184,22 +184,24 @@ void SsaBuilder::FixNullConstantType() { } HInstruction* left = equality_instr->InputAt(0); HInstruction* right = equality_instr->InputAt(1); - HInstruction* null_instr = nullptr; + HInstruction* int_operand = nullptr; - if ((left->GetType() == Primitive::kPrimNot) && right->IsIntConstant()) { - null_instr = right; - } else if ((right->GetType() == Primitive::kPrimNot) && left->IsIntConstant()) { - null_instr = left; + if ((left->GetType() == Primitive::kPrimNot) && (right->GetType() == Primitive::kPrimInt)) { + int_operand = right; + } else if ((right->GetType() == Primitive::kPrimNot) + && (left->GetType() == Primitive::kPrimInt)) { + int_operand = left; } else { continue; } // If we got here, we are comparing against a reference and the int constant // should be replaced with a null constant. - if (null_instr->IsIntConstant()) { - DCHECK_EQ(0, null_instr->AsIntConstant()->GetValue()); - equality_instr->ReplaceInput(GetGraph()->GetNullConstant(), null_instr == right ? 1 : 0); - } + // Both type propagation and redundant phi elimination ensure `int_operand` + // can only be the 0 constant. + DCHECK(int_operand->IsIntConstant()); + DCHECK_EQ(0, int_operand->AsIntConstant()->GetValue()); + equality_instr->ReplaceInput(GetGraph()->GetNullConstant(), int_operand == right ? 1 : 0); } } } @@ -255,21 +257,18 @@ void SsaBuilder::BuildSsa() { PrimitiveTypePropagation type_propagation(GetGraph()); type_propagation.Run(); - // 5) Fix the type for null constants which are part of an equality comparison. - FixNullConstantType(); - - // 6) When creating equivalent phis we copy the inputs of the original phi which - // may be improperly typed. This will be fixed during the type propagation but + // 5) When creating equivalent phis we copy the inputs of the original phi which + // may be improperly typed. 
This was fixed during the type propagation in 4) but // as a result we may end up with two equivalent phis with the same type for // the same dex register. This pass cleans them up. EquivalentPhisCleanup(); - // 7) Mark dead phis again. Step 4) may have introduced new phis. - // Step 6) might enable the death of new phis. + // 6) Mark dead phis again. Step 4) may have introduced new phis. + // Step 5) might enable the death of new phis. SsaDeadPhiElimination dead_phis(GetGraph()); dead_phis.MarkDeadPhis(); - // 8) Now that the graph is correctly typed, we can get rid of redundant phis. + // 7) Now that the graph is correctly typed, we can get rid of redundant phis. // Note that we cannot do this phase before type propagation, otherwise // we could get rid of phi equivalents, whose presence is a requirement for the // type propagation phase. Note that this is to satisfy statement (a) of the @@ -277,6 +276,13 @@ void SsaBuilder::BuildSsa() { SsaRedundantPhiElimination redundant_phi(GetGraph()); redundant_phi.Run(); + // 8) Fix the type for null constants which are part of an equality comparison. + // We need to do this after redundant phi elimination, to ensure the only cases + // that we can see are reference comparison against 0. The redundant phi + // elimination ensures we do not see a phi taking two 0 constants in a HEqual + // or HNotEqual. + FixNullConstantType(); + // 9) Make sure environments use the right phi "equivalent": a phi marked dead // can have a phi equivalent that is not dead. We must therefore update // all environment uses of the dead phi to use its equivalent. Note that there diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc index 42b9182d55..65610d54a6 100644 --- a/compiler/optimizing/stack_map_stream.cc +++ b/compiler/optimizing/stack_map_stream.cc @@ -49,7 +49,6 @@ void StackMapStream::BeginStackMapEntry(uint32_t dex_pc, } dex_pc_max_ = std::max(dex_pc_max_, dex_pc); - native_pc_offset_max_ = std::max(native_pc_offset_max_, native_pc_offset); register_mask_max_ = std::max(register_mask_max_, register_mask); current_dex_register_ = 0; } @@ -128,16 +127,25 @@ void StackMapStream::EndInlineInfoEntry() { current_inline_info_ = InlineInfoEntry(); } +uint32_t StackMapStream::ComputeMaxNativePcOffset() const { + uint32_t max_native_pc_offset = 0u; + for (size_t i = 0, size = stack_maps_.Size(); i != size; ++i) { + max_native_pc_offset = std::max(max_native_pc_offset, stack_maps_.Get(i).native_pc_offset); + } + return max_native_pc_offset; +} + size_t StackMapStream::PrepareForFillIn() { int stack_mask_number_of_bits = stack_mask_max_ + 1; // Need room for max element too. 
stack_mask_size_ = RoundUp(stack_mask_number_of_bits, kBitsPerByte) / kBitsPerByte; inline_info_size_ = ComputeInlineInfoSize(); dex_register_maps_size_ = ComputeDexRegisterMapsSize(); + uint32_t max_native_pc_offset = ComputeMaxNativePcOffset(); stack_map_encoding_ = StackMapEncoding::CreateFromSizes(stack_mask_size_, inline_info_size_, dex_register_maps_size_, dex_pc_max_, - native_pc_offset_max_, + max_native_pc_offset, register_mask_max_); stack_maps_size_ = stack_maps_.Size() * stack_map_encoding_.ComputeStackMapSize(); dex_register_location_catalog_size_ = ComputeDexRegisterLocationCatalogSize(); diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h index 274d573350..550ed70e0f 100644 --- a/compiler/optimizing/stack_map_stream.h +++ b/compiler/optimizing/stack_map_stream.h @@ -67,7 +67,6 @@ class StackMapStream : public ValueObject { inline_infos_(allocator, 2), stack_mask_max_(-1), dex_pc_max_(0), - native_pc_offset_max_(0), register_mask_max_(0), number_of_stack_maps_with_inline_info_(0), dex_map_hash_to_stack_map_indices_(std::less<uint32_t>(), allocator->Adapter()), @@ -126,6 +125,22 @@ class StackMapStream : public ValueObject { uint32_t num_dex_registers); void EndInlineInfoEntry(); + size_t GetNumberOfStackMaps() const { + return stack_maps_.Size(); + } + + const StackMapEntry& GetStackMap(size_t i) const { + DCHECK_LT(i, stack_maps_.Size()); + return stack_maps_.GetRawStorage()[i]; + } + + void SetStackMapNativePcOffset(size_t i, uint32_t native_pc_offset) { + DCHECK_LT(i, stack_maps_.Size()); + stack_maps_.GetRawStorage()[i].native_pc_offset = native_pc_offset; + } + + uint32_t ComputeMaxNativePcOffset() const; + // Prepares the stream to fill in a memory region. Must be called before FillIn. // Returns the size (in bytes) needed to store this stream. size_t PrepareForFillIn(); @@ -163,7 +178,6 @@ class StackMapStream : public ValueObject { GrowableArray<InlineInfoEntry> inline_infos_; int stack_mask_max_; uint32_t dex_pc_max_; - uint32_t native_pc_offset_max_; uint32_t register_mask_max_; size_t number_of_stack_maps_with_inline_info_; diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc index cb51ed8fc8..facc6304e5 100644 --- a/compiler/trampolines/trampoline_compiler.cc +++ b/compiler/trampolines/trampoline_compiler.cc @@ -17,21 +17,21 @@ #include "trampoline_compiler.h" #include "jni_env_ext.h" -#include "utils/arm/assembler_arm.h" +#include "utils/arm/assembler_thumb2.h" #include "utils/arm64/assembler_arm64.h" #include "utils/mips/assembler_mips.h" #include "utils/mips64/assembler_mips64.h" #include "utils/x86/assembler_x86.h" #include "utils/x86_64/assembler_x86_64.h" -#define __ assembler-> +#define __ assembler. namespace art { namespace arm { static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi, ThreadOffset<4> offset) { - std::unique_ptr<ArmAssembler> assembler(static_cast<ArmAssembler*>(Assembler::Create(kThumb2))); + Thumb2Assembler assembler; switch (abi) { case kInterpreterAbi: // Thread* is first argument (R0) in interpreter ABI. 
@@ -46,10 +46,11 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention } __ bkpt(0); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } @@ -58,7 +59,7 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention namespace arm64 { static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi, ThreadOffset<8> offset) { - std::unique_ptr<Arm64Assembler> assembler(static_cast<Arm64Assembler*>(Assembler::Create(kArm64))); + Arm64Assembler assembler; switch (abi) { case kInterpreterAbi: // Thread* is first argument (X0) in interpreter ABI. @@ -82,11 +83,11 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention break; } - assembler->EmitSlowPaths(); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } @@ -95,7 +96,7 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention namespace mips { static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi, ThreadOffset<4> offset) { - std::unique_ptr<MipsAssembler> assembler(static_cast<MipsAssembler*>(Assembler::Create(kMips))); + MipsAssembler assembler; switch (abi) { case kInterpreterAbi: // Thread* is first argument (A0) in interpreter ABI. @@ -112,10 +113,11 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention __ Nop(); __ Break(); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } @@ -124,7 +126,7 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention namespace mips64 { static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi, ThreadOffset<8> offset) { - std::unique_ptr<Mips64Assembler> assembler(static_cast<Mips64Assembler*>(Assembler::Create(kMips64))); + Mips64Assembler assembler; switch (abi) { case kInterpreterAbi: // Thread* is first argument (A0) in interpreter ABI. @@ -141,10 +143,11 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention __ Nop(); __ Break(); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } @@ -152,16 +155,17 @@ static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention namespace x86 { static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<4> offset) { - std::unique_ptr<X86Assembler> assembler(static_cast<X86Assembler*>(Assembler::Create(kX86))); + X86Assembler assembler; // All x86 trampolines call via the Thread* held in fs. 
__ fs()->jmp(Address::Absolute(offset)); __ int3(); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } @@ -169,17 +173,17 @@ static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<4> offset) { namespace x86_64 { static const std::vector<uint8_t>* CreateTrampoline(ThreadOffset<8> offset) { - std::unique_ptr<x86_64::X86_64Assembler> - assembler(static_cast<x86_64::X86_64Assembler*>(Assembler::Create(kX86_64))); + x86_64::X86_64Assembler assembler; // All x86 trampolines call via the Thread* held in gs. __ gs()->jmp(x86_64::Address::Absolute(offset, true)); __ int3(); - size_t cs = assembler->CodeSize(); + __ FinalizeCode(); + size_t cs = __ CodeSize(); std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*entry_stub)[0], entry_stub->size()); - assembler->FinalizeInstructions(code); + __ FinalizeInstructions(code); return entry_stub.release(); } diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h index 350efca3e2..f8ca48ef57 100644 --- a/compiler/utils/arm/assembler_arm.h +++ b/compiler/utils/arm/assembler_arm.h @@ -17,6 +17,7 @@ #ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_H_ #define ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_H_ +#include <type_traits> #include <vector> #include "base/bit_utils.h" @@ -33,14 +34,47 @@ namespace arm { class Arm32Assembler; class Thumb2Assembler; -// This class indicates that the label and its uses -// will fall into a range that is encodable in 16bits on thumb2. -class NearLabel : public Label { +// Assembler literal is a value embedded in code, retrieved using a PC-relative load. +class Literal { public: - NearLabel() {} + static constexpr size_t kMaxSize = 8; + + Literal(uint32_t size, const uint8_t* data) + : label_(), size_(size) { + DCHECK_LE(size, Literal::kMaxSize); + memcpy(data_, data, size); + } + + template <typename T> + T GetValue() const { + DCHECK_EQ(size_, sizeof(T)); + T value; + memcpy(&value, data_, sizeof(T)); + return value; + } + + uint32_t GetSize() const { + return size_; + } + + const uint8_t* GetData() const { + return data_; + } + + Label* GetLabel() { + return &label_; + } + + const Label* GetLabel() const { + return &label_; + } private: - DISALLOW_COPY_AND_ASSIGN(NearLabel); + Label label_; + const uint32_t size_; + uint8_t data_[kMaxSize]; + + DISALLOW_COPY_AND_ASSIGN(Literal); }; class ShifterOperand { @@ -529,9 +563,6 @@ class ArmAssembler : public Assembler { // Branch instructions. virtual void b(Label* label, Condition cond = AL) = 0; - virtual void b(NearLabel* label, Condition cond = AL) { - b(static_cast<Label*>(label), cond); - } virtual void bl(Label* label, Condition cond = AL) = 0; virtual void blx(Register rm, Condition cond = AL) = 0; virtual void bx(Register rm, Condition cond = AL) = 0; @@ -541,9 +572,41 @@ class ArmAssembler : public Assembler { void Pad(uint32_t bytes); + // Adjust label position. 
+ void AdjustLabelPosition(Label* label) { + DCHECK(label->IsBound()); + uint32_t old_position = static_cast<uint32_t>(label->Position()); + uint32_t new_position = GetAdjustedPosition(old_position); + label->Reinitialize(); + DCHECK_GE(static_cast<int>(new_position), 0); + label->BindTo(static_cast<int>(new_position)); + } + + // Get the final position of a label after local fixup based on the old position + // recorded before FinalizeCode(). + virtual uint32_t GetAdjustedPosition(uint32_t old_position) = 0; + // Macros. // Most of these are pure virtual as they need to be implemented per instruction set. + // Create a new literal with a given value. + // NOTE: Force the template parameter to be explicitly specified. In the absence of + // std::omit_from_type_deduction<T> or std::identity<T>, use std::decay<T>. + template <typename T> + Literal* NewLiteral(typename std::decay<T>::type value) { + static_assert(std::is_integral<T>::value, "T must be an integral type."); + return NewLiteral(sizeof(value), reinterpret_cast<const uint8_t*>(&value)); + } + + // Create a new literal with the given data. + virtual Literal* NewLiteral(size_t size, const uint8_t* data) = 0; + + // Load literal. + virtual void LoadLiteral(Register rt, Literal* literal) = 0; + virtual void LoadLiteral(Register rt, Register rt2, Literal* literal) = 0; + virtual void LoadLiteral(SRegister sd, Literal* literal) = 0; + virtual void LoadLiteral(DRegister dd, Literal* literal) = 0; + // Add signed constant value to rd. May clobber IP. virtual void AddConstant(Register rd, int32_t value, Condition cond = AL) = 0; virtual void AddConstant(Register rd, Register rn, int32_t value, @@ -667,9 +730,6 @@ class ArmAssembler : public Assembler { virtual void Bind(Label* label) = 0; virtual void CompareAndBranchIfZero(Register r, Label* label) = 0; - virtual void CompareAndBranchIfZero(Register r, NearLabel* label) { - CompareAndBranchIfZero(r, static_cast<Label*>(label)); - } virtual void CompareAndBranchIfNonZero(Register r, Label* label) = 0; // diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc index cdf62bf885..6e60ddc260 100644 --- a/compiler/utils/arm/assembler_arm32.cc +++ b/compiler/utils/arm/assembler_arm32.cc @@ -1354,6 +1354,41 @@ int Arm32Assembler::DecodeBranchOffset(int32_t inst) { } +uint32_t Arm32Assembler::GetAdjustedPosition(uint32_t old_position ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + +Literal* Arm32Assembler::NewLiteral(size_t size ATTRIBUTE_UNUSED, + const uint8_t* data ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + +void Arm32Assembler::LoadLiteral(Register rt ATTRIBUTE_UNUSED, + Literal* literal ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + +void Arm32Assembler::LoadLiteral(Register rt ATTRIBUTE_UNUSED, Register rt2 ATTRIBUTE_UNUSED, + Literal* literal ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + +void Arm32Assembler::LoadLiteral(SRegister sd ATTRIBUTE_UNUSED, + Literal* literal ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + +void Arm32Assembler::LoadLiteral(DRegister dd ATTRIBUTE_UNUSED, + Literal* literal ATTRIBUTE_UNUSED) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + void Arm32Assembler::AddConstant(Register rd, int32_t value, Condition cond) { AddConstant(rd, rd, value, cond); } diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h index 3164623fd9..1c38eec12c 100644 
--- a/compiler/utils/arm/assembler_arm32.h +++ b/compiler/utils/arm/assembler_arm32.h @@ -238,7 +238,16 @@ class Arm32Assembler FINAL : public ArmAssembler { // Memory barriers. void dmb(DmbOptions flavor) OVERRIDE; - // Macros. + // Get the final position of a label after local fixup based on the old position + // recorded before FinalizeCode(). + uint32_t GetAdjustedPosition(uint32_t old_position) OVERRIDE; + + Literal* NewLiteral(size_t size, const uint8_t* data) OVERRIDE; + void LoadLiteral(Register rt, Literal* literal) OVERRIDE; + void LoadLiteral(Register rt, Register rt2, Literal* literal) OVERRIDE; + void LoadLiteral(SRegister sd, Literal* literal) OVERRIDE; + void LoadLiteral(DRegister dd, Literal* literal) OVERRIDE; + // Add signed constant value to rd. May clobber IP. void AddConstant(Register rd, int32_t value, Condition cond = AL) OVERRIDE; void AddConstant(Register rd, Register rn, int32_t value, diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index 26cb6c3739..f9e1ac672e 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -25,6 +25,309 @@ namespace art { namespace arm { +void Thumb2Assembler::BindLabel(Label* label, uint32_t bound_pc) { + CHECK(!label->IsBound()); + + while (label->IsLinked()) { + FixupId fixup_id = label->Position(); // The id for linked Fixup. + Fixup* fixup = GetFixup(fixup_id); // Get the Fixup at this id. + fixup->Resolve(bound_pc); // Fixup can be resolved now. + // Add this fixup as a dependency of all later fixups. + for (FixupId id = fixup_id + 1u, end = fixups_.size(); id != end; ++id) { + GetFixup(id)->AddDependent(fixup_id); + } + uint32_t fixup_location = fixup->GetLocation(); + uint16_t next = buffer_.Load<uint16_t>(fixup_location); // Get next in chain. + buffer_.Store<int16_t>(fixup_location, 0); + label->position_ = next; // Move to next. + } + label->BindTo(bound_pc); +} + +void Thumb2Assembler::BindLiterals() { + // We don't add the padding here, that's done only after adjusting the Fixup sizes. + uint32_t code_size = buffer_.Size(); + for (Literal& lit : literals_) { + Label* label = lit.GetLabel(); + BindLabel(label, code_size); + code_size += lit.GetSize(); + } +} + +void Thumb2Assembler::AdjustFixupIfNeeded(Fixup* fixup, uint32_t* current_code_size, + std::deque<FixupId>* fixups_to_recalculate) { + uint32_t adjustment = fixup->AdjustSizeIfNeeded(*current_code_size); + if (adjustment != 0u) { + *current_code_size += adjustment; + for (FixupId dependent_id : fixup->Dependents()) { + Fixup* dependent = GetFixup(dependent_id); + dependent->IncreaseAdjustment(adjustment); + if (buffer_.Load<int16_t>(dependent->GetLocation()) == 0) { + buffer_.Store<int16_t>(dependent->GetLocation(), 1); + fixups_to_recalculate->push_back(dependent_id); + } + } + } +} + +uint32_t Thumb2Assembler::AdjustFixups() { + uint32_t current_code_size = buffer_.Size(); + std::deque<FixupId> fixups_to_recalculate; + if (kIsDebugBuild) { + // We will use the placeholders in the buffer_ to mark whether the fixup has + // been added to the fixups_to_recalculate. Make sure we start with zeros. + for (Fixup& fixup : fixups_) { + CHECK_EQ(buffer_.Load<int16_t>(fixup.GetLocation()), 0); + } + } + for (Fixup& fixup : fixups_) { + AdjustFixupIfNeeded(&fixup, ¤t_code_size, &fixups_to_recalculate); + } + while (!fixups_to_recalculate.empty()) { + // Pop the fixup. 
+ FixupId fixup_id = fixups_to_recalculate.front(); + fixups_to_recalculate.pop_front(); + Fixup* fixup = GetFixup(fixup_id); + DCHECK_NE(buffer_.Load<int16_t>(fixup->GetLocation()), 0); + buffer_.Store<int16_t>(fixup->GetLocation(), 0); + // See if it needs adjustment. + AdjustFixupIfNeeded(fixup, ¤t_code_size, &fixups_to_recalculate); + } + if (kIsDebugBuild) { + // Check that no fixup is marked as being in fixups_to_recalculate anymore. + for (Fixup& fixup : fixups_) { + CHECK_EQ(buffer_.Load<int16_t>(fixup.GetLocation()), 0); + } + } + + // Adjust literal pool labels for padding. + DCHECK_EQ(current_code_size & 1u, 0u); + uint32_t literals_adjustment = current_code_size + (current_code_size & 2) - buffer_.Size(); + if (literals_adjustment != 0u) { + for (Literal& literal : literals_) { + Label* label = literal.GetLabel(); + DCHECK(label->IsBound()); + int old_position = label->Position(); + label->Reinitialize(); + label->BindTo(old_position + literals_adjustment); + } + } + + return current_code_size; +} + +void Thumb2Assembler::EmitFixups(uint32_t adjusted_code_size) { + // Move non-fixup code to its final place and emit fixups. + // Process fixups in reverse order so that we don't repeatedly move the same data. + size_t src_end = buffer_.Size(); + size_t dest_end = adjusted_code_size; + buffer_.Resize(dest_end); + DCHECK_GE(dest_end, src_end); + for (auto i = fixups_.rbegin(), end = fixups_.rend(); i != end; ++i) { + Fixup* fixup = &*i; + if (fixup->GetOriginalSize() == fixup->GetSize()) { + // The size of this Fixup didn't change. To avoid moving the data + // in small chunks, emit the code to its original position. + fixup->Emit(&buffer_, adjusted_code_size); + fixup->Finalize(dest_end - src_end); + } else { + // Move the data between the end of the fixup and src_end to its final location. + size_t old_fixup_location = fixup->GetLocation(); + size_t src_begin = old_fixup_location + fixup->GetOriginalSizeInBytes(); + size_t data_size = src_end - src_begin; + size_t dest_begin = dest_end - data_size; + buffer_.Move(dest_begin, src_begin, data_size); + src_end = old_fixup_location; + dest_end = dest_begin - fixup->GetSizeInBytes(); + // Finalize the Fixup and emit the data to the new location. + fixup->Finalize(dest_end - src_end); + fixup->Emit(&buffer_, adjusted_code_size); + } + } + CHECK_EQ(src_end, dest_end); +} + +void Thumb2Assembler::EmitLiterals() { + if (!literals_.empty()) { + // Load literal instructions (LDR, LDRD, VLDR) require 4-byte alignment. + // We don't support byte and half-word literals. 
+ uint32_t code_size = buffer_.Size(); + DCHECK_EQ(code_size & 1u, 0u); + if ((code_size & 2u) != 0u) { + Emit16(0); + } + for (Literal& literal : literals_) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + DCHECK_EQ(static_cast<size_t>(literal.GetLabel()->Position()), buffer_.Size()); + DCHECK(literal.GetSize() == 4u || literal.GetSize() == 8u); + for (size_t i = 0, size = literal.GetSize(); i != size; ++i) { + buffer_.Emit<uint8_t>(literal.GetData()[i]); + } + } + } +} + +inline int16_t Thumb2Assembler::BEncoding16(int32_t offset, Condition cond) { + DCHECK_EQ(offset & 1, 0); + int16_t encoding = B15 | B14; + if (cond != AL) { + DCHECK(IsInt<9>(offset)); + encoding |= B12 | (static_cast<int32_t>(cond) << 8) | ((offset >> 1) & 0xff); + } else { + DCHECK(IsInt<12>(offset)); + encoding |= B13 | ((offset >> 1) & 0x7ff); + } + return encoding; +} + +inline int32_t Thumb2Assembler::BEncoding32(int32_t offset, Condition cond) { + DCHECK_EQ(offset & 1, 0); + int32_t s = (offset >> 31) & 1; // Sign bit. + int32_t encoding = B31 | B30 | B29 | B28 | B15 | + (s << 26) | // Sign bit goes to bit 26. + ((offset >> 1) & 0x7ff); // imm11 goes to bits 0-10. + if (cond != AL) { + DCHECK(IsInt<21>(offset)); + // Encode cond, move imm6 from bits 12-17 to bits 16-21 and move J1 and J2. + encoding |= (static_cast<int32_t>(cond) << 22) | ((offset & 0x3f000) << (16 - 12)) | + ((offset & (1 << 19)) >> (19 - 13)) | // Extract J1 from bit 19 to bit 13. + ((offset & (1 << 18)) >> (18 - 11)); // Extract J2 from bit 18 to bit 11. + } else { + DCHECK(IsInt<25>(offset)); + int32_t j1 = ((offset >> 23) ^ s ^ 1) & 1; // Calculate J1 from I1 extracted from bit 23. + int32_t j2 = ((offset >> 22)^ s ^ 1) & 1; // Calculate J2 from I2 extracted from bit 22. + // Move imm10 from bits 12-21 to bits 16-25 and add J1 and J2. + encoding |= B12 | ((offset & 0x3ff000) << (16 - 12)) | + (j1 << 13) | (j2 << 11); + } + return encoding; +} + +inline int16_t Thumb2Assembler::CbxzEncoding16(Register rn, int32_t offset, Condition cond) { + DCHECK(!IsHighRegister(rn)); + DCHECK_EQ(offset & 1, 0); + DCHECK(IsUint<7>(offset)); + DCHECK(cond == EQ || cond == NE); + return B15 | B13 | B12 | B8 | (cond == NE ? B11 : 0) | static_cast<int32_t>(rn) | + ((offset & 0x3e) << (3 - 1)) | // Move imm5 from bits 1-5 to bits 3-7. + ((offset & 0x40) << (9 - 6)); // Move i from bit 6 to bit 11 +} + +inline int16_t Thumb2Assembler::CmpRnImm8Encoding16(Register rn, int32_t value) { + DCHECK(!IsHighRegister(rn)); + DCHECK(IsUint<8>(value)); + return B13 | B11 | (rn << 8) | value; +} + +inline int16_t Thumb2Assembler::AddRdnRmEncoding16(Register rdn, Register rm) { + // The high bit of rn is moved across 4-bit rm. + return B14 | B10 | (static_cast<int32_t>(rm) << 3) | + (static_cast<int32_t>(rdn) & 7) | ((static_cast<int32_t>(rdn) & 8) << 4); +} + +inline int32_t Thumb2Assembler::MovwEncoding32(Register rd, int32_t value) { + DCHECK(IsUint<16>(value)); + return B31 | B30 | B29 | B28 | B25 | B22 | + (static_cast<int32_t>(rd) << 8) | + ((value & 0xf000) << (16 - 12)) | // Move imm4 from bits 12-15 to bits 16-19. + ((value & 0x0800) << (26 - 11)) | // Move i from bit 11 to bit 26. + ((value & 0x0700) << (12 - 8)) | // Move imm3 from bits 8-10 to bits 12-14. + (value & 0xff); // Keep imm8 in bits 0-7. 
+} + +inline int32_t Thumb2Assembler::MovtEncoding32(Register rd, int32_t value) { + DCHECK_EQ(value & 0xffff, 0); + int32_t movw_encoding = MovwEncoding32(rd, (value >> 16) & 0xffff); + return movw_encoding | B25 | B23; +} + +inline int32_t Thumb2Assembler::MovModImmEncoding32(Register rd, int32_t value) { + uint32_t mod_imm = ModifiedImmediate(value); + DCHECK_NE(mod_imm, kInvalidModifiedImmediate); + return B31 | B30 | B29 | B28 | B22 | B19 | B18 | B17 | B16 | + (static_cast<int32_t>(rd) << 8) | static_cast<int32_t>(mod_imm); +} + +inline int16_t Thumb2Assembler::LdrLitEncoding16(Register rt, int32_t offset) { + DCHECK(!IsHighRegister(rt)); + DCHECK_EQ(offset & 3, 0); + DCHECK(IsUint<10>(offset)); + return B14 | B11 | (static_cast<int32_t>(rt) << 8) | (offset >> 2); +} + +inline int32_t Thumb2Assembler::LdrLitEncoding32(Register rt, int32_t offset) { + // NOTE: We don't support negative offset, i.e. U=0 (B23). + return LdrRtRnImm12Encoding(rt, PC, offset); +} + +inline int32_t Thumb2Assembler::LdrdEncoding32(Register rt, Register rt2, Register rn, int32_t offset) { + DCHECK_EQ(offset & 3, 0); + CHECK(IsUint<10>(offset)); + return B31 | B30 | B29 | B27 | + B24 /* P = 1 */ | B23 /* U = 1 */ | B22 | 0 /* W = 0 */ | B20 | + (static_cast<int32_t>(rn) << 16) | (static_cast<int32_t>(rt) << 12) | + (static_cast<int32_t>(rt2) << 8) | (offset >> 2); +} + +inline int32_t Thumb2Assembler::VldrsEncoding32(SRegister sd, Register rn, int32_t offset) { + DCHECK_EQ(offset & 3, 0); + CHECK(IsUint<10>(offset)); + return B31 | B30 | B29 | B27 | B26 | B24 | + B23 /* U = 1 */ | B20 | B11 | B9 | + (static_cast<int32_t>(rn) << 16) | + ((static_cast<int32_t>(sd) & 0x01) << (22 - 0)) | // Move D from bit 0 to bit 22. + ((static_cast<int32_t>(sd) & 0x1e) << (12 - 1)) | // Move Vd from bits 1-4 to bits 12-15. + (offset >> 2); +} + +inline int32_t Thumb2Assembler::VldrdEncoding32(DRegister dd, Register rn, int32_t offset) { + DCHECK_EQ(offset & 3, 0); + CHECK(IsUint<10>(offset)); + return B31 | B30 | B29 | B27 | B26 | B24 | + B23 /* U = 1 */ | B20 | B11 | B9 | B8 | + (rn << 16) | + ((static_cast<int32_t>(dd) & 0x10) << (22 - 4)) | // Move D from bit 4 to bit 22. + ((static_cast<int32_t>(dd) & 0x0f) << (12 - 0)) | // Move Vd from bits 0-3 to bits 12-15. + (offset >> 2); +} + +inline int16_t Thumb2Assembler::LdrRtRnImm5Encoding16(Register rt, Register rn, int32_t offset) { + DCHECK(!IsHighRegister(rt)); + DCHECK(!IsHighRegister(rn)); + DCHECK_EQ(offset & 3, 0); + DCHECK(IsUint<7>(offset)); + return B14 | B13 | B11 | + (static_cast<int32_t>(rn) << 3) | static_cast<int32_t>(rt) | + (offset << (6 - 2)); // Move imm5 from bits 2-6 to bits 6-10. 
+} + +int32_t Thumb2Assembler::Fixup::LoadWideOrFpEncoding(Register rbase, int32_t offset) const { + switch (type_) { + case kLoadLiteralWide: + return LdrdEncoding32(rn_, rt2_, rbase, offset); + case kLoadFPLiteralSingle: + return VldrsEncoding32(sd_, rbase, offset); + case kLoadFPLiteralDouble: + return VldrdEncoding32(dd_, rbase, offset); + default: + LOG(FATAL) << "Unexpected type: " << static_cast<int>(type_); + UNREACHABLE(); + } +} + +inline int32_t Thumb2Assembler::LdrRtRnImm12Encoding(Register rt, Register rn, int32_t offset) { + DCHECK(IsUint<12>(offset)); + return B31 | B30 | B29 | B28 | B27 | B23 | B22 | B20 | (rn << 16) | (rt << 12) | offset; +} + +void Thumb2Assembler::FinalizeCode() { + ArmAssembler::FinalizeCode(); + BindLiterals(); + uint32_t adjusted_code_size = AdjustFixups(); + EmitFixups(adjusted_code_size); + EmitLiterals(); +} + bool Thumb2Assembler::ShifterOperandCanHold(Register rd ATTRIBUTE_UNUSED, Register rn ATTRIBUTE_UNUSED, Opcode opcode, @@ -671,17 +974,11 @@ void Thumb2Assembler::vcmpdz(DRegister dd, Condition cond) { EmitVFPddd(cond, B23 | B21 | B20 | B18 | B16 | B6, dd, D0, D0); } - void Thumb2Assembler::b(Label* label, Condition cond) { EmitBranch(cond, label, false, false); } -void Thumb2Assembler::b(NearLabel* label, Condition cond) { - EmitBranch(cond, label, false, false, /* is_near */ true); -} - - void Thumb2Assembler::bl(Label* label, Condition cond) { CheckCondition(cond); EmitBranch(cond, label, true, false); @@ -1308,80 +1605,359 @@ void Thumb2Assembler::EmitShift(Register rd, Register rn, Shift shift, Register } } +inline size_t Thumb2Assembler::Fixup::SizeInBytes(Size size) { + switch (size) { + case kBranch16Bit: + return 2u; + case kBranch32Bit: + return 4u; + + case kCbxz16Bit: + return 2u; + case kCbxz32Bit: + return 4u; + case kCbxz48Bit: + return 6u; + + case kLiteral1KiB: + return 2u; + case kLiteral4KiB: + return 4u; + case kLiteral64KiB: + return 8u; + case kLiteral1MiB: + return 10u; + case kLiteralFar: + return 14u; + + case kLongOrFPLiteral1KiB: + return 4u; + case kLongOrFPLiteral256KiB: + return 10u; + case kLongOrFPLiteralFar: + return 14u; + } + LOG(FATAL) << "Unexpected size: " << static_cast<int>(size); + UNREACHABLE(); +} + +inline uint32_t Thumb2Assembler::Fixup::GetOriginalSizeInBytes() const { + return SizeInBytes(original_size_); +} + +inline uint32_t Thumb2Assembler::Fixup::GetSizeInBytes() const { + return SizeInBytes(size_); +} + +inline size_t Thumb2Assembler::Fixup::LiteralPoolPaddingSize(uint32_t current_code_size) { + // The code size must be a multiple of 2. + DCHECK_EQ(current_code_size & 1u, 0u); + // If it isn't a multiple of 4, we need to add a 2-byte padding before the literal pool. 
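+  // A quick illustration with hypothetical sizes: a code size of 0x100 is already
+  // 4-byte aligned, so LiteralPoolPaddingSize(0x100) == 0u, while a code size of
+  // 0x102 needs 2 bytes of padding before the pool, so LiteralPoolPaddingSize(0x102) == 2u.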
+ return current_code_size & 2; +} + +inline int32_t Thumb2Assembler::Fixup::GetOffset(uint32_t current_code_size) const { + static constexpr int32_t int32_min = std::numeric_limits<int32_t>::min(); + static constexpr int32_t int32_max = std::numeric_limits<int32_t>::max(); + DCHECK_LE(target_, static_cast<uint32_t>(int32_max)); + DCHECK_LE(location_, static_cast<uint32_t>(int32_max)); + DCHECK_LE(adjustment_, static_cast<uint32_t>(int32_max)); + int32_t diff = static_cast<int32_t>(target_) - static_cast<int32_t>(location_); + if (target_ > location_) { + DCHECK_LE(adjustment_, static_cast<uint32_t>(int32_max - diff)); + diff += static_cast<int32_t>(adjustment_); + } else { + DCHECK_LE(int32_min + static_cast<int32_t>(adjustment_), diff); + diff -= static_cast<int32_t>(adjustment_); + } + // The default PC adjustment for Thumb2 is 4 bytes. + DCHECK_GE(diff, int32_min + 4); + diff -= 4; + // Add additional adjustment for instructions preceding the PC usage, padding + // before the literal pool and rounding down the PC for literal loads. + switch (GetSize()) { + case kBranch16Bit: + case kBranch32Bit: + break; + case kCbxz16Bit: + break; + case kCbxz32Bit: + case kCbxz48Bit: + DCHECK_GE(diff, int32_min + 2); + diff -= 2; // Extra CMP Rn, #0, 16-bit. + break; -void Thumb2Assembler::Branch::Emit(AssemblerBuffer* buffer) const { - bool link = type_ == kUnconditionalLinkX || type_ == kUnconditionalLink; - bool x = type_ == kUnconditionalX || type_ == kUnconditionalLinkX; - int32_t offset = target_ - location_; + case kLiteral1KiB: + case kLiteral4KiB: + case kLongOrFPLiteral1KiB: + DCHECK(diff >= 0 || (GetSize() == kLiteral1KiB && diff == -2)); + diff += LiteralPoolPaddingSize(current_code_size); + // Load literal instructions round down the PC+4 to a multiple of 4, so if the PC + // isn't a multiple of 2, we need to adjust. Since we already adjusted for the target + // being aligned, current PC alignment can be inferred from diff. + DCHECK_EQ(diff & 1, 0); + diff = diff + (diff & 2); + DCHECK_GE(diff, 0); + break; + case kLiteral1MiB: + case kLiteral64KiB: + case kLongOrFPLiteral256KiB: + DCHECK_GE(diff, 4); // The target must be at least 4 bytes after the ADD rX, PC. + diff -= 4; // One extra 32-bit MOV. + diff += LiteralPoolPaddingSize(current_code_size); + break; + case kLiteralFar: + case kLongOrFPLiteralFar: + DCHECK_GE(diff, 8); // The target must be at least 4 bytes after the ADD rX, PC. + diff -= 8; // Extra MOVW+MOVT; both 32-bit. + diff += LiteralPoolPaddingSize(current_code_size); + break; + } + return diff; +} - if (size_ == k32Bit) { - int32_t encoding = B31 | B30 | B29 | B28 | B15; - if (link) { - // BL or BLX immediate. - encoding |= B14; - if (!x) { - encoding |= B12; - } else { - // Bottom bit of offset must be 0. - CHECK_EQ((offset & 1), 0); +inline size_t Thumb2Assembler::Fixup::IncreaseSize(Size new_size) { + DCHECK_NE(target_, kUnresolved); + Size old_size = size_; + size_ = new_size; + DCHECK_GT(SizeInBytes(new_size), SizeInBytes(old_size)); + size_t adjustment = SizeInBytes(new_size) - SizeInBytes(old_size); + if (target_ > location_) { + adjustment_ += adjustment; + } + return adjustment; +} + +uint32_t Thumb2Assembler::Fixup::AdjustSizeIfNeeded(uint32_t current_code_size) { + uint32_t old_code_size = current_code_size; + switch (GetSize()) { + case kBranch16Bit: + if (IsInt(cond_ != AL ? 
9 : 12, GetOffset(current_code_size))) { + break; } - } else { - if (x) { - LOG(FATAL) << "Invalid use of BX"; - UNREACHABLE(); - } else { - if (cond_ == AL) { - // Can use the T4 encoding allowing a 24 bit offset. - if (!x) { - encoding |= B12; - } - } else { - // Must be T3 encoding with a 20 bit offset. - encoding |= cond_ << 22; - } + current_code_size += IncreaseSize(kBranch32Bit); + FALLTHROUGH_INTENDED; + case kBranch32Bit: + // We don't support conditional branches beyond +-1MiB + // or unconditional branches beyond +-16MiB. + break; + + case kCbxz16Bit: + if (IsUint<7>(GetOffset(current_code_size))) { + break; } - } - encoding = Thumb2Assembler::EncodeBranchOffset(offset, encoding); - buffer->Store<int16_t>(location_, static_cast<int16_t>(encoding >> 16)); - buffer->Store<int16_t>(location_+2, static_cast<int16_t>(encoding & 0xffff)); - } else { - if (IsCompareAndBranch()) { - offset -= 4; - uint16_t i = (offset >> 6) & 1; - uint16_t imm5 = (offset >> 1) & 31U /* 0b11111 */; - int16_t encoding = B15 | B13 | B12 | - (type_ == kCompareAndBranchNonZero ? B11 : 0) | - static_cast<uint32_t>(rn_) | - B8 | - i << 9 | - imm5 << 3; + current_code_size += IncreaseSize(kCbxz32Bit); + FALLTHROUGH_INTENDED; + case kCbxz32Bit: + if (IsInt<9>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kCbxz48Bit); + FALLTHROUGH_INTENDED; + case kCbxz48Bit: + // We don't support conditional branches beyond +-1MiB. + break; + + case kLiteral1KiB: + DCHECK(!IsHighRegister(rn_)); + if (IsUint<10>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLiteral4KiB); + FALLTHROUGH_INTENDED; + case kLiteral4KiB: + if (IsUint<12>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLiteral64KiB); + FALLTHROUGH_INTENDED; + case kLiteral64KiB: + // Can't handle high register which we can encounter by fall-through from kLiteral4KiB. + if (!IsHighRegister(rn_) && IsUint<16>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLiteral1MiB); + FALLTHROUGH_INTENDED; + case kLiteral1MiB: + if (IsUint<20>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLiteralFar); + FALLTHROUGH_INTENDED; + case kLiteralFar: + // This encoding can reach any target. + break; + + case kLongOrFPLiteral1KiB: + if (IsUint<10>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLongOrFPLiteral256KiB); + FALLTHROUGH_INTENDED; + case kLongOrFPLiteral256KiB: + if (IsUint<18>(GetOffset(current_code_size))) { + break; + } + current_code_size += IncreaseSize(kLongOrFPLiteralFar); + FALLTHROUGH_INTENDED; + case kLongOrFPLiteralFar: + // This encoding can reach any target. + break; + } + return current_code_size - old_code_size; +} + +void Thumb2Assembler::Fixup::Emit(AssemblerBuffer* buffer, uint32_t code_size) const { + switch (GetSize()) { + case kBranch16Bit: { + DCHECK(type_ == kUnconditional || type_ == kConditional); + DCHECK_EQ(type_ == kConditional, cond_ != AL); + int16_t encoding = BEncoding16(GetOffset(code_size), cond_); buffer->Store<int16_t>(location_, encoding); - } else { - offset -= 4; // Account for PC offset. - int16_t encoding; - // 16 bit. 
- if (cond_ == AL) { - encoding = B15 | B14 | B13 | - ((offset >> 1) & 0x7ff); - } else { - encoding = B15 | B14 | B12 | - cond_ << 8 | ((offset >> 1) & 0xff); + break; + } + case kBranch32Bit: { + DCHECK(type_ == kConditional || type_ == kUnconditional || + type_ == kUnconditionalLink || type_ == kUnconditionalLinkX); + DCHECK_EQ(type_ == kConditional, cond_ != AL); + int32_t encoding = BEncoding32(GetOffset(code_size), cond_); + if (type_ == kUnconditionalLink) { + DCHECK_NE(encoding & B12, 0); + encoding |= B14; + } else if (type_ == kUnconditionalLinkX) { + DCHECK_NE(encoding & B12, 0); + encoding ^= B14 | B12; } + buffer->Store<int16_t>(location_, encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff)); + break; + } + + case kCbxz16Bit: { + DCHECK(type_ == kCompareAndBranchXZero); + int16_t encoding = CbxzEncoding16(rn_, GetOffset(code_size), cond_); + buffer->Store<int16_t>(location_, encoding); + break; + } + case kCbxz32Bit: { + DCHECK(type_ == kCompareAndBranchXZero); + DCHECK(cond_ == EQ || cond_ == NE); + int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0); + int16_t b_encoding = BEncoding16(GetOffset(code_size), cond_); + buffer->Store<int16_t>(location_, cmp_encoding); + buffer->Store<int16_t>(location_ + 2, b_encoding); + break; + } + case kCbxz48Bit: { + DCHECK(type_ == kCompareAndBranchXZero); + DCHECK(cond_ == EQ || cond_ == NE); + int16_t cmp_encoding = CmpRnImm8Encoding16(rn_, 0); + int32_t b_encoding = BEncoding32(GetOffset(code_size), cond_); + buffer->Store<int16_t>(location_, cmp_encoding); + buffer->Store<int16_t>(location_ + 2u, b_encoding >> 16); + buffer->Store<int16_t>(location_ + 4u, static_cast<int16_t>(b_encoding & 0xffff)); + break; + } + + case kLiteral1KiB: { + DCHECK(type_ == kLoadLiteralNarrow); + int16_t encoding = LdrLitEncoding16(rn_, GetOffset(code_size)); buffer->Store<int16_t>(location_, encoding); + break; + } + case kLiteral4KiB: { + DCHECK(type_ == kLoadLiteralNarrow); + // GetOffset() uses PC+4 but load literal uses AlignDown(PC+4, 4). Adjust offset accordingly. 
+ int32_t encoding = LdrLitEncoding32(rn_, GetOffset(code_size)); + buffer->Store<int16_t>(location_, encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff)); + break; + } + case kLiteral64KiB: { + DCHECK(type_ == kLoadLiteralNarrow); + int32_t mov_encoding = MovwEncoding32(rn_, GetOffset(code_size)); + int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC); + int16_t ldr_encoding = LdrRtRnImm5Encoding16(rn_, rn_, 0); + buffer->Store<int16_t>(location_, mov_encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 4u, add_pc_encoding); + buffer->Store<int16_t>(location_ + 6u, ldr_encoding); + break; + } + case kLiteral1MiB: { + DCHECK(type_ == kLoadLiteralNarrow); + int32_t offset = GetOffset(code_size); + int32_t mov_encoding = MovModImmEncoding32(rn_, offset & ~0xfff); + int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC); + int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, offset & 0xfff); + buffer->Store<int16_t>(location_, mov_encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 4u, add_pc_encoding); + buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16); + buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff)); + break; + } + case kLiteralFar: { + DCHECK(type_ == kLoadLiteralNarrow); + int32_t offset = GetOffset(code_size); + int32_t movw_encoding = MovwEncoding32(rn_, offset & 0xffff); + int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff); + int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC); + int32_t ldr_encoding = LdrRtRnImm12Encoding(rn_, rn_, 0); + buffer->Store<int16_t>(location_, movw_encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16); + buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 8u, add_pc_encoding); + buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16); + buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff)); + break; + } + + case kLongOrFPLiteral1KiB: { + int32_t encoding = LoadWideOrFpEncoding(PC, GetOffset(code_size)); // DCHECKs type_. + buffer->Store<int16_t>(location_, encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff)); + break; + } + case kLongOrFPLiteral256KiB: { + int32_t offset = GetOffset(code_size); + int32_t mov_encoding = MovModImmEncoding32(IP, offset & ~0x3ff); + int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC); + int32_t ldr_encoding = LoadWideOrFpEncoding(IP, offset & 0x3ff); // DCHECKs type_. + buffer->Store<int16_t>(location_, mov_encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 4u, add_pc_encoding); + buffer->Store<int16_t>(location_ + 6u, ldr_encoding >> 16); + buffer->Store<int16_t>(location_ + 8u, static_cast<int16_t>(ldr_encoding & 0xffff)); + break; + } + case kLongOrFPLiteralFar: { + int32_t offset = GetOffset(code_size); + int32_t movw_encoding = MovwEncoding32(IP, offset & 0xffff); + int32_t movt_encoding = MovtEncoding32(IP, offset & ~0xffff); + int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC); + int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0); // DCHECKs type_. 
+ buffer->Store<int16_t>(location_, movw_encoding >> 16); + buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16); + buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff)); + buffer->Store<int16_t>(location_ + 8u, add_pc_encoding); + buffer->Store<int16_t>(location_ + 10u, ldr_encoding >> 16); + buffer->Store<int16_t>(location_ + 12u, static_cast<int16_t>(ldr_encoding & 0xffff)); + break; } } } - uint16_t Thumb2Assembler::EmitCompareAndBranch(Register rn, uint16_t prev, bool n) { CHECK(IsLowRegister(rn)); uint32_t location = buffer_.Size(); // This is always unresolved as it must be a forward branch. Emit16(prev); // Previous link. - return AddBranch(n ? Branch::kCompareAndBranchNonZero : Branch::kCompareAndBranchZero, - location, rn); + return AddFixup(Fixup::CompareAndBranch(location, rn, n ? NE : EQ)); } @@ -1619,47 +2195,53 @@ void Thumb2Assembler::EmitMultiMemOp(Condition cond, } } - -void Thumb2Assembler::EmitBranch(Condition cond, Label* label, bool link, bool x, bool is_near) { +void Thumb2Assembler::EmitBranch(Condition cond, Label* label, bool link, bool x) { + bool use32bit = IsForced32Bit() || !CanRelocateBranches(); uint32_t pc = buffer_.Size(); - Branch::Type branch_type; + Fixup::Type branch_type; if (cond == AL) { if (link) { + use32bit = true; if (x) { - branch_type = Branch::kUnconditionalLinkX; // BLX. + branch_type = Fixup::kUnconditionalLinkX; // BLX. } else { - branch_type = Branch::kUnconditionalLink; // BX. + branch_type = Fixup::kUnconditionalLink; // BX. } } else { - branch_type = Branch::kUnconditional; // B. + branch_type = Fixup::kUnconditional; // B. } } else { - branch_type = Branch::kConditional; // B<cond>. + branch_type = Fixup::kConditional; // B<cond>. } + Fixup::Size size = use32bit ? Fixup::kBranch32Bit : Fixup::kBranch16Bit; + FixupId branch_id = AddFixup(Fixup::Branch(pc, branch_type, size, cond)); + if (label->IsBound()) { - Branch::Size size = AddBranch(branch_type, pc, label->Position(), cond); // Resolved branch. - - // The branch is to a bound label which means that it's a backwards branch. We know the - // current size of it so we can emit the appropriate space. Note that if it's a 16 bit - // branch the size may change if it so happens that other branches change size that change - // the distance to the target and that distance puts this branch over the limit for 16 bits. - if (size == Branch::k16Bit) { - Emit16(0); // Space for a 16 bit branch. - } else { - Emit32(0); // Space for a 32 bit branch. + // The branch is to a bound label which means that it's a backwards branch. + // Record this branch as a dependency of all Fixups between the label and the branch. + GetFixup(branch_id)->Resolve(label->Position()); + for (FixupId fixup_id = branch_id; fixup_id != 0u; ) { + --fixup_id; + Fixup* fixup = GetFixup(fixup_id); + DCHECK_GE(label->Position(), 0); + if (fixup->GetLocation() < static_cast<uint32_t>(label->Position())) { + break; + } + fixup->AddDependent(branch_id); } + Emit16(0); } else { - // Branch is to an unbound label. Emit space for it. - uint16_t branch_id = AddBranch(branch_type, pc, cond, is_near); // Unresolved branch. - if (force_32bit_ || (!CanRelocateBranches() && !is_near)) { - Emit16(static_cast<uint16_t>(label->position_)); // Emit current label link. - Emit16(0); // another 16 bits. - } else { - Emit16(static_cast<uint16_t>(label->position_)); // Emit current label link. 
- } - label->LinkTo(branch_id); // Link to the branch ID. + // Branch target is an unbound label. Add it to a singly-linked list maintained within + // the code with the label serving as the head. + Emit16(static_cast<uint16_t>(label->position_)); + label->LinkTo(branch_id); } + + if (use32bit) { + Emit16(0); + } + DCHECK_EQ(buffer_.Size() - pc, GetFixup(branch_id)->GetSizeInBytes()); } @@ -2274,82 +2856,8 @@ void Thumb2Assembler::Mov(Register rd, Register rm, Condition cond) { } -// A branch has changed size. Make a hole for it. -void Thumb2Assembler::MakeHoleForBranch(uint32_t location, uint32_t delta) { - // Move the contents of the buffer using: Move(newposition, oldposition) - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - buffer_.Move(location + delta, location); -} - - void Thumb2Assembler::Bind(Label* label) { - CHECK(!label->IsBound()); - uint32_t bound_pc = buffer_.Size(); - std::vector<Branch*> changed_branches; - - while (label->IsLinked()) { - uint16_t position = label->Position(); // Branch id for linked branch. - Branch* branch = GetBranch(position); // Get the branch at this id. - bool changed = branch->Resolve(bound_pc); // Branch can be resolved now. - uint32_t branch_location = branch->GetLocation(); - uint16_t next = buffer_.Load<uint16_t>(branch_location); // Get next in chain. - if (changed) { - DCHECK(CanRelocateBranches()); - MakeHoleForBranch(branch->GetLocation(), 2); - if (branch->IsCompareAndBranch()) { - // A cbz/cbnz instruction has changed size. There is no valid encoding for - // a 32 bit cbz/cbnz so we need to change this to an instruction pair: - // cmp rn, #0 - // b<eq|ne> target - bool n = branch->GetType() == Branch::kCompareAndBranchNonZero; - Condition cond = n ? NE : EQ; - branch->Move(2); // Move the branch forward by 2 bytes. - branch->ResetTypeAndCondition(Branch::kConditional, cond); - branch->ResetSize(Branch::k16Bit); - - // Now add a compare instruction in the place the branch was. - buffer_.Store<int16_t>(branch_location, - B13 | B11 | static_cast<int16_t>(branch->GetRegister()) << 8); - - // Since have moved made a hole in the code we need to reload the - // current pc. - bound_pc = buffer_.Size(); - - // Now resolve the newly added branch. - changed = branch->Resolve(bound_pc); - if (changed) { - MakeHoleForBranch(branch->GetLocation(), 2); - changed_branches.push_back(branch); - } - } else { - changed_branches.push_back(branch); - } - } - label->position_ = next; // Move to next. - } - label->BindTo(bound_pc); - - // Now relocate any changed branches. Do this until there are no more changes. - std::vector<Branch*> branches_to_process = changed_branches; - while (branches_to_process.size() != 0) { - changed_branches.clear(); - for (auto& changed_branch : branches_to_process) { - for (auto& branch : branches_) { - bool changed = branch->Relocate(changed_branch->GetLocation(), 2); - if (changed) { - changed_branches.push_back(branch); - } - } - branches_to_process = changed_branches; - } - } -} - - -void Thumb2Assembler::EmitBranches() { - for (auto& branch : branches_) { - branch->Emit(&buffer_); - } + BindLabel(label, buffer_.Size()); } @@ -2487,6 +2995,85 @@ int Thumb2Assembler::DecodeBranchOffset(int32_t instr) { return imm32; } +uint32_t Thumb2Assembler::GetAdjustedPosition(uint32_t old_position) { + // We can reconstruct the adjustment by going through all the fixups from the beginning + // up to the old_position. 
Since we expect AdjustedPosition() to be called in a loop + // with increasing old_position, we can use the data from last AdjustedPosition() to + // continue where we left off and the whole loop should be O(m+n) where m is the number + // of positions to adjust and n is the number of fixups. + if (old_position < last_old_position_) { + last_position_adjustment_ = 0u; + last_old_position_ = 0u; + last_fixup_id_ = 0u; + } + while (last_fixup_id_ != fixups_.size()) { + Fixup* fixup = GetFixup(last_fixup_id_); + if (fixup->GetLocation() >= old_position + last_position_adjustment_) { + break; + } + if (fixup->GetSize() != fixup->GetOriginalSize()) { + last_position_adjustment_ += fixup->GetSizeInBytes() - fixup->GetOriginalSizeInBytes(); + } + ++last_fixup_id_; + } + last_old_position_ = old_position; + return old_position + last_position_adjustment_; +} + +Literal* Thumb2Assembler::NewLiteral(size_t size, const uint8_t* data) { + DCHECK(size == 4u || size == 8u) << size; + literals_.emplace_back(size, data); + return &literals_.back(); +} + +void Thumb2Assembler::LoadLiteral(Register rt, Literal* literal) { + DCHECK_EQ(literal->GetSize(), 4u); + DCHECK(!literal->GetLabel()->IsBound()); + bool use32bit = IsForced32Bit() || IsHighRegister(rt); + uint32_t location = buffer_.Size(); + Fixup::Size size = use32bit ? Fixup::kLiteral4KiB : Fixup::kLiteral1KiB; + FixupId fixup_id = AddFixup(Fixup::LoadNarrowLiteral(location, rt, size)); + Emit16(static_cast<uint16_t>(literal->GetLabel()->position_)); + literal->GetLabel()->LinkTo(fixup_id); + if (use32bit) { + Emit16(0); + } + DCHECK_EQ(location + GetFixup(fixup_id)->GetSizeInBytes(), buffer_.Size()); +} + +void Thumb2Assembler::LoadLiteral(Register rt, Register rt2, Literal* literal) { + DCHECK_EQ(literal->GetSize(), 8u); + DCHECK(!literal->GetLabel()->IsBound()); + uint32_t location = buffer_.Size(); + FixupId fixup_id = + AddFixup(Fixup::LoadWideLiteral(location, rt, rt2, Fixup::kLongOrFPLiteral1KiB)); + Emit16(static_cast<uint16_t>(literal->GetLabel()->position_)); + literal->GetLabel()->LinkTo(fixup_id); + Emit16(0); + DCHECK_EQ(location + GetFixup(fixup_id)->GetSizeInBytes(), buffer_.Size()); +} + +void Thumb2Assembler::LoadLiteral(SRegister sd, Literal* literal) { + DCHECK_EQ(literal->GetSize(), 4u); + DCHECK(!literal->GetLabel()->IsBound()); + uint32_t location = buffer_.Size(); + FixupId fixup_id = AddFixup(Fixup::LoadSingleLiteral(location, sd, Fixup::kLongOrFPLiteral1KiB)); + Emit16(static_cast<uint16_t>(literal->GetLabel()->position_)); + literal->GetLabel()->LinkTo(fixup_id); + Emit16(0); + DCHECK_EQ(location + GetFixup(fixup_id)->GetSizeInBytes(), buffer_.Size()); +} + +void Thumb2Assembler::LoadLiteral(DRegister dd, Literal* literal) { + DCHECK_EQ(literal->GetSize(), 8u); + DCHECK(!literal->GetLabel()->IsBound()); + uint32_t location = buffer_.Size(); + FixupId fixup_id = AddFixup(Fixup::LoadDoubleLiteral(location, dd, Fixup::kLongOrFPLiteral1KiB)); + Emit16(static_cast<uint16_t>(literal->GetLabel()->position_)); + literal->GetLabel()->LinkTo(fixup_id); + Emit16(0); + DCHECK_EQ(location + GetFixup(fixup_id)->GetSizeInBytes(), buffer_.Size()); +} void Thumb2Assembler::AddConstant(Register rd, int32_t value, Condition cond) { AddConstant(rd, rd, value, cond); @@ -2763,16 +3350,6 @@ void Thumb2Assembler::CompareAndBranchIfZero(Register r, Label* label) { } -void Thumb2Assembler::CompareAndBranchIfZero(Register r, NearLabel* label) { - if (IsLowRegister(r)) { - cbz(r, label); - } else { - cmp(r, ShifterOperand(0)); - b(label, EQ); - } -} - 
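The literal and fixup machinery above is driven entirely from FinalizeCode(): code is first emitted with placeholder fixups, then BindLiterals(), AdjustFixups(), EmitFixups() and EmitLiterals() produce the final layout, and any position recorded before finalization can be translated with GetAdjustedPosition(). A rough usage sketch (hypothetical driver code, following the conventions of the assembler tests added later in this change, so namespace qualifiers are abbreviated):

    arm::Thumb2Assembler assembler(/* can_relocate_branches */ true);
    arm::Literal* literal = assembler.NewLiteral<int32_t>(0x12345678);  // 4-byte literal.
    assembler.LoadLiteral(arm::R0, literal);  // Records a kLoadLiteralNarrow fixup, kLiteral1KiB at first.
    Label label;
    assembler.Bind(&label);                   // Non-final position, recorded before finalization.
    // ... emit the body of the method ...
    assembler.FinalizeCode();                 // Expands out-of-range fixups and emits the literal pool.
    uint32_t final_position = assembler.GetAdjustedPosition(label.Position());
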
- void Thumb2Assembler::CompareAndBranchIfNonZero(Register r, Label* label) { if (CanRelocateBranches() && IsLowRegister(r)) { cbnz(r, label); diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h index 2382b74c30..5e6969b4c2 100644 --- a/compiler/utils/arm/assembler_thumb2.h +++ b/compiler/utils/arm/assembler_thumb2.h @@ -17,6 +17,7 @@ #ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_THUMB2_H_ #define ART_COMPILER_UTILS_ARM_ASSEMBLER_THUMB2_H_ +#include <deque> #include <vector> #include "base/logging.h" @@ -34,13 +35,15 @@ class Thumb2Assembler FINAL : public ArmAssembler { : can_relocate_branches_(can_relocate_branches), force_32bit_(false), it_cond_index_(kNoItCondition), - next_condition_(AL) { + next_condition_(AL), + fixups_(), + literals_(), + last_position_adjustment_(0u), + last_old_position_(0u), + last_fixup_id_(0u) { } virtual ~Thumb2Assembler() { - for (auto& branch : branches_) { - delete branch; - } } bool IsThumb() const OVERRIDE { @@ -55,10 +58,7 @@ class Thumb2Assembler FINAL : public ArmAssembler { return can_relocate_branches_; } - void FinalizeInstructions(const MemoryRegion& region) OVERRIDE { - EmitBranches(); - Assembler::FinalizeInstructions(region); - } + void FinalizeCode() OVERRIDE; // Data-processing instructions. void and_(Register rd, Register rn, const ShifterOperand& so, Condition cond = AL) OVERRIDE; @@ -238,7 +238,6 @@ class Thumb2Assembler FINAL : public ArmAssembler { // Branch instructions. void b(Label* label, Condition cond = AL); - void b(NearLabel* label, Condition cond = AL); void bl(Label* label, Condition cond = AL); void blx(Label* label); void blx(Register rm, Condition cond = AL) OVERRIDE; @@ -273,13 +272,23 @@ class Thumb2Assembler FINAL : public ArmAssembler { void Mov(Register rd, Register rm, Condition cond = AL) OVERRIDE; void CompareAndBranchIfZero(Register r, Label* label) OVERRIDE; - void CompareAndBranchIfZero(Register r, NearLabel* label) OVERRIDE; void CompareAndBranchIfNonZero(Register r, Label* label) OVERRIDE; // Memory barriers. void dmb(DmbOptions flavor) OVERRIDE; - // Macros. + // Get the final position of a label after local fixup based on the old position + // recorded before FinalizeCode(). + uint32_t GetAdjustedPosition(uint32_t old_position) OVERRIDE; + + using ArmAssembler::NewLiteral; // Make the helper template visible. + + Literal* NewLiteral(size_t size, const uint8_t* data) OVERRIDE; + void LoadLiteral(Register rt, Literal* literal) OVERRIDE; + void LoadLiteral(Register rt, Register rt2, Literal* literal) OVERRIDE; + void LoadLiteral(SRegister sd, Literal* literal) OVERRIDE; + void LoadLiteral(DRegister dd, Literal* literal) OVERRIDE; + // Add signed constant value to rd. May clobber IP. void AddConstant(Register rd, int32_t value, Condition cond = AL) OVERRIDE; void AddConstant(Register rd, Register rn, int32_t value, @@ -340,6 +349,244 @@ class Thumb2Assembler FINAL : public ArmAssembler { } private: + typedef uint16_t FixupId; + + // Fixup: branches and literal pool references. + // + // The thumb2 architecture allows branches to be either 16 or 32 bit instructions. This + // depends on both the type of branch and the offset to which it is branching. The 16-bit + // cbz and cbnz instructions may also need to be replaced with a separate 16-bit compare + // instruction and a 16- or 32-bit branch instruction. 
A load from the literal pool can also be
+  // a 16-bit or 32-bit instruction and, if the method is large, we may need to use a sequence
+  // of instructions to make up for the limited range of load literal instructions (up to
+  // 4KiB for the 32-bit variant). When generating code for these insns we don't know the
+  // size beforehand, so we assume it is the smallest available size and determine the final
+  // code offsets and sizes and emit code in FinalizeCode().
+  //
+  // To handle this, we keep a record of every branch and literal pool load in the program.
+  // The actual instruction encoding for these is delayed until we know the final size of
+  // every instruction. When we bind a label to a branch we don't know the final location yet
+  // as some preceding instructions may need to be expanded, so we record a non-final offset.
+  // In FinalizeCode(), we expand the sizes of branches and literal loads that are out of
+  // range. With each expansion, we need to update dependent Fixups, i.e. instructions with
+  // a target on the other side of the expanded insn, as their offsets change and this may
+  // trigger further expansion.
+  //
+  // All Fixups have a 'fixup id' which is a 16-bit unsigned number used to identify the
+  // Fixup. For each unresolved label we keep a singly-linked list of all Fixups pointing
+  // to it, using the fixup ids as links. The first link is stored in the label's position
+  // (the label is linked but not bound), the following links are stored in the code buffer,
+  // in the placeholder where we will eventually emit the actual code.
+
+  class Fixup {
+   public:
+    // Branch type.
+    enum Type : uint8_t {
+      kConditional,               // B<cond>.
+      kUnconditional,             // B.
+      kUnconditionalLink,         // BL.
+      kUnconditionalLinkX,        // BLX.
+      kCompareAndBranchXZero,     // cbz/cbnz.
+      kLoadLiteralNarrow,         // Load narrow integer literal.
+      kLoadLiteralWide,           // Load wide integer literal.
+      kLoadFPLiteralSingle,       // Load FP literal single.
+      kLoadFPLiteralDouble,       // Load FP literal double.
+    };
+
+    // Calculated size of branch instruction based on type and offset.
+    enum Size : uint8_t {
+      // Branch variants.
+      kBranch16Bit,
+      kBranch32Bit,
+      // NOTE: We don't support branches which would require multiple instructions, i.e.
+      // conditional branches beyond +-1MiB and unconditional branches beyond +-16MiB.
+
+      // CBZ/CBNZ variants.
+      kCbxz16Bit,   // CBZ/CBNZ rX, label; X < 8; 7-bit positive offset.
+      kCbxz32Bit,   // CMP rX, #0 + Bcc label; X < 8; 16-bit Bcc; +-8-bit offset.
+      kCbxz48Bit,   // CMP rX, #0 + Bcc label; X < 8; 32-bit Bcc; up to +-1MiB offset.
+
+      // Load integer literal variants.
+      // LDR rX, label; X < 8; 16-bit variant up to 1KiB offset; 2 bytes.
+      kLiteral1KiB,
+      // LDR rX, label; 32-bit variant up to 4KiB offset; 4 bytes.
+      kLiteral4KiB,
+      // MOV rX, imm16 + ADD rX, pc + LDR rX, [rX]; X < 8; up to 64KiB offset; 8 bytes.
+      kLiteral64KiB,
+      // MOV rX, modimm + ADD rX, pc + LDR rX, [rX, #imm12]; up to 1MiB offset; 10 bytes.
+      kLiteral1MiB,
+      // NOTE: We don't provide the 12-byte version of kLiteralFar below where the LDR is 16-bit.
+      // MOV rX, imm16 + MOVT rX, imm16 + ADD rX, pc + LDR rX, [rX]; any offset; 14 bytes.
+      kLiteralFar,
+
+      // Load long or FP literal variants.
+      // VLDR s/dX, label; 32-bit insn, up to 1KiB offset; 4 bytes.
+      kLongOrFPLiteral1KiB,
+      // MOV ip, modimm + ADD ip, pc + VLDR s/dX, [IP, #imm8*4]; up to 256KiB offset; 10 bytes.
+      kLongOrFPLiteral256KiB,
+      // MOV ip, imm16 + MOVT ip, imm16 + ADD ip, pc + VLDR s/dX, [IP]; any offset; 14 bytes.
+      kLongOrFPLiteralFar,
+    };
+
+    // Unresolved branch possibly with a condition.
+    static Fixup Branch(uint32_t location, Type type, Size size = kBranch16Bit,
+                        Condition cond = AL) {
+      DCHECK(type == kConditional || type == kUnconditional ||
+             type == kUnconditionalLink || type == kUnconditionalLinkX);
+      DCHECK(size == kBranch16Bit || size == kBranch32Bit);
+      DCHECK(size == kBranch32Bit || (type == kConditional || type == kUnconditional));
+      return Fixup(kNoRegister, kNoRegister, kNoSRegister, kNoDRegister,
+                   cond, type, size, location);
+    }
+
+    // Unresolved compare-and-branch instruction with a register and condition (EQ or NE).
+    static Fixup CompareAndBranch(uint32_t location, Register rn, Condition cond) {
+      DCHECK(cond == EQ || cond == NE);
+      return Fixup(rn, kNoRegister, kNoSRegister, kNoDRegister,
+                   cond, kCompareAndBranchXZero, kCbxz16Bit, location);
+    }
+
+    // Load narrow literal.
+    static Fixup LoadNarrowLiteral(uint32_t location, Register rt, Size size = kLiteral1KiB) {
+      DCHECK(size == kLiteral1KiB || size == kLiteral4KiB || size == kLiteral64KiB ||
+             size == kLiteral1MiB || size == kLiteralFar);
+      DCHECK(!IsHighRegister(rt) || (size != kLiteral1KiB && size != kLiteral64KiB));
+      return Fixup(rt, kNoRegister, kNoSRegister, kNoDRegister,
+                   AL, kLoadLiteralNarrow, size, location);
+    }
+
+    // Load wide literal.
+    static Fixup LoadWideLiteral(uint32_t location, Register rt, Register rt2,
+                                 Size size = kLongOrFPLiteral1KiB) {
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+             size == kLongOrFPLiteralFar);
+      DCHECK(!IsHighRegister(rt) || (size != kLiteral1KiB && size != kLiteral64KiB));
+      return Fixup(rt, rt2, kNoSRegister, kNoDRegister,
+                   AL, kLoadLiteralWide, size, location);
+    }
+
+    // Load FP single literal.
+    static Fixup LoadSingleLiteral(uint32_t location, SRegister sd,
+                                   Size size = kLongOrFPLiteral1KiB) {
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+             size == kLongOrFPLiteralFar);
+      return Fixup(kNoRegister, kNoRegister, sd, kNoDRegister,
+                   AL, kLoadFPLiteralSingle, size, location);
+    }
+
+    // Load FP double literal.
+    static Fixup LoadDoubleLiteral(uint32_t location, DRegister dd,
+                                   Size size = kLongOrFPLiteral1KiB) {
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+             size == kLongOrFPLiteralFar);
+      return Fixup(kNoRegister, kNoRegister, kNoSRegister, dd,
+                   AL, kLoadFPLiteralDouble, size, location);
+    }
+
+    Type GetType() const {
+      return type_;
+    }
+
+    Size GetOriginalSize() const {
+      return original_size_;
+    }
+
+    Size GetSize() const {
+      return size_;
+    }
+
+    uint32_t GetOriginalSizeInBytes() const;
+
+    uint32_t GetSizeInBytes() const;
+
+    uint32_t GetLocation() const {
+      return location_;
+    }
+
+    uint32_t GetAdjustment() const {
+      return adjustment_;
+    }
+
+    const std::vector<FixupId>& Dependents() const {
+      return dependents_;
+    }
+
+    void AddDependent(FixupId dependent_id) {
+      dependents_.push_back(dependent_id);
+    }
+
+    // Resolve a branch when the target is known.
+    void Resolve(uint32_t target) {
+      DCHECK_EQ(target_, kUnresolved);
+      DCHECK_NE(target, kUnresolved);
+      target_ = target;
+    }
+
+    // Check if the current size is OK for current location_, target_ and adjustment_.
+    // If not, increase the size. Return the size increase, 0 if unchanged.
+    // If the target is after this Fixup, also add the difference to adjustment_,
+    // so that we don't need to consider forward Fixups as their own dependencies.
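+    // For example (hypothetical numbers, for illustration only): a conditional 16-bit
+    // branch at location_ 0x10 targeting 0x200 has an offset of 0x1ec, which does not fit
+    // the signed 9-bit offset of a 16-bit B<cond>, so the Fixup grows to kBranch32Bit;
+    // AdjustSizeIfNeeded() then returns the 2 extra bytes and, because the target lies
+    // ahead of the branch, the same 2 bytes are also added to this Fixup's adjustment_.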
+ uint32_t AdjustSizeIfNeeded(uint32_t current_code_size); + + // Increase adjustments. This is called for dependents of a Fixup when its size changes. + void IncreaseAdjustment(uint32_t increase) { + adjustment_ += increase; + } + + // Finalize the branch with an adjustment to the location. Both location and target are updated. + void Finalize(uint32_t location_adjustment) { + DCHECK_NE(target_, kUnresolved); + location_ += location_adjustment; + target_ += location_adjustment; + } + + // Emit the branch instruction into the assembler buffer. This does the + // encoding into the thumb instruction. + void Emit(AssemblerBuffer* buffer, uint32_t code_size) const; + + private: + Fixup(Register rn, Register rt2, SRegister sd, DRegister dd, + Condition cond, Type type, Size size, uint32_t location) + : rn_(rn), + rt2_(rt2), + sd_(sd), + dd_(dd), + cond_(cond), + type_(type), + original_size_(size), size_(size), + location_(location), + target_(kUnresolved), + adjustment_(0u), + dependents_() { + } + static size_t SizeInBytes(Size size); + + // The size of padding added before the literal pool. + static size_t LiteralPoolPaddingSize(uint32_t current_code_size); + + // Returns the offset from the PC-using insn to the target. + int32_t GetOffset(uint32_t current_code_size) const; + + size_t IncreaseSize(Size new_size); + + int32_t LoadWideOrFpEncoding(Register rbase, int32_t offset) const; + + static constexpr uint32_t kUnresolved = 0xffffffff; // Value for target_ for unresolved. + + const Register rn_; // Rn for cbnz/cbz, Rt for literal loads. + Register rt2_; // For kLoadLiteralWide. + SRegister sd_; // For kLoadFPLiteralSingle. + DRegister dd_; // For kLoadFPLiteralDouble. + const Condition cond_; + const Type type_; + Size original_size_; + Size size_; + uint32_t location_; // Offset into assembler buffer in bytes. + uint32_t target_; // Offset into assembler buffer in bytes. + uint32_t adjustment_; // The number of extra bytes inserted between location_ and target_. + std::vector<FixupId> dependents_; // Fixups that require adjustment when current size changes. + }; + // Emit a single 32 or 16 bit data processing instruction. void EmitDataProcessing(Condition cond, Opcode opcode, @@ -432,7 +679,7 @@ class Thumb2Assembler FINAL : public ArmAssembler { void EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond); - void EmitBranch(Condition cond, Label* label, bool link, bool x, bool is_near = false); + void EmitBranch(Condition cond, Label* label, bool link, bool x); static int32_t EncodeBranchOffset(int32_t offset, int32_t inst); static int DecodeBranchOffset(int32_t inst); int32_t EncodeTstOffset(int offset, int32_t inst); @@ -475,275 +722,53 @@ class Thumb2Assembler FINAL : public ArmAssembler { CheckCondition(cond); } - // Branches. - // - // The thumb2 architecture allows branches to be either 16 or 32 bit instructions. This - // depends on both the type of branch and the offset to which it is branching. When - // generating code for branches we don't know the size before hand (if the branch is - // going forward, because we haven't seen the target address yet), so we need to assume - // that it is going to be one of 16 or 32 bits. When we know the target (the label is 'bound') - // we can determine the actual size of the branch. However, if we had guessed wrong before - // we knew the target there will be no room in the instruction sequence for the new - // instruction (assume that we never decrease the size of a branch). 
- // - // To handle this, we keep a record of every branch in the program. The actual instruction - // encoding for these is delayed until we know the final size of every branch. When we - // bind a label to a branch (we then know the target address) we determine if the branch - // has changed size. If it has we need to move all the instructions in the buffer after - // the branch point forward by the change in size of the branch. This will create a gap - // in the code big enough for the new branch encoding. However, since we have moved - // a chunk of code we need to relocate the branches in that code to their new address. - // - // Creating a hole in the code for the new branch encoding might cause another branch that was - // 16 bits to become 32 bits, so we need to find this in another pass. - // - // We also need to deal with a cbz/cbnz instruction that becomes too big for its offset - // range. We do this by converting it to two instructions: - // cmp Rn, #0 - // b<cond> target - // But we also need to handle the case where the conditional branch is out of range and - // becomes a 32 bit conditional branch. - // - // All branches have a 'branch id' which is a 16 bit unsigned number used to identify - // the branch. Unresolved labels use the branch id to link to the next unresolved branch. - - class Branch { - public: - // Branch type. - enum Type { - kUnconditional, // B. - kConditional, // B<cond>. - kCompareAndBranchZero, // cbz. - kCompareAndBranchNonZero, // cbnz. - kUnconditionalLink, // BL. - kUnconditionalLinkX, // BLX. - kUnconditionalX // BX. - }; - - // Calculated size of branch instruction based on type and offset. - enum Size { - k16Bit, - k32Bit - }; - - // Unresolved branch possibly with a condition. - Branch(const Thumb2Assembler* assembler, Type type, uint32_t location, Condition cond = AL) : - assembler_(assembler), type_(type), location_(location), - target_(kUnresolved), - cond_(cond), rn_(R0) { - CHECK(!IsCompareAndBranch()); - size_ = CalculateSize(); - } - - // Unresolved compare-and-branch instruction with a register. - Branch(const Thumb2Assembler* assembler, Type type, uint32_t location, Register rn) : - assembler_(assembler), type_(type), location_(location), - target_(kUnresolved), cond_(AL), rn_(rn) { - CHECK(IsCompareAndBranch()); - size_ = CalculateSize(); - } - - // Resolved branch (can't be compare-and-branch) with a target and possibly a condition. - Branch(const Thumb2Assembler* assembler, Type type, uint32_t location, uint32_t target, - Condition cond = AL) : - assembler_(assembler), type_(type), location_(location), - target_(target), cond_(cond), rn_(R0) { - CHECK(!IsCompareAndBranch()); - // Resolved branch. - size_ = CalculateSize(); - } - - bool IsCompareAndBranch() const { - return type_ == kCompareAndBranchNonZero || type_ == kCompareAndBranchZero; - } - - // Resolve a branch when the target is known. If this causes the - // size of the branch to change return true. Otherwise return false. - bool Resolve(uint32_t target) { - uint32_t old_target = target_; - target_ = target; - if (assembler_->CanRelocateBranches()) { - Size new_size = CalculateSize(); - if (size_ != new_size) { - size_ = new_size; - return true; - } - return false; - } else { - if (kIsDebugBuild) { - if (old_target == kUnresolved) { - // Check that the size has not increased. - DCHECK(!(CalculateSize() == k32Bit && size_ == k16Bit)); - } else { - DCHECK(CalculateSize() == size_); - } - } - return false; - } - } - - // Move a cbz/cbnz branch. This is always forward. 
- void Move(int32_t delta) { - CHECK(IsCompareAndBranch()); - CHECK_GT(delta, 0); - location_ += delta; - target_ += delta; - } - - // Relocate a branch by a given delta. This changed the location and - // target if they need to be changed. It also recalculates the - // size of the branch instruction. It returns true if the branch - // has changed size. - bool Relocate(uint32_t oldlocation, int32_t delta) { - DCHECK(assembler_->CanRelocateBranches()); - if (location_ > oldlocation) { - location_ += delta; - } - if (target_ != kUnresolved) { - if (target_ > oldlocation) { - target_ += delta; - } - } else { - return false; // Don't know the size yet. - } - - // Calculate the new size. - Size new_size = CalculateSize(); - if (size_ != new_size) { - size_ = new_size; - return true; - } - return false; - } - - Size GetSize() const { - return size_; - } - - Type GetType() const { - return type_; - } - - uint32_t GetLocation() const { - return location_; - } - - // Emit the branch instruction into the assembler buffer. This does the - // encoding into the thumb instruction. - void Emit(AssemblerBuffer* buffer) const; - - // Reset the type and condition to those given. This used for - // cbz/cbnz instructions when they are converted to cmp/b<cond> - void ResetTypeAndCondition(Type type, Condition cond) { - CHECK(IsCompareAndBranch()); - CHECK(cond == EQ || cond == NE); - type_ = type; - cond_ = cond; - } - - Register GetRegister() const { - return rn_; - } - - void ResetSize(Size size) { - size_ = size; - } - - private: - // Calculate the size of the branch instruction based on its type and offset. - Size CalculateSize() const { - if (target_ == kUnresolved) { - if (assembler_->IsForced32Bit() && (type_ == kUnconditional || type_ == kConditional)) { - return k32Bit; - } - if (IsCompareAndBranch()) { - // Compare and branch instructions can only be encoded on 16 bits. - return k16Bit; - } - return assembler_->CanRelocateBranches() ? k16Bit : k32Bit; - } - // When the target is resolved, we know the best encoding for it. - int32_t delta = target_ - location_ - 4; - if (delta < 0) { - delta = -delta; - } - switch (type_) { - case kUnconditional: - if (assembler_->IsForced32Bit() || delta >= (1 << 11)) { - return k32Bit; - } else { - return k16Bit; - } - case kConditional: - if (assembler_->IsForced32Bit() || delta >= (1 << 8)) { - return k32Bit; - } else { - return k16Bit; - } - case kCompareAndBranchZero: - case kCompareAndBranchNonZero: - if (delta >= (1 << 7)) { - return k32Bit; // Will cause this branch to become invalid. - } - return k16Bit; - - case kUnconditionalX: - case kUnconditionalLinkX: - return k16Bit; - case kUnconditionalLink: - return k32Bit; - } - LOG(FATAL) << "Cannot reach"; - return k16Bit; - } - - static constexpr uint32_t kUnresolved = 0xffffffff; // Value for target_ for unresolved. - const Thumb2Assembler* assembler_; - Type type_; - uint32_t location_; // Offset into assembler buffer in bytes. - uint32_t target_; // Offset into assembler buffer in bytes. - Size size_; - Condition cond_; - const Register rn_; - }; - - std::vector<Branch*> branches_; - - // Add a resolved branch and return its size. - Branch::Size AddBranch(Branch::Type type, uint32_t location, uint32_t target, - Condition cond = AL) { - branches_.push_back(new Branch(this, type, location, target, cond)); - return branches_[branches_.size()-1]->GetSize(); - } - - // Add a compare and branch (with a register) and return its id. 
- uint16_t AddBranch(Branch::Type type, uint32_t location, Register rn) { - branches_.push_back(new Branch(this, type, location, rn)); - return branches_.size() - 1; + FixupId AddFixup(Fixup fixup) { + FixupId fixup_id = static_cast<FixupId>(fixups_.size()); + fixups_.push_back(fixup); + // For iterating using FixupId, we need the next id to be representable. + DCHECK_EQ(static_cast<size_t>(static_cast<FixupId>(fixups_.size())), fixups_.size()); + return fixup_id; } - // Add an unresolved branch and return its id. - uint16_t AddBranch(Branch::Type type, - uint32_t location, - Condition cond = AL, - bool is_near = false) { - Branch* branch = new Branch(this, type, location, cond); - if (is_near) { - branch->ResetSize(Branch::k16Bit); - } - branches_.push_back(branch); - return branches_.size() - 1; - } - - Branch* GetBranch(uint16_t branchid) { - if (branchid >= branches_.size()) { - return nullptr; - } - return branches_[branchid]; + Fixup* GetFixup(FixupId fixup_id) { + DCHECK_LT(fixup_id, fixups_.size()); + return &fixups_[fixup_id]; } - void EmitBranches(); - void MakeHoleForBranch(uint32_t location, uint32_t size); + void BindLabel(Label* label, uint32_t bound_pc); + void BindLiterals(); + void AdjustFixupIfNeeded(Fixup* fixup, uint32_t* current_code_size, + std::deque<FixupId>* fixups_to_recalculate); + uint32_t AdjustFixups(); + void EmitFixups(uint32_t adjusted_code_size); + void EmitLiterals(); + + static int16_t BEncoding16(int32_t offset, Condition cond); + static int32_t BEncoding32(int32_t offset, Condition cond); + static int16_t CbxzEncoding16(Register rn, int32_t offset, Condition cond); + static int16_t CmpRnImm8Encoding16(Register rn, int32_t value); + static int16_t AddRdnRmEncoding16(Register rdn, Register rm); + static int32_t MovwEncoding32(Register rd, int32_t value); + static int32_t MovtEncoding32(Register rd, int32_t value); + static int32_t MovModImmEncoding32(Register rd, int32_t value); + static int16_t LdrLitEncoding16(Register rt, int32_t offset); + static int32_t LdrLitEncoding32(Register rt, int32_t offset); + static int32_t LdrdEncoding32(Register rt, Register rt2, Register rn, int32_t offset); + static int32_t VldrsEncoding32(SRegister sd, Register rn, int32_t offset); + static int32_t VldrdEncoding32(DRegister dd, Register rn, int32_t offset); + static int16_t LdrRtRnImm5Encoding16(Register rt, Register rn, int32_t offset); + static int32_t LdrRtRnImm12Encoding(Register rt, Register rn, int32_t offset); + + std::vector<Fixup> fixups_; + + // Use std::deque<> for literal labels to allow insertions at the end + // without invalidating pointers and references to existing elements. + std::deque<Literal> literals_; + + // Data for AdjustedPosition(), see the description there. 
+ uint32_t last_position_adjustment_; + uint32_t last_old_position_; + FixupId last_fixup_id_; }; } // namespace arm diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc index 733441b889..68b7931a0c 100644 --- a/compiler/utils/arm/assembler_thumb2_test.cc +++ b/compiler/utils/arm/assembler_thumb2_test.cc @@ -78,13 +78,20 @@ class AssemblerThumb2Test : public AssemblerTest<arm::Thumb2Assembler, return imm_value; } + std::string RepeatInsn(size_t count, const std::string& insn) { + std::string result; + for (; count != 0u; --count) { + result += insn; + } + return result; + } + private: std::vector<arm::Register*> registers_; static constexpr const char* kThumb2AssemblyHeader = ".syntax unified\n.thumb\n"; }; - TEST_F(AssemblerThumb2Test, Toolchain) { EXPECT_TRUE(CheckTools()); } @@ -370,4 +377,577 @@ TEST_F(AssemblerThumb2Test, StoreWordPairToNonThumbOffset) { DriverStr(expected, "StoreWordPairToNonThumbOffset"); } +TEST_F(AssemblerThumb2Test, TwoCbzMaxOffset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 63; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 64; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cbz r0, 1f\n" + // cbz r0, label1 + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cbz r0, 2f\n" // cbz r0, label2 + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzMaxOffset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 0u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 0u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 0u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, TwoCbzBeyondMaxOffset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 63; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 65; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cmp r0, #0\n" // cbz r0, label1 + "beq.n 1f\n" + + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cmp r0, #0\n" // cbz r0, label2 + "beq.n 2f\n" + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzBeyondMaxOffset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 2u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 4u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 4u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, TwoCbzSecondAtMaxB16Offset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 62; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 128; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ 
ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cbz r0, 1f\n" + // cbz r0, label1 + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cmp r0, #0\n" // cbz r0, label2 + "beq.n 2f\n" + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzSecondAtMaxB16Offset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 0u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 2u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 2u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, TwoCbzSecondBeyondMaxB16Offset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 62; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 129; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cmp r0, #0\n" // cbz r0, label1 + "beq.n 1f\n" + + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cmp r0, #0\n" // cbz r0, label2 + "beq.w 2f\n" + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzSecondBeyondMaxB16Offset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 2u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 6u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 6u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, TwoCbzFirstAtMaxB16Offset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 127; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 64; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cmp r0, #0\n" // cbz r0, label1 + "beq.n 1f\n" + + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cbz r0, 2f\n" // cbz r0, label2 + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzFirstAtMaxB16Offset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 2u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 2u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 2u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, TwoCbzFirstBeyondMaxB16Offset) { + Label label0, label1, label2; + __ cbz(arm::R0, &label1); + constexpr size_t kLdrR0R0Count1 = 127; + for (size_t i = 0; i != kLdrR0R0Count1; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label0); + __ cbz(arm::R0, &label2); + __ Bind(&label1); + constexpr size_t kLdrR0R0Count2 = 65; + for (size_t i = 0; i != kLdrR0R0Count2; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + __ Bind(&label2); + + std::string expected = + "cmp r0, #0\n" // cbz r0, label1 + "beq.w 1f\n" + + RepeatInsn(kLdrR0R0Count1, "ldr r0, [r0]\n") + + "0:\n" + "cmp r0, #0\n" // cbz r0, label2 + "beq.n 
2f\n" + "1:\n" + + RepeatInsn(kLdrR0R0Count2, "ldr r0, [r0]\n") + + "2:\n"; + DriverStr(expected, "TwoCbzFirstBeyondMaxB16Offset"); + + EXPECT_EQ(static_cast<uint32_t>(label0.Position()) + 4u, + __ GetAdjustedPosition(label0.Position())); + EXPECT_EQ(static_cast<uint32_t>(label1.Position()) + 6u, + __ GetAdjustedPosition(label1.Position())); + EXPECT_EQ(static_cast<uint32_t>(label2.Position()) + 6u, + __ GetAdjustedPosition(label2.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralMax1KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R0, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 511; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "1:\n" + "ldr.n r0, [pc, #((2f - 1b - 2) & ~2)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralMax1KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 0u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralBeyondMax1KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R0, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 512; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "1:\n" + "ldr.w r0, [pc, #((2f - 1b - 2) & ~2)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralBeyondMax1KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 2u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralMax4KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 2046; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "1:\n" + "ldr.w r1, [pc, #((2f - 1b - 2) & ~2)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralMax4KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 2u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralBeyondMax4KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 2047; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "movw r1, #4096\n" // "as" does not consider (2f - 1f - 4) a constant expression for movw. 
+ "1:\n" + "add r1, pc\n" + "ldr r1, [r1, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralBeyondMax4KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralMax64KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1u << 15) - 2u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "movw r1, #0xfffc\n" // "as" does not consider (2f - 1f - 4) a constant expression for movw. + "1:\n" + "add r1, pc\n" + "ldr r1, [r1, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralMax64KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralBeyondMax64KiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1u << 15) - 1u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "mov.w r1, #((2f - 1f - 4) & ~0xfff)\n" + "1:\n" + "add r1, pc\n" + "ldr r1, [r1, #((2f - 1b - 4) & 0xfff)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralBeyondMax64KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 8u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralMax1MiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1u << 19) - 3u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "mov.w r1, #((2f - 1f - 4) & ~0xfff)\n" + "1:\n" + "add r1, pc\n" + "ldr r1, [r1, #((2f - 1b - 4) & 0xfff)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralMax1MiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 8u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralBeyondMax1MiB) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1u << 19) - 2u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw. + "movw r1, #(0x100000 & 0xffff)\n" + // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt. 
+ "movt r1, #(0x100000 >> 16)\n" + "1:\n" + "add r1, pc\n" + "ldr.w r1, [r1, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralBeyondMax1MiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 12u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralFar) { + arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::R1, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1u << 19) - 2u + 0x1234; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw. + "movw r1, #((0x100000 + 2 * 0x1234) & 0xffff)\n" + // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt. + "movt r1, #((0x100000 + 2 * 0x1234) >> 16)\n" + "1:\n" + "add r1, pc\n" + "ldr.w r1, [r1, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralFar"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 12u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralWideMax1KiB) { + arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321)); + __ LoadLiteral(arm::R1, arm::R3, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 510; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "1:\n" + "ldrd r1, r3, [pc, #((2f - 1b - 2) & ~2)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x87654321\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralWideMax1KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 0u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralWideBeyondMax1KiB) { + arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321)); + __ LoadLiteral(arm::R1, arm::R3, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = 511; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n" + "1:\n" + "add ip, pc\n" + "ldrd r1, r3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x87654321\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralWideBeyondMax1KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax256KiB) { + // The literal size must match but the type doesn't, so use an int32_t rather than float. 
+ arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678); + __ LoadLiteral(arm::S3, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1 << 17) - 3u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n" + "1:\n" + "add ip, pc\n" + "vldr s3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralSingleMax256KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralDoubleBeyondMax256KiB) { + // The literal size must match but the type doesn't, so use an int64_t rather than double. + arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321)); + __ LoadLiteral(arm::D3, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1 << 17) - 2u; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw. + "movw ip, #(0x40000 & 0xffff)\n" + // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt. + "movt ip, #(0x40000 >> 16)\n" + "1:\n" + "add ip, pc\n" + "vldr d3, [ip, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x87654321\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralDoubleBeyondMax256KiB"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 10u, + __ GetAdjustedPosition(label.Position())); +} + +TEST_F(AssemblerThumb2Test, LoadLiteralDoubleFar) { + // The literal size must match but the type doesn't, so use an int64_t rather than double. + arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321)); + __ LoadLiteral(arm::D3, literal); + Label label; + __ Bind(&label); + constexpr size_t kLdrR0R0Count = (1 << 17) - 2u + 0x1234; + for (size_t i = 0; i != kLdrR0R0Count; ++i) { + __ ldr(arm::R0, arm::Address(arm::R0)); + } + + std::string expected = + // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw. + "movw ip, #((0x40000 + 2 * 0x1234) & 0xffff)\n" + // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt. 
+ "movt ip, #((0x40000 + 2 * 0x1234) >> 16)\n" + "1:\n" + "add ip, pc\n" + "vldr d3, [ip, #0]\n" + + RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") + + ".align 2, 0\n" + "2:\n" + ".word 0x87654321\n" + ".word 0x12345678\n"; + DriverStr(expected, "LoadLiteralDoubleFar"); + + EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 10u, + __ GetAdjustedPosition(label.Position())); +} + } // namespace art diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc index cc78002ab0..eb8de0620b 100644 --- a/compiler/utils/arm64/assembler_arm64.cc +++ b/compiler/utils/arm64/assembler_arm64.cc @@ -31,7 +31,7 @@ namespace arm64 { #define ___ vixl_masm_-> #endif -void Arm64Assembler::EmitSlowPaths() { +void Arm64Assembler::FinalizeCode() { if (!exception_blocks_.empty()) { for (size_t i = 0; i < exception_blocks_.size(); i++) { EmitExceptionPoll(exception_blocks_.at(i)); diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h index fa9faed66b..b53c11bc24 100644 --- a/compiler/utils/arm64/assembler_arm64.h +++ b/compiler/utils/arm64/assembler_arm64.h @@ -73,8 +73,8 @@ class Arm64Assembler FINAL : public Assembler { delete vixl_masm_; } - // Emit slow paths queued during assembly. - void EmitSlowPaths(); + // Finalize the code. + void FinalizeCode() OVERRIDE; // Size of generated code. size_t CodeSize() const; diff --git a/compiler/utils/assembler.cc b/compiler/utils/assembler.cc index b016e74aba..6d8a98931f 100644 --- a/compiler/utils/assembler.cc +++ b/compiler/utils/assembler.cc @@ -80,10 +80,11 @@ void AssemblerBuffer::FinalizeInstructions(const MemoryRegion& instructions) { } -void AssemblerBuffer::ExtendCapacity() { +void AssemblerBuffer::ExtendCapacity(size_t min_capacity) { size_t old_size = Size(); size_t old_capacity = Capacity(); size_t new_capacity = std::min(old_capacity * 2, old_capacity + 1 * MB); + new_capacity = std::max(new_capacity, min_capacity); // Allocate the new data area and copy contents of the old one to it. uint8_t* new_contents = NewContents(new_capacity); diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h index 672e1503be..0381af3956 100644 --- a/compiler/utils/assembler.h +++ b/compiler/utils/assembler.h @@ -199,13 +199,18 @@ class AssemblerBuffer { *reinterpret_cast<T*>(contents_ + position) = value; } - void Move(size_t newposition, size_t oldposition) { - CHECK(HasEnsuredCapacity()); - // Move the contents of the buffer from oldposition to - // newposition by nbytes. - size_t nbytes = Size() - oldposition; - memmove(contents_ + newposition, contents_ + oldposition, nbytes); - cursor_ += newposition - oldposition; + void Resize(size_t new_size) { + if (new_size > Capacity()) { + ExtendCapacity(new_size); + } + cursor_ = contents_ + new_size; + } + + void Move(size_t newposition, size_t oldposition, size_t size) { + // Move a chunk of the buffer from oldposition to newposition. + DCHECK_LE(oldposition + size, Size()); + DCHECK_LE(newposition + size, Size()); + memmove(contents_ + newposition, contents_ + oldposition, size); } // Emit a fixup at the current location. 
@@ -350,7 +355,7 @@ class AssemblerBuffer { return data + capacity - kMinimumGap; } - void ExtendCapacity(); + void ExtendCapacity(size_t min_capacity = 0u); friend class AssemblerFixup; }; @@ -376,8 +381,8 @@ class Assembler { public: static Assembler* Create(InstructionSet instruction_set); - // Emit slow paths queued during assembly - virtual void EmitSlowPaths() { buffer_.EmitSlowPaths(this); } + // Finalize the code; emit slow paths, fixup branches, add literal pool, etc. + virtual void FinalizeCode() { buffer_.EmitSlowPaths(this); } // Size of generated code virtual size_t CodeSize() const { return buffer_.Size(); } diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h index a339633efe..017402dbd3 100644 --- a/compiler/utils/assembler_test.h +++ b/compiler/utils/assembler_test.h @@ -544,6 +544,7 @@ class AssemblerTest : public testing::Test { } void DriverWrapper(std::string assembly_text, std::string test_name) { + assembler_->FinalizeCode(); size_t cs = assembler_->CodeSize(); std::unique_ptr<std::vector<uint8_t>> data(new std::vector<uint8_t>(cs)); MemoryRegion code(&(*data)[0], data->size()); diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc index 1a2c9a9000..20f61f942b 100644 --- a/compiler/utils/assembler_thumb_test.cc +++ b/compiler/utils/assembler_thumb_test.cc @@ -65,20 +65,33 @@ int CompareIgnoringSpace(const char* s1, const char* s2) { return *s1 - *s2; } -void dump(std::vector<uint8_t>& code, const char* testname) { - // This will only work on the host. There is no as, objcopy or objdump on the - // device. +void InitResults() { + if (test_results.empty()) { + setup_results(); + } +} + +std::string GetToolsDir() { #ifndef HAVE_ANDROID_OS - static bool results_ok = false; + // This will only work on the host. There is no as, objcopy or objdump on the device. static std::string toolsdir; - if (!results_ok) { + if (toolsdir.empty()) { setup_results(); toolsdir = CommonRuntimeTest::GetAndroidTargetToolsDir(kThumb2); SetAndroidData(); - results_ok = true; } + return toolsdir; +#else + return std::string(); +#endif +} + +void DumpAndCheck(std::vector<uint8_t>& code, const char* testname, const char* const* results) { +#ifndef HAVE_ANDROID_OS + static std::string toolsdir = GetToolsDir(); + ScratchFile file; const char* filename = file.GetFilename().c_str(); @@ -130,9 +143,6 @@ void dump(std::vector<uint8_t>& code, const char* testname) { FILE *fp = popen(cmd, "r"); ASSERT_TRUE(fp != nullptr); - std::map<std::string, const char**>::iterator results = test_results.find(testname); - ASSERT_NE(results, test_results.end()); - uint32_t lineindex = 0; while (!feof(fp)) { @@ -141,14 +151,14 @@ void dump(std::vector<uint8_t>& code, const char* testname) { if (s == nullptr) { break; } - if (CompareIgnoringSpace(results->second[lineindex], testline) != 0) { + if (CompareIgnoringSpace(results[lineindex], testline) != 0) { LOG(FATAL) << "Output is not as expected at line: " << lineindex - << results->second[lineindex] << "/" << testline; + << results[lineindex] << "/" << testline; } ++lineindex; } // Check that we are at the end. 
- ASSERT_TRUE(results->second[lineindex] == nullptr); + ASSERT_TRUE(results[lineindex] == nullptr); fclose(fp); } @@ -163,8 +173,31 @@ void dump(std::vector<uint8_t>& code, const char* testname) { #define __ assembler-> +void EmitAndCheck(arm::Thumb2Assembler* assembler, const char* testname, + const char* const* results) { + __ FinalizeCode(); + size_t cs = __ CodeSize(); + std::vector<uint8_t> managed_code(cs); + MemoryRegion code(&managed_code[0], managed_code.size()); + __ FinalizeInstructions(code); + + DumpAndCheck(managed_code, testname, results); +} + +void EmitAndCheck(arm::Thumb2Assembler* assembler, const char* testname) { + InitResults(); + std::map<std::string, const char* const*>::iterator results = test_results.find(testname); + ASSERT_NE(results, test_results.end()); + + EmitAndCheck(assembler, testname, results->second); +} + +#undef __ + +#define __ assembler. + TEST(Thumb2AssemblerTest, SimpleMov) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(R1)); __ mov(R8, ShifterOperand(R9)); @@ -172,46 +205,31 @@ TEST(Thumb2AssemblerTest, SimpleMov) { __ mov(R0, ShifterOperand(1)); __ mov(R8, ShifterOperand(9)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SimpleMov"); - delete assembler; + EmitAndCheck(&assembler, "SimpleMov"); } TEST(Thumb2AssemblerTest, SimpleMov32) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); - assembler->Force32Bit(); + arm::Thumb2Assembler assembler; + __ Force32Bit(); __ mov(R0, ShifterOperand(R1)); __ mov(R8, ShifterOperand(R9)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SimpleMov32"); - delete assembler; + EmitAndCheck(&assembler, "SimpleMov32"); } TEST(Thumb2AssemblerTest, SimpleMovAdd) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(R1)); __ add(R0, R1, ShifterOperand(R2)); __ add(R0, R1, ShifterOperand()); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SimpleMovAdd"); - delete assembler; + EmitAndCheck(&assembler, "SimpleMovAdd"); } TEST(Thumb2AssemblerTest, DataProcessingRegister) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(R1)); __ mvn(R0, ShifterOperand(R1)); @@ -249,16 +267,11 @@ TEST(Thumb2AssemblerTest, DataProcessingRegister) { // 32 bit variants. 
__ add(R12, R1, ShifterOperand(R0)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "DataProcessingRegister"); - delete assembler; + EmitAndCheck(&assembler, "DataProcessingRegister"); } TEST(Thumb2AssemblerTest, DataProcessingImmediate) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(0x55)); __ mvn(R0, ShifterOperand(0x55)); @@ -283,16 +296,11 @@ TEST(Thumb2AssemblerTest, DataProcessingImmediate) { __ movs(R0, ShifterOperand(0x55)); __ mvns(R0, ShifterOperand(0x55)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "DataProcessingImmediate"); - delete assembler; + EmitAndCheck(&assembler, "DataProcessingImmediate"); } TEST(Thumb2AssemblerTest, DataProcessingModifiedImmediate) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(0x550055)); __ mvn(R0, ShifterOperand(0x550055)); @@ -311,17 +319,12 @@ TEST(Thumb2AssemblerTest, DataProcessingModifiedImmediate) { __ cmp(R0, ShifterOperand(0x550055)); __ cmn(R0, ShifterOperand(0x550055)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "DataProcessingModifiedImmediate"); - delete assembler; + EmitAndCheck(&assembler, "DataProcessingModifiedImmediate"); } TEST(Thumb2AssemblerTest, DataProcessingModifiedImmediates) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R0, ShifterOperand(0x550055)); __ mov(R0, ShifterOperand(0x55005500)); @@ -331,16 +334,11 @@ TEST(Thumb2AssemblerTest, DataProcessingModifiedImmediates) { __ mov(R0, ShifterOperand(0x350)); // rotated to 2nd last position __ mov(R0, ShifterOperand(0x1a8)); // rotated to last position - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "DataProcessingModifiedImmediates"); - delete assembler; + EmitAndCheck(&assembler, "DataProcessingModifiedImmediates"); } TEST(Thumb2AssemblerTest, DataProcessingShiftedRegister) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mov(R3, ShifterOperand(R4, LSL, 4)); __ mov(R3, ShifterOperand(R4, LSR, 5)); @@ -355,17 +353,12 @@ TEST(Thumb2AssemblerTest, DataProcessingShiftedRegister) { __ mov(R8, ShifterOperand(R4, ROR, 7)); __ mov(R8, ShifterOperand(R4, RRX)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "DataProcessingShiftedRegister"); - delete assembler; + EmitAndCheck(&assembler, "DataProcessingShiftedRegister"); } TEST(Thumb2AssemblerTest, BasicLoad) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ ldr(R3, Address(R4, 24)); __ ldrb(R3, Address(R4, 24)); @@ -382,17 +375,12 @@ TEST(Thumb2AssemblerTest, 
BasicLoad) { __ ldrsb(R8, Address(R4, 24)); __ ldrsh(R8, Address(R4, 24)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "BasicLoad"); - delete assembler; + EmitAndCheck(&assembler, "BasicLoad"); } TEST(Thumb2AssemblerTest, BasicStore) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ str(R3, Address(R4, 24)); __ strb(R3, Address(R4, 24)); @@ -405,16 +393,11 @@ TEST(Thumb2AssemblerTest, BasicStore) { __ strb(R8, Address(R4, 24)); __ strh(R8, Address(R4, 24)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "BasicStore"); - delete assembler; + EmitAndCheck(&assembler, "BasicStore"); } TEST(Thumb2AssemblerTest, ComplexLoad) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ ldr(R3, Address(R4, 24, Address::Mode::Offset)); __ ldr(R3, Address(R4, 24, Address::Mode::PreIndex)); @@ -451,17 +434,12 @@ TEST(Thumb2AssemblerTest, ComplexLoad) { __ ldrsh(R3, Address(R4, 24, Address::Mode::NegPreIndex)); __ ldrsh(R3, Address(R4, 24, Address::Mode::NegPostIndex)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "ComplexLoad"); - delete assembler; + EmitAndCheck(&assembler, "ComplexLoad"); } TEST(Thumb2AssemblerTest, ComplexStore) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ str(R3, Address(R4, 24, Address::Mode::Offset)); __ str(R3, Address(R4, 24, Address::Mode::PreIndex)); @@ -484,16 +462,11 @@ TEST(Thumb2AssemblerTest, ComplexStore) { __ strh(R3, Address(R4, 24, Address::Mode::NegPreIndex)); __ strh(R3, Address(R4, 24, Address::Mode::NegPostIndex)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "ComplexStore"); - delete assembler; + EmitAndCheck(&assembler, "ComplexStore"); } TEST(Thumb2AssemblerTest, NegativeLoadStore) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ ldr(R3, Address(R4, -24, Address::Mode::Offset)); __ ldr(R3, Address(R4, -24, Address::Mode::PreIndex)); @@ -551,30 +524,20 @@ TEST(Thumb2AssemblerTest, NegativeLoadStore) { __ strh(R3, Address(R4, -24, Address::Mode::NegPreIndex)); __ strh(R3, Address(R4, -24, Address::Mode::NegPostIndex)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "NegativeLoadStore"); - delete assembler; + EmitAndCheck(&assembler, "NegativeLoadStore"); } TEST(Thumb2AssemblerTest, SimpleLoadStoreDual) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ strd(R2, Address(R0, 24, Address::Mode::Offset)); __ ldrd(R2, Address(R0, 24, Address::Mode::Offset)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion 
code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SimpleLoadStoreDual"); - delete assembler; + EmitAndCheck(&assembler, "SimpleLoadStoreDual"); } TEST(Thumb2AssemblerTest, ComplexLoadStoreDual) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ strd(R2, Address(R0, 24, Address::Mode::Offset)); __ strd(R2, Address(R0, 24, Address::Mode::PreIndex)); @@ -590,16 +553,11 @@ TEST(Thumb2AssemblerTest, ComplexLoadStoreDual) { __ ldrd(R2, Address(R0, 24, Address::Mode::NegPreIndex)); __ ldrd(R2, Address(R0, 24, Address::Mode::NegPostIndex)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "ComplexLoadStoreDual"); - delete assembler; + EmitAndCheck(&assembler, "ComplexLoadStoreDual"); } TEST(Thumb2AssemblerTest, NegativeLoadStoreDual) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ strd(R2, Address(R0, -24, Address::Mode::Offset)); __ strd(R2, Address(R0, -24, Address::Mode::PreIndex)); @@ -615,16 +573,11 @@ TEST(Thumb2AssemblerTest, NegativeLoadStoreDual) { __ ldrd(R2, Address(R0, -24, Address::Mode::NegPreIndex)); __ ldrd(R2, Address(R0, -24, Address::Mode::NegPostIndex)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "NegativeLoadStoreDual"); - delete assembler; + EmitAndCheck(&assembler, "NegativeLoadStoreDual"); } TEST(Thumb2AssemblerTest, SimpleBranch) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ mov(R0, ShifterOperand(2)); @@ -658,17 +611,12 @@ TEST(Thumb2AssemblerTest, SimpleBranch) { __ Bind(&l5); __ mov(R0, ShifterOperand(6)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SimpleBranch"); - delete assembler; + EmitAndCheck(&assembler, "SimpleBranch"); } TEST(Thumb2AssemblerTest, LongBranch) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); - assembler->Force32Bit(); + arm::Thumb2Assembler assembler; + __ Force32Bit(); // 32 bit branches. Label l1; __ mov(R0, ShifterOperand(2)); @@ -703,16 +651,11 @@ TEST(Thumb2AssemblerTest, LongBranch) { __ Bind(&l5); __ mov(R0, ShifterOperand(6)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "LongBranch"); - delete assembler; + EmitAndCheck(&assembler, "LongBranch"); } TEST(Thumb2AssemblerTest, LoadMultiple) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; // 16 bit. 
__ ldm(DB_W, R4, (1 << R0 | 1 << R3)); @@ -724,16 +667,11 @@ TEST(Thumb2AssemblerTest, LoadMultiple) { // Single reg is converted to ldr __ ldm(DB_W, R4, (1 << R5)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "LoadMultiple"); - delete assembler; + EmitAndCheck(&assembler, "LoadMultiple"); } TEST(Thumb2AssemblerTest, StoreMultiple) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; // 16 bit. __ stm(IA_W, R4, (1 << R0 | 1 << R3)); @@ -746,16 +684,11 @@ TEST(Thumb2AssemblerTest, StoreMultiple) { __ stm(IA_W, R4, (1 << R5)); __ stm(IA, R4, (1 << R5)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "StoreMultiple"); - delete assembler; + EmitAndCheck(&assembler, "StoreMultiple"); } TEST(Thumb2AssemblerTest, MovWMovT) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ movw(R4, 0); // 16 bit. __ movw(R4, 0x34); // 16 bit. @@ -768,16 +701,11 @@ TEST(Thumb2AssemblerTest, MovWMovT) { __ movt(R0, 0x1234); __ movt(R1, 0xffff); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "MovWMovT"); - delete assembler; + EmitAndCheck(&assembler, "MovWMovT"); } TEST(Thumb2AssemblerTest, SpecialAddSub) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ add(R2, SP, ShifterOperand(0x50)); // 16 bit. __ add(SP, SP, ShifterOperand(0x50)); // 16 bit. @@ -792,16 +720,11 @@ TEST(Thumb2AssemblerTest, SpecialAddSub) { __ sub(SP, SP, ShifterOperand(0xf00)); // 32 bit due to imm size - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "SpecialAddSub"); - delete assembler; + EmitAndCheck(&assembler, "SpecialAddSub"); } TEST(Thumb2AssemblerTest, StoreToOffset) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ StoreToOffset(kStoreWord, R2, R4, 12); // Simple __ StoreToOffset(kStoreWord, R2, R4, 0x2000); // Offset too big. 
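The rewritten driver above funnels every test through EmitAndCheck(), and the ordering it establishes is that FinalizeCode() runs before CodeSize(): once slow paths, branch fixups and literal pools can add bytes after the last emitted instruction, querying the size first could under-allocate the output buffer. A short sketch of the required sequence, assuming an already filled Assembler* named `assembler` (the names mirror DriverWrapper/EmitAndCheck above and are illustrative, not new API):

    assembler->FinalizeCode();                  // Emit slow paths, fix up branches, add the literal pool.
    size_t size = assembler->CodeSize();        // The size is only final after FinalizeCode().
    std::vector<uint8_t> code(size);
    MemoryRegion region(&code[0], code.size());
    assembler->FinalizeInstructions(region);    // Copy the finished instructions into the region.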
@@ -809,17 +732,12 @@ TEST(Thumb2AssemblerTest, StoreToOffset) { __ StoreToOffset(kStoreHalfword, R0, R12, 12); __ StoreToOffset(kStoreByte, R2, R12, 12); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "StoreToOffset"); - delete assembler; + EmitAndCheck(&assembler, "StoreToOffset"); } TEST(Thumb2AssemblerTest, IfThen) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ it(EQ); __ mov(R1, ShifterOperand(1), EQ); @@ -848,16 +766,11 @@ TEST(Thumb2AssemblerTest, IfThen) { __ mov(R3, ShifterOperand(3), EQ); __ mov(R4, ShifterOperand(4), NE); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "IfThen"); - delete assembler; + EmitAndCheck(&assembler, "IfThen"); } TEST(Thumb2AssemblerTest, CbzCbnz) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ cbz(R2, &l1); @@ -873,16 +786,11 @@ TEST(Thumb2AssemblerTest, CbzCbnz) { __ Bind(&l2); __ mov(R2, ShifterOperand(4)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "CbzCbnz"); - delete assembler; + EmitAndCheck(&assembler, "CbzCbnz"); } TEST(Thumb2AssemblerTest, Multiply) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ mul(R0, R1, R0); __ mul(R0, R1, R2); @@ -898,16 +806,11 @@ TEST(Thumb2AssemblerTest, Multiply) { __ umull(R0, R1, R2, R3); __ umull(R8, R9, R10, R11); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Multiply"); - delete assembler; + EmitAndCheck(&assembler, "Multiply"); } TEST(Thumb2AssemblerTest, Divide) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ sdiv(R0, R1, R2); __ sdiv(R8, R9, R10); @@ -915,16 +818,11 @@ TEST(Thumb2AssemblerTest, Divide) { __ udiv(R0, R1, R2); __ udiv(R8, R9, R10); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Divide"); - delete assembler; + EmitAndCheck(&assembler, "Divide"); } TEST(Thumb2AssemblerTest, VMov) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ vmovs(S1, 1.0); __ vmovd(D1, 1.0); @@ -932,17 +830,12 @@ TEST(Thumb2AssemblerTest, VMov) { __ vmovs(S1, S2); __ vmovd(D1, D2); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "VMov"); - delete assembler; + EmitAndCheck(&assembler, "VMov"); } TEST(Thumb2AssemblerTest, BasicFloatingPoint) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ vadds(S0, S1, S2); __ vsubs(S0, S1, S2); @@ -964,16 +857,11 @@ 
TEST(Thumb2AssemblerTest, BasicFloatingPoint) { __ vnegd(D0, D1); __ vsqrtd(D0, D1); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "BasicFloatingPoint"); - delete assembler; + EmitAndCheck(&assembler, "BasicFloatingPoint"); } TEST(Thumb2AssemblerTest, FloatingPointConversions) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ vcvtsd(S2, D2); __ vcvtds(D2, S2); @@ -990,16 +878,11 @@ TEST(Thumb2AssemblerTest, FloatingPointConversions) { __ vcvtud(S1, D2); __ vcvtdu(D1, S2); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "FloatingPointConversions"); - delete assembler; + EmitAndCheck(&assembler, "FloatingPointConversions"); } TEST(Thumb2AssemblerTest, FloatingPointComparisons) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ vcmps(S0, S1); __ vcmpd(D0, D1); @@ -1007,57 +890,37 @@ TEST(Thumb2AssemblerTest, FloatingPointComparisons) { __ vcmpsz(S2); __ vcmpdz(D2); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "FloatingPointComparisons"); - delete assembler; + EmitAndCheck(&assembler, "FloatingPointComparisons"); } TEST(Thumb2AssemblerTest, Calls) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ blx(LR); __ bx(LR); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Calls"); - delete assembler; + EmitAndCheck(&assembler, "Calls"); } TEST(Thumb2AssemblerTest, Breakpoint) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ bkpt(0); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Breakpoint"); - delete assembler; + EmitAndCheck(&assembler, "Breakpoint"); } TEST(Thumb2AssemblerTest, StrR1) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ str(R1, Address(SP, 68)); __ str(R1, Address(SP, 1068)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "StrR1"); - delete assembler; + EmitAndCheck(&assembler, "StrR1"); } TEST(Thumb2AssemblerTest, VPushPop) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ vpushs(S2, 4); __ vpushd(D2, 4); @@ -1065,16 +928,11 @@ TEST(Thumb2AssemblerTest, VPushPop) { __ vpops(S2, 4); __ vpopd(D2, 4); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "VPushPop"); - delete assembler; + EmitAndCheck(&assembler, "VPushPop"); } 
TEST(Thumb2AssemblerTest, Max16BitBranch) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ b(&l1); @@ -1084,16 +942,11 @@ TEST(Thumb2AssemblerTest, Max16BitBranch) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Max16BitBranch"); - delete assembler; + EmitAndCheck(&assembler, "Max16BitBranch"); } TEST(Thumb2AssemblerTest, Branch32) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ b(&l1); @@ -1103,16 +956,11 @@ TEST(Thumb2AssemblerTest, Branch32) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Branch32"); - delete assembler; + EmitAndCheck(&assembler, "Branch32"); } TEST(Thumb2AssemblerTest, CompareAndBranchMax) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ cbz(R4, &l1); @@ -1122,16 +970,11 @@ TEST(Thumb2AssemblerTest, CompareAndBranchMax) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "CompareAndBranchMax"); - delete assembler; + EmitAndCheck(&assembler, "CompareAndBranchMax"); } TEST(Thumb2AssemblerTest, CompareAndBranchRelocation16) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ cbz(R4, &l1); @@ -1141,16 +984,11 @@ TEST(Thumb2AssemblerTest, CompareAndBranchRelocation16) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "CompareAndBranchRelocation16"); - delete assembler; + EmitAndCheck(&assembler, "CompareAndBranchRelocation16"); } TEST(Thumb2AssemblerTest, CompareAndBranchRelocation32) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; __ cbz(R4, &l1); @@ -1160,16 +998,11 @@ TEST(Thumb2AssemblerTest, CompareAndBranchRelocation32) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "CompareAndBranchRelocation32"); - delete assembler; + EmitAndCheck(&assembler, "CompareAndBranchRelocation32"); } TEST(Thumb2AssemblerTest, MixedBranch32) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; Label l1; Label l2; @@ -1184,16 +1017,11 @@ TEST(Thumb2AssemblerTest, MixedBranch32) { __ Bind(&l1); __ mov(R1, ShifterOperand(R2)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "MixedBranch32"); - 
delete assembler; + EmitAndCheck(&assembler, "MixedBranch32"); } TEST(Thumb2AssemblerTest, Shifts) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; // 16 bit __ Lsl(R0, R1, 5); @@ -1240,16 +1068,11 @@ TEST(Thumb2AssemblerTest, Shifts) { __ Lsr(R0, R8, R2, true); __ Asr(R0, R1, R8, true); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "Shifts"); - delete assembler; + EmitAndCheck(&assembler, "Shifts"); } TEST(Thumb2AssemblerTest, LoadStoreRegOffset) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; // 16 bit. __ ldr(R0, Address(R1, R2)); @@ -1272,16 +1095,11 @@ TEST(Thumb2AssemblerTest, LoadStoreRegOffset) { __ ldr(R0, Address(R1, R8)); __ str(R0, Address(R1, R8)); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "LoadStoreRegOffset"); - delete assembler; + EmitAndCheck(&assembler, "LoadStoreRegOffset"); } TEST(Thumb2AssemblerTest, LoadStoreLiteral) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ ldr(R0, Address(4)); __ str(R0, Address(4)); @@ -1295,16 +1113,11 @@ TEST(Thumb2AssemblerTest, LoadStoreLiteral) { __ str(R0, Address(0x3ff)); // 32 bit (no 16 bit str(literal)). __ str(R0, Address(0x7ff)); // 11 bits (32 bit). - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "LoadStoreLiteral"); - delete assembler; + EmitAndCheck(&assembler, "LoadStoreLiteral"); } TEST(Thumb2AssemblerTest, LoadStoreLimits) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; __ ldr(R0, Address(R4, 124)); // 16 bit. __ ldr(R0, Address(R4, 128)); // 32 bit. @@ -1330,30 +1143,20 @@ TEST(Thumb2AssemblerTest, LoadStoreLimits) { __ strh(R0, Address(R4, 62)); // 16 bit. __ strh(R0, Address(R4, 64)); // 32 bit. 
- size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "LoadStoreLimits"); - delete assembler; + EmitAndCheck(&assembler, "LoadStoreLimits"); } TEST(Thumb2AssemblerTest, CompareAndBranch) { - arm::Thumb2Assembler* assembler = static_cast<arm::Thumb2Assembler*>(Assembler::Create(kThumb2)); + arm::Thumb2Assembler assembler; - arm::NearLabel label; + Label label; __ CompareAndBranchIfZero(arm::R0, &label); __ CompareAndBranchIfZero(arm::R11, &label); __ CompareAndBranchIfNonZero(arm::R0, &label); __ CompareAndBranchIfNonZero(arm::R11, &label); __ Bind(&label); - size_t cs = __ CodeSize(); - std::vector<uint8_t> managed_code(cs); - MemoryRegion code(&managed_code[0], managed_code.size()); - __ FinalizeInstructions(code); - dump(managed_code, "CompareAndBranch"); - delete assembler; + EmitAndCheck(&assembler, "CompareAndBranch"); } #undef __ diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc index 841d6a00c0..280ed779b3 100644 --- a/compiler/utils/assembler_thumb_test_expected.cc.inc +++ b/compiler/utils/assembler_thumb_test_expected.cc.inc @@ -4832,7 +4832,7 @@ const char* CompareAndBranchResults[] = { nullptr }; -std::map<std::string, const char**> test_results; +std::map<std::string, const char* const*> test_results; void setup_results() { test_results["SimpleMov"] = SimpleMovResults; test_results["SimpleMov32"] = SimpleMov32Results; |