Diffstat (limited to 'compiler')
64 files changed, 8938 insertions, 644 deletions
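The dex_to_dex_compiler and compiler_driver hunks below collapse the dex-to-dex compilation levels from three to two: kRequired disappears, debuggable compilations now map to kDontDexToDexCompile, and the PerformOptimizations() predicate is dropped because reaching DexCompiler::Compile() at all now implies kOptimize (hence the DCHECK_EQ). A minimal standalone sketch of the resulting level selection, with simplified stand-in types rather than the actual driver API:

    #include <iostream>

    // Simplified stand-ins; the real types live in dex_to_dex_compiler.h.
    enum class DexToDexCompilationLevel {
      kDontDexToDexCompile,  // Only meaning wrt image time interpretation.
      kOptimize              // Perform peep-hole optimizations.
    };

    // With kRequired gone, a method is either quickened (class verified and
    // the compilation is not debuggable) or left untouched.
    DexToDexCompilationLevel GetLevel(bool debuggable, bool class_verified) {
      if (debuggable) {
        // Class definitions may change under a debugger; skip quickening.
        return DexToDexCompilationLevel::kDontDexToDexCompile;
      }
      return class_verified ? DexToDexCompilationLevel::kOptimize
                            : DexToDexCompilationLevel::kDontDexToDexCompile;
    }

    int main() {
      std::cout << (GetLevel(false, true) == DexToDexCompilationLevel::kOptimize)  // 1
                << (GetLevel(true, true) == DexToDexCompilationLevel::kOptimize)   // 0
                << '\n';
    }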
diff --git a/compiler/Android.bp b/compiler/Android.bp index d57f301ff9..312fc7b35a 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -106,7 +106,9 @@ art_cc_defaults { "linker/arm/relative_patcher_arm_base.cc", "linker/arm/relative_patcher_thumb2.cc", "optimizing/code_generator_arm.cc", + "optimizing/code_generator_vector_arm.cc", "optimizing/code_generator_arm_vixl.cc", + "optimizing/code_generator_vector_arm_vixl.cc", "optimizing/dex_cache_array_fixups_arm.cc", "optimizing/instruction_simplifier_arm.cc", "optimizing/instruction_simplifier_shared.cc", @@ -126,6 +128,7 @@ art_cc_defaults { "jni/quick/arm64/calling_convention_arm64.cc", "linker/arm64/relative_patcher_arm64.cc", "optimizing/code_generator_arm64.cc", + "optimizing/code_generator_vector_arm64.cc", "optimizing/scheduler_arm64.cc", "optimizing/instruction_simplifier_arm64.cc", "optimizing/intrinsics_arm64.cc", @@ -139,6 +142,7 @@ art_cc_defaults { "jni/quick/mips/calling_convention_mips.cc", "linker/mips/relative_patcher_mips.cc", "optimizing/code_generator_mips.cc", + "optimizing/code_generator_vector_mips.cc", "optimizing/dex_cache_array_fixups_mips.cc", "optimizing/intrinsics_mips.cc", "optimizing/pc_relative_fixups_mips.cc", @@ -151,6 +155,7 @@ art_cc_defaults { "jni/quick/mips64/calling_convention_mips64.cc", "linker/mips64/relative_patcher_mips64.cc", "optimizing/code_generator_mips64.cc", + "optimizing/code_generator_vector_mips64.cc", "optimizing/intrinsics_mips64.cc", "utils/mips64/assembler_mips64.cc", "utils/mips64/managed_register_mips64.cc", @@ -162,6 +167,7 @@ art_cc_defaults { "linker/x86/relative_patcher_x86.cc", "linker/x86/relative_patcher_x86_base.cc", "optimizing/code_generator_x86.cc", + "optimizing/code_generator_vector_x86.cc", "optimizing/intrinsics_x86.cc", "optimizing/pc_relative_fixups_x86.cc", "optimizing/x86_memory_gen.cc", @@ -176,6 +182,7 @@ art_cc_defaults { "linker/x86_64/relative_patcher_x86_64.cc", "optimizing/intrinsics_x86_64.cc", "optimizing/code_generator_x86_64.cc", + "optimizing/code_generator_vector_x86_64.cc", "utils/x86_64/assembler_x86_64.cc", "utils/x86_64/jni_macro_assembler_x86_64.cc", "utils/x86_64/managed_register_x86_64.cc", @@ -391,6 +398,7 @@ art_cc_test { mips64: { srcs: [ "linker/mips64/relative_patcher_mips64_test.cc", + "utils/mips64/managed_register_mips64_test.cc", ], }, x86: { diff --git a/compiler/dex/dex_to_dex_compiler.cc b/compiler/dex/dex_to_dex_compiler.cc index 808e28c9ea..538fe93793 100644 --- a/compiler/dex/dex_to_dex_compiler.cc +++ b/compiler/dex/dex_to_dex_compiler.cc @@ -70,10 +70,6 @@ class DexCompiler { return *unit_.GetDexFile(); } - bool PerformOptimizations() const { - return dex_to_dex_compilation_level_ >= DexToDexCompilationLevel::kOptimize; - } - // Compiles a RETURN-VOID into a RETURN-VOID-BARRIER within a constructor where // a barrier is required. 
void CompileReturnVoid(Instruction* inst, uint32_t dex_pc); @@ -114,7 +110,7 @@ class DexCompiler { }; void DexCompiler::Compile() { - DCHECK_GE(dex_to_dex_compilation_level_, DexToDexCompilationLevel::kRequired); + DCHECK_EQ(dex_to_dex_compilation_level_, DexToDexCompilationLevel::kOptimize); const DexFile::CodeItem* code_item = unit_.GetCodeItem(); const uint16_t* insns = code_item->insns_; const uint32_t insns_size = code_item->insns_size_in_code_units_; @@ -221,7 +217,7 @@ void DexCompiler::CompileReturnVoid(Instruction* inst, uint32_t dex_pc) { } Instruction* DexCompiler::CompileCheckCast(Instruction* inst, uint32_t dex_pc) { - if (!kEnableCheckCastEllision || !PerformOptimizations()) { + if (!kEnableCheckCastEllision) { return inst; } if (!driver_.IsSafeCast(&unit_, dex_pc)) { @@ -254,7 +250,7 @@ void DexCompiler::CompileInstanceFieldAccess(Instruction* inst, uint32_t dex_pc, Instruction::Code new_opcode, bool is_put) { - if (!kEnableQuickening || !PerformOptimizations()) { + if (!kEnableQuickening) { return; } uint32_t field_idx = inst->VRegC_22c(); @@ -279,7 +275,7 @@ void DexCompiler::CompileInstanceFieldAccess(Instruction* inst, void DexCompiler::CompileInvokeVirtual(Instruction* inst, uint32_t dex_pc, Instruction::Code new_opcode, bool is_range) { - if (!kEnableQuickening || !PerformOptimizations()) { + if (!kEnableQuickening) { return; } uint32_t method_idx = is_range ? inst->VRegB_3rc() : inst->VRegB_35c(); diff --git a/compiler/dex/dex_to_dex_compiler.h b/compiler/dex/dex_to_dex_compiler.h index 00c596d60e..87ddb395ad 100644 --- a/compiler/dex/dex_to_dex_compiler.h +++ b/compiler/dex/dex_to_dex_compiler.h @@ -34,8 +34,7 @@ namespace optimizer { enum class DexToDexCompilationLevel { kDontDexToDexCompile, // Only meaning wrt image time interpretation. - kRequired, // Dex-to-dex compilation required for correctness. - kOptimize // Perform required transformation and peep-hole optimizations. + kOptimize // Perform peep-hole optimizations. }; std::ostream& operator<<(std::ostream& os, const DexToDexCompilationLevel& rhs); diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index 995098799c..e823f67d3c 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -532,16 +532,13 @@ static optimizer::DexToDexCompilationLevel GetDexToDexCompilationLevel( if (driver.GetCompilerOptions().GetDebuggable()) { // We are debuggable so definitions of classes might be changed. We don't want to do any // optimizations that could break that. - max_level = optimizer::DexToDexCompilationLevel::kRequired; + max_level = optimizer::DexToDexCompilationLevel::kDontDexToDexCompile; } if (klass->IsVerified()) { // Class is verified so we can enable DEX-to-DEX compilation for performance. return max_level; - } else if (klass->ShouldVerifyAtRuntime()) { - // Class verification has soft-failed. Anyway, ensure at least correctness. - return optimizer::DexToDexCompilationLevel::kRequired; } else { - // Class verification has failed: do not run DEX-to-DEX compilation. + // Class verification has failed: do not run DEX-to-DEX optimizations. return optimizer::DexToDexCompilationLevel::kDontDexToDexCompile; } } @@ -611,7 +608,7 @@ static void CompileMethod(Thread* self, dex_file, (verified_method != nullptr) ? 
dex_to_dex_compilation_level - : optimizer::DexToDexCompilationLevel::kRequired); + : optimizer::DexToDexCompilationLevel::kDontDexToDexCompile); } } else if ((access_flags & kAccNative) != 0) { // Are we extracting only and have support for generic JNI down calls? diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc index d156644484..d129249d63 100644 --- a/compiler/image_writer.cc +++ b/compiler/image_writer.cc @@ -1338,21 +1338,20 @@ mirror::Object* ImageWriter::TryAssignBinSlot(WorkStack& work_stack, // live. if (as_klass->ShouldHaveImt()) { ImTable* imt = as_klass->GetImt(target_ptr_size_); - for (size_t i = 0; i < ImTable::kSize; ++i) { - ArtMethod* imt_method = imt->Get(i, target_ptr_size_); - DCHECK(imt_method != nullptr); - if (imt_method->IsRuntimeMethod() && - !IsInBootImage(imt_method) && - !NativeRelocationAssigned(imt_method)) { - AssignMethodOffset(imt_method, kNativeObjectRelocationTypeRuntimeMethod, oat_index); + if (TryAssignImTableOffset(imt, oat_index)) { + // Since imt's can be shared only do this the first time to not double count imt method + // fixups. + for (size_t i = 0; i < ImTable::kSize; ++i) { + ArtMethod* imt_method = imt->Get(i, target_ptr_size_); + DCHECK(imt_method != nullptr); + if (imt_method->IsRuntimeMethod() && + !IsInBootImage(imt_method) && + !NativeRelocationAssigned(imt_method)) { + AssignMethodOffset(imt_method, kNativeObjectRelocationTypeRuntimeMethod, oat_index); + } } } } - - if (as_klass->ShouldHaveImt()) { - ImTable* imt = as_klass->GetImt(target_ptr_size_); - TryAssignImTableOffset(imt, oat_index); - } } else if (obj->IsClassLoader()) { // Register the class loader if it has a class table. // The fake boot class loader should not get registered and we should end up with only one @@ -1386,10 +1385,10 @@ bool ImageWriter::NativeRelocationAssigned(void* ptr) const { return native_object_relocations_.find(ptr) != native_object_relocations_.end(); } -void ImageWriter::TryAssignImTableOffset(ImTable* imt, size_t oat_index) { +bool ImageWriter::TryAssignImTableOffset(ImTable* imt, size_t oat_index) { // No offset, or already assigned. if (imt == nullptr || IsInBootImage(imt) || NativeRelocationAssigned(imt)) { - return; + return false; } // If the method is a conflict method we also want to assign the conflict table offset. ImageInfo& image_info = GetImageInfo(oat_index); @@ -1401,6 +1400,7 @@ void ImageWriter::TryAssignImTableOffset(ImTable* imt, size_t oat_index) { image_info.bin_slot_sizes_[kBinImTable], kNativeObjectRelocationTypeIMTable}); image_info.bin_slot_sizes_[kBinImTable] += size; + return true; } void ImageWriter::TryAssignConflictTableOffset(ImtConflictTable* table, size_t oat_index) { @@ -1499,8 +1499,7 @@ class ImageWriter::VisitReferencesVisitor { ALWAYS_INLINE void operator() (ObjPtr<mirror::Class> klass ATTRIBUTE_UNUSED, ObjPtr<mirror::Reference> ref) const REQUIRES_SHARED(Locks::mutator_lock_) { - ref->SetReferent</*kTransactionActive*/false>( - VisitReference(ref->GetReferent<kWithoutReadBarrier>())); + operator()(ref, mirror::Reference::ReferentOffset(), /* is_static */ false); } private: @@ -1658,7 +1657,7 @@ void ImageWriter::CalculateNewObjectOffsets() { // Calculate size of the dex cache arrays slot and prepare offsets. PrepareDexCacheArraySlots(); - // Calculate the sizes of the intern tables and class tables. + // Calculate the sizes of the intern tables, class tables, and fixup tables. for (ImageInfo& image_info : image_infos_) { // Calculate how big the intern table will be after being serialized. 
InternTable* const intern_table = image_info.intern_table_.get(); @@ -1666,6 +1665,7 @@ void ImageWriter::CalculateNewObjectOffsets() { if (intern_table->StrongSize() != 0u) { image_info.intern_table_bytes_ = intern_table->WriteToMemory(nullptr); } + // Calculate the size of the class table. ReaderMutexLock mu(self, *Locks::classlinker_classes_lock_); DCHECK_EQ(image_info.class_table_->NumReferencedZygoteClasses(), 0u); @@ -1718,8 +1718,6 @@ void ImageWriter::CalculateNewObjectOffsets() { // Transform each object's bin slot into an offset which will be used to do the final copy. heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this); - // DCHECK_EQ(image_end_, GetBinSizeSum(kBinMirrorCount) + image_objects_offset_begin_); - size_t i = 0; for (ImageInfo& image_info : image_infos_) { image_info.image_roots_address_ = PointerToLowMemUInt32(GetImageAddress(image_roots[i].Get())); @@ -1733,8 +1731,6 @@ void ImageWriter::CalculateNewObjectOffsets() { ImageInfo& image_info = GetImageInfo(relocation.oat_index); relocation.offset += image_info.bin_slot_offsets_[bin_type]; } - - // Note that image_info.image_end_ is left at end of used mirror object section. } size_t ImageWriter::ImageInfo::CreateImageSections(ImageSection* out_sections) const { @@ -1776,7 +1772,6 @@ size_t ImageWriter::ImageInfo::CreateImageSections(ImageSection* out_sections) c ImageSection* dex_cache_arrays_section = &out_sections[ImageHeader::kSectionDexCacheArrays]; *dex_cache_arrays_section = ImageSection(bin_slot_offsets_[kBinDexCacheArray], bin_slot_sizes_[kBinDexCacheArray]); - // Round up to the alignment the string table expects. See HashSet::WriteToMemory. size_t cur_pos = RoundUp(dex_cache_arrays_section->End(), sizeof(uint64_t)); // Calculate the size of the interned strings. 
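TryAssignImTableOffset now reports whether the IMT got a fresh relocation entry, and TryAssignBinSlot walks the table's runtime methods only in that case; since IMTs can be shared across classes, this keeps a shared table's method fixups from being counted twice. The pattern in isolation, with hypothetical names:

    #include <cstddef>
    #include <iostream>
    #include <unordered_map>

    // Each native object gets at most one relocation entry; TryAssign returns
    // true only on first insertion, so the caller can gate per-object
    // follow-up work on it.
    class RelocationMap {
     public:
      bool TryAssign(const void* ptr, size_t size) {
        auto result = offsets_.emplace(ptr, next_offset_);
        if (!result.second) {
          return false;  // Already assigned: a table shared with another class.
        }
        next_offset_ += size;
        return true;
      }

     private:
      std::unordered_map<const void*, size_t> offsets_;
      size_t next_offset_ = 0;
    };

    int main() {
      RelocationMap map;
      int shared_imt = 0;  // Stand-in for an ImTable reachable from two classes.
      std::cout << map.TryAssign(&shared_imt, 64);  // 1: first visit does the work.
      std::cout << map.TryAssign(&shared_imt, 64);  // 0: later visits skip it.
      std::cout << '\n';
    }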
@@ -1868,18 +1863,18 @@ class ImageWriter::FixupRootVisitor : public RootVisitor { explicit FixupRootVisitor(ImageWriter* image_writer) : image_writer_(image_writer) { } - void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) + void VisitRoots(mirror::Object*** roots ATTRIBUTE_UNUSED, + size_t count ATTRIBUTE_UNUSED, + const RootInfo& info ATTRIBUTE_UNUSED) OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_) { - for (size_t i = 0; i < count; ++i) { - *roots[i] = image_writer_->GetImageAddress(*roots[i]); - } + LOG(FATAL) << "Unsupported"; } void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_) { for (size_t i = 0; i < count; ++i) { - roots[i]->Assign(image_writer_->GetImageAddress(roots[i]->AsMirrorPtr())); + image_writer_->CopyReference(roots[i], roots[i]->AsMirrorPtr()); } } @@ -1890,7 +1885,9 @@ class ImageWriter::FixupRootVisitor : public RootVisitor { void ImageWriter::CopyAndFixupImTable(ImTable* orig, ImTable* copy) { for (size_t i = 0; i < ImTable::kSize; ++i) { ArtMethod* method = orig->Get(i, target_ptr_size_); - copy->Set(i, NativeLocationInImage(method), target_ptr_size_); + void** address = reinterpret_cast<void**>(copy->AddressOfElement(i, target_ptr_size_)); + CopyAndFixupPointer(address, method); + DCHECK_EQ(copy->Get(i, target_ptr_size_), NativeLocationInImage(method)); } } @@ -1899,10 +1896,13 @@ void ImageWriter::CopyAndFixupImtConflictTable(ImtConflictTable* orig, ImtConfli for (size_t i = 0; i < count; ++i) { ArtMethod* interface_method = orig->GetInterfaceMethod(i, target_ptr_size_); ArtMethod* implementation_method = orig->GetImplementationMethod(i, target_ptr_size_); - copy->SetInterfaceMethod(i, target_ptr_size_, NativeLocationInImage(interface_method)); - copy->SetImplementationMethod(i, - target_ptr_size_, - NativeLocationInImage(implementation_method)); + CopyAndFixupPointer(copy->AddressOfInterfaceMethod(i, target_ptr_size_), interface_method); + CopyAndFixupPointer(copy->AddressOfImplementationMethod(i, target_ptr_size_), + implementation_method); + DCHECK_EQ(copy->GetInterfaceMethod(i, target_ptr_size_), + NativeLocationInImage(interface_method)); + DCHECK_EQ(copy->GetImplementationMethod(i, target_ptr_size_), + NativeLocationInImage(implementation_method)); } } @@ -1921,8 +1921,9 @@ void ImageWriter::CopyAndFixupNativeData(size_t oat_index) { switch (relocation.type) { case kNativeObjectRelocationTypeArtField: { memcpy(dest, pair.first, sizeof(ArtField)); - reinterpret_cast<ArtField*>(dest)->SetDeclaringClass( - GetImageAddress(reinterpret_cast<ArtField*>(pair.first)->GetDeclaringClass().Ptr())); + CopyReference( + reinterpret_cast<ArtField*>(dest)->GetDeclaringClassAddressWithoutBarrier(), + reinterpret_cast<ArtField*>(pair.first)->GetDeclaringClass().Ptr()); break; } case kNativeObjectRelocationTypeRuntimeMethod: @@ -2039,8 +2040,10 @@ void ImageWriter::CopyAndFixupObjectsCallback(Object* obj, void* arg) { reinterpret_cast<ImageWriter*>(arg)->CopyAndFixupObject(obj); } -void ImageWriter::FixupPointerArray(mirror::Object* dst, mirror::PointerArray* arr, - mirror::Class* klass, Bin array_type) { +void ImageWriter::FixupPointerArray(mirror::Object* dst, + mirror::PointerArray* arr, + mirror::Class* klass, + Bin array_type) { CHECK(klass->IsArrayClass()); CHECK(arr->IsIntArray() || arr->IsLongArray()) << klass->PrettyClass() << " " << arr; // Fixup int and long pointers for the ArtMethod or ArtField arrays. 
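The hunks that follow, together with the new helpers at the end of the image_writer.cc diff, funnel every reference and native pointer write through CopyReference and CopyAndFixupPointer, so relocation and target pointer width are handled in one place. The width-dispatched store at the heart of CopyAndFixupPointer, as a portable sketch (the real code stores through reinterpret_cast rather than memcpy):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    enum class PointerSize { k32 = 4, k64 = 8 };

    // Writes a relocated pointer value with the *target* pointer width, which
    // may differ from the host's: 32-bit targets get a zero-extended low-mem
    // value, 64-bit targets the full address.
    void StoreTargetPointer(void* target, uint64_t relocated, PointerSize size) {
      if (size == PointerSize::k32) {
        uint32_t value32 = static_cast<uint32_t>(relocated);
        std::memcpy(target, &value32, sizeof(value32));
      } else {
        std::memcpy(target, &relocated, sizeof(relocated));
      }
    }

    int main() {
      uint32_t slot = 0;
      StoreTargetPointer(&slot, 0x1000, PointerSize::k32);
      std::cout << std::hex << slot << '\n';  // 1000
    }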
@@ -2049,7 +2052,7 @@ void ImageWriter::FixupPointerArray(mirror::Object* dst, mirror::PointerArray* a auto* dest_array = down_cast<mirror::PointerArray*>(dst); for (size_t i = 0, count = num_elements; i < count; ++i) { void* elem = arr->GetElementPtrSize<void*>(i, target_ptr_size_); - if (elem != nullptr && !IsInBootImage(elem)) { + if (kIsDebugBuild && elem != nullptr && !IsInBootImage(elem)) { auto it = native_object_relocations_.find(elem); if (UNLIKELY(it == native_object_relocations_.end())) { if (it->second.IsArtMethodRelocation()) { @@ -2065,12 +2068,9 @@ void ImageWriter::FixupPointerArray(mirror::Object* dst, mirror::PointerArray* a << Class::PrettyClass(field->GetDeclaringClass()); } UNREACHABLE(); - } else { - ImageInfo& image_info = GetImageInfo(it->second.oat_index); - elem = image_info.image_begin_ + it->second.offset; } } - dest_array->SetElementPtrSize<false, true>(i, elem, target_ptr_size_); + CopyAndFixupPointer(dest_array->ElementAddress(i, target_ptr_size_), elem); } } @@ -2118,22 +2118,19 @@ class ImageWriter::FixupVisitor { void operator()(ObjPtr<Object> obj, MemberOffset offset, bool is_static ATTRIBUTE_UNUSED) const - REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_) { + REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) { ObjPtr<Object> ref = obj->GetFieldObject<Object, kVerifyNone>(offset); - // Use SetFieldObjectWithoutWriteBarrier to avoid card marking since we are writing to the - // image. - copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>( - offset, - image_writer_->GetImageAddress(ref.Ptr())); + // Copy the reference and record the fixup if necessary. + image_writer_->CopyReference( + copy_->GetFieldObjectReferenceAddr<kVerifyNone>(offset), + ref.Ptr()); } // java.lang.ref.Reference visitor. 
void operator()(ObjPtr<mirror::Class> klass ATTRIBUTE_UNUSED, ObjPtr<mirror::Reference> ref) const REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) { - copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>( - mirror::Reference::ReferentOffset(), - image_writer_->GetImageAddress(ref->GetReferent())); + operator()(ref, mirror::Reference::ReferentOffset(), /* is_static */ false); } protected: @@ -2211,7 +2208,10 @@ class ImageWriter::NativeLocationVisitor { explicit NativeLocationVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {} template <typename T> - T* operator()(T* ptr) const REQUIRES_SHARED(Locks::mutator_lock_) { + T* operator()(T* ptr, void** dest_addr = nullptr) const REQUIRES_SHARED(Locks::mutator_lock_) { + if (dest_addr != nullptr) { + image_writer_->CopyAndFixupPointer(dest_addr, ptr); + } return image_writer_->NativeLocationInImage(ptr); } @@ -2274,10 +2274,10 @@ void ImageWriter::FixupObject(Object* orig, Object* copy) { } } - -class ImageAddressVisitor { +class ImageWriter::ImageAddressVisitorForDexCacheArray { public: - explicit ImageAddressVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {} + explicit ImageAddressVisitorForDexCacheArray(ImageWriter* image_writer) + : image_writer_(image_writer) {} template <typename T> T* operator()(T* ptr) const REQUIRES_SHARED(Locks::mutator_lock_) { @@ -2288,9 +2288,9 @@ class ImageAddressVisitor { ImageWriter* const image_writer_; }; - void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, mirror::DexCache* copy_dex_cache) { + ImageAddressVisitorForDexCacheArray fixup_visitor(this); // Though the DexCache array fields are usually treated as native pointers, we set the full // 64-bit values here, clearing the top 32 bits for 32-bit targets. The zero-extension is // done by casting to the unsigned type uintptr_t before casting to int64_t, i.e. 
@@ -2300,8 +2300,7 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::StringsOffset(), NativeLocationInImage(orig_strings), PointerSize::k64); - orig_dex_cache->FixupStrings(NativeCopyLocation(orig_strings, orig_dex_cache), - ImageAddressVisitor(this)); + orig_dex_cache->FixupStrings(NativeCopyLocation(orig_strings, orig_dex_cache), fixup_visitor); } mirror::TypeDexCacheType* orig_types = orig_dex_cache->GetResolvedTypes(); if (orig_types != nullptr) { @@ -2309,7 +2308,7 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, NativeLocationInImage(orig_types), PointerSize::k64); orig_dex_cache->FixupResolvedTypes(NativeCopyLocation(orig_types, orig_dex_cache), - ImageAddressVisitor(this)); + fixup_visitor); } ArtMethod** orig_methods = orig_dex_cache->GetResolvedMethods(); if (orig_methods != nullptr) { @@ -2333,7 +2332,8 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, for (size_t i = 0, num = orig_dex_cache->NumResolvedFields(); i != num; ++i) { mirror::FieldDexCachePair orig = mirror::DexCache::GetNativePairPtrSize(orig_fields, i, target_ptr_size_); - mirror::FieldDexCachePair copy(NativeLocationInImage(orig.object), orig.index); + mirror::FieldDexCachePair copy = orig; + copy.object = NativeLocationInImage(orig.object); mirror::DexCache::SetNativePairPtrSize(copy_fields, i, copy, target_ptr_size_); } } @@ -2343,7 +2343,7 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, NativeLocationInImage(orig_method_types), PointerSize::k64); orig_dex_cache->FixupResolvedMethodTypes(NativeCopyLocation(orig_method_types, orig_dex_cache), - ImageAddressVisitor(this)); + fixup_visitor); } GcRoot<mirror::CallSite>* orig_call_sites = orig_dex_cache->GetResolvedCallSites(); if (orig_call_sites != nullptr) { @@ -2351,7 +2351,7 @@ void ImageWriter::FixupDexCache(mirror::DexCache* orig_dex_cache, NativeLocationInImage(orig_call_sites), PointerSize::k64); orig_dex_cache->FixupResolvedCallSites(NativeCopyLocation(orig_call_sites, orig_dex_cache), - ImageAddressVisitor(this)); + fixup_visitor); } // Remove the DexFile pointers. They will be fixed up when the runtime loads the oat file. 
Leaving @@ -2459,7 +2459,8 @@ void ImageWriter::CopyAndFixupMethod(ArtMethod* orig, memcpy(copy, orig, ArtMethod::Size(target_ptr_size_)); - copy->SetDeclaringClass(GetImageAddress(orig->GetDeclaringClassUnchecked())); + CopyReference(copy->GetDeclaringClassAddressWithoutBarrier(), orig->GetDeclaringClassUnchecked()); + ArtMethod** orig_resolved_methods = orig->GetDexCacheResolvedMethods(target_ptr_size_); copy->SetDexCacheResolvedMethods(NativeLocationInImage(orig_resolved_methods), target_ptr_size_); @@ -2571,7 +2572,7 @@ size_t ImageWriter::GetOatIndex(mirror::Object* obj) const { return GetDefaultOatIndex(); } auto it = oat_index_map_.find(obj); - DCHECK(it != oat_index_map_.end()); + DCHECK(it != oat_index_map_.end()) << obj; return it->second; } @@ -2672,4 +2673,31 @@ ImageWriter::ImageInfo::ImageInfo() : intern_table_(new InternTable), class_table_(new ClassTable) {} +void ImageWriter::CopyReference(mirror::HeapReference<mirror::Object>* dest, + ObjPtr<mirror::Object> src) { + dest->Assign(GetImageAddress(src.Ptr())); +} + +void ImageWriter::CopyReference(mirror::CompressedReference<mirror::Object>* dest, + ObjPtr<mirror::Object> src) { + dest->Assign(GetImageAddress(src.Ptr())); +} + +void ImageWriter::CopyAndFixupPointer(void** target, void* value) { + void* new_value = value; + if (value != nullptr && !IsInBootImage(value)) { + auto it = native_object_relocations_.find(value); + CHECK(it != native_object_relocations_.end()) << value; + const NativeObjectRelocation& relocation = it->second; + ImageInfo& image_info = GetImageInfo(relocation.oat_index); + new_value = reinterpret_cast<void*>(image_info.image_begin_ + relocation.offset); + } + if (target_ptr_size_ == PointerSize::k32) { + *reinterpret_cast<uint32_t*>(target) = PointerToLowMemUInt32(new_value); + } else { + *reinterpret_cast<uint64_t*>(target) = reinterpret_cast<uintptr_t>(new_value); + } +} + + } // namespace art diff --git a/compiler/image_writer.h b/compiler/image_writer.h index 16aff61dab..39113c8143 100644 --- a/compiler/image_writer.h +++ b/compiler/image_writer.h @@ -38,8 +38,9 @@ #include "image.h" #include "lock_word.h" #include "mem_map.h" -#include "oat_file.h" #include "mirror/dex_cache.h" +#include "obj_ptr.h" +#include "oat_file.h" #include "os.h" #include "safe_map.h" #include "utils.h" @@ -317,6 +318,12 @@ class ImageWriter FINAL { // Number of image class table bytes. size_t class_table_bytes_ = 0; + // Number of object fixup bytes. + size_t object_fixup_bytes_ = 0; + + // Number of pointer fixup bytes. + size_t pointer_fixup_bytes_ = 0; + // Intern table associated with this image for serialization. std::unique_ptr<InternTable> intern_table_; @@ -464,7 +471,8 @@ class ImageWriter FINAL { size_t oat_index) REQUIRES_SHARED(Locks::mutator_lock_); - void TryAssignImTableOffset(ImTable* imt, size_t oat_index) REQUIRES_SHARED(Locks::mutator_lock_); + // Return true if imt was newly inserted. + bool TryAssignImTableOffset(ImTable* imt, size_t oat_index) REQUIRES_SHARED(Locks::mutator_lock_); // Assign the offset for an IMT conflict table. Does nothing if the table already has a native // relocation. @@ -534,6 +542,14 @@ class ImageWriter FINAL { // Return true if there already exists a native allocation for an object. 
bool NativeRelocationAssigned(void* ptr) const; + void CopyReference(mirror::HeapReference<mirror::Object>* dest, ObjPtr<mirror::Object> src) + REQUIRES_SHARED(Locks::mutator_lock_); + + void CopyReference(mirror::CompressedReference<mirror::Object>* dest, ObjPtr<mirror::Object> src) + REQUIRES_SHARED(Locks::mutator_lock_); + + void CopyAndFixupPointer(void** target, void* value); + const CompilerDriver& compiler_driver_; // Beginning target image address for the first image. @@ -608,9 +624,11 @@ class ImageWriter FINAL { class FixupRootVisitor; class FixupVisitor; class GetRootsVisitor; + class ImageAddressVisitorForDexCacheArray; class NativeLocationVisitor; class PruneClassesVisitor; class PruneClassLoaderClassesVisitor; + class RegisterBootClassPathClassesVisitor; class VisitReferencesVisitor; DISALLOW_COPY_AND_ASSIGN(ImageWriter); diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index 2ee4db923a..476906a768 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -528,7 +528,8 @@ class BCEVisitor : public HGraphVisitor { has_dom_based_dynamic_bce_(false), initial_block_size_(graph->GetBlocks().size()), side_effects_(side_effects), - induction_range_(induction_analysis) {} + induction_range_(induction_analysis), + next_(nullptr) {} void VisitBasicBlock(HBasicBlock* block) OVERRIDE { DCHECK(!IsAddedBlock(block)); @@ -1618,8 +1619,8 @@ class BCEVisitor : public HGraphVisitor { void InsertDeoptInLoop(HLoopInformation* loop, HBasicBlock* block, HInstruction* condition) { HInstruction* suspend = loop->GetSuspendCheck(); block->InsertInstructionBefore(condition, block->GetLastInstruction()); - HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(condition, suspend->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), condition, HDeoptimize::Kind::kBCE, suspend->GetDexPc()); block->InsertInstructionBefore(deoptimize, block->GetLastInstruction()); if (suspend->HasEnvironment()) { deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( @@ -1631,8 +1632,8 @@ class BCEVisitor : public HGraphVisitor { void InsertDeoptInBlock(HBoundsCheck* bounds_check, HInstruction* condition) { HBasicBlock* block = bounds_check->GetBlock(); block->InsertInstructionBefore(condition, bounds_check); - HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(condition, bounds_check->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), condition, HDeoptimize::Kind::kBCE, bounds_check->GetDexPc()); block->InsertInstructionBefore(deoptimize, bounds_check); deoptimize->CopyEnvironmentFrom(bounds_check->GetEnvironment()); } diff --git a/compiler/optimizing/cha_guard_optimization.cc b/compiler/optimizing/cha_guard_optimization.cc index fe423012ca..048073e37a 100644 --- a/compiler/optimizing/cha_guard_optimization.cc +++ b/compiler/optimizing/cha_guard_optimization.cc @@ -36,7 +36,8 @@ class CHAGuardVisitor : HGraphVisitor { : HGraphVisitor(graph), block_has_cha_guard_(GetGraph()->GetBlocks().size(), 0, - graph->GetArena()->Adapter(kArenaAllocCHA)) { + graph->GetArena()->Adapter(kArenaAllocCHA)), + instruction_iterator_(nullptr) { number_of_guards_to_visit_ = GetGraph()->GetNumberOfCHAGuards(); DCHECK_NE(number_of_guards_to_visit_, 0u); // Will recount number of guards during guard optimization. 
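Both deoptimization sites above now construct HDeoptimize with the arena plus an explicit kind (kBCE here, kInline in the CHA-guard hunk that follows), so every guard records which optimization introduced it. A hypothetical reduction of that constructor shape:

    #include <cstdint>
    #include <iostream>

    struct Deoptimize {
      enum class Kind { kBCE, kInline };
      Deoptimize(Kind kind, uint32_t dex_pc) : kind(kind), dex_pc(dex_pc) {}
      const Kind kind;        // Which optimization created this guard.
      const uint32_t dex_pc;  // Where to resume interpretation on deopt.
    };

    int main() {
      Deoptimize deopt(Deoptimize::Kind::kBCE, 42);
      std::cout << "kind=" << (deopt.kind == Deoptimize::Kind::kBCE ? "BCE" : "inline")
                << " dex_pc=" << deopt.dex_pc << '\n';
    }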
@@ -201,8 +202,8 @@ bool CHAGuardVisitor::HoistGuard(HShouldDeoptimizeFlag* flag, HInstruction* suspend = loop_info->GetSuspendCheck(); // Need a new deoptimize instruction that copies the environment // of the suspend instruction for the loop. - HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(compare, suspend->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), compare, HDeoptimize::Kind::kInline, suspend->GetDexPc()); pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction()); deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( suspend->GetEnvironment(), loop_info->GetHeader()); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index d735b27090..d7cc577580 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -1134,7 +1134,7 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCodeARM { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 28cc942dfb..d463830ff6 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -1150,7 +1150,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet @@ -3281,7 +3281,7 @@ void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperati void InstructionCodeGeneratorARM64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); - DCHECK(type == Primitive::kPrimInt || Primitive::kPrimLong); + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); LocationSummary* locations = instruction->GetLocations(); Register out = OutputRegister(instruction); diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 7471cd5f12..10d8b841f8 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -318,6 +318,11 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); + vixl::aarch64::MemOperand CreateVecMemRegisters( + HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load); + Arm64Assembler* const assembler_; CodeGeneratorARM64* const codegen_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index a1c3da9e9c..cce412b314 100644 --- 
a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -1175,7 +1175,7 @@ class ReadBarrierForHeapReferenceSlowPathARMVIXL : public SlowPathCodeARMVIXL { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 5f02a52417..287891feae 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -461,6 +461,536 @@ class DeoptimizationSlowPathMIPS : public SlowPathCodeMIPS { DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS); }; +class ArraySetSlowPathMIPS : public SlowPathCodeMIPS { + public: + explicit ArraySetSlowPathMIPS(HInstruction* instruction) : SlowPathCodeMIPS(instruction) {} + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove( + locations->InAt(0), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove( + locations->InAt(1), + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimInt, + nullptr); + parallel_move.AddMove( + locations->InAt(2), + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimNot, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + mips_codegen->InvokeRuntime(kQuickAputObject, instruction_, instruction_->GetDexPc(), this); + CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ArraySetSlowPathMIPS"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathMIPS); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking (see +// ReadBarrierMarkAndUpdateFieldSlowPathMIPS below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. The case where the entrypoint is passed in +// is for the GcRoot read barrier. 
+class ReadBarrierMarkSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierMarkSlowPathMIPS(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeMIPS(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathMIPS"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T7) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == FP)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + mips_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + DCHECK_EQ(entrypoint_.AsRegister<Register>(), T9); + __ Jalr(entrypoint_.AsRegister<Register>()); + __ NopIfNoReordering(); + } else { + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this, + /* direct */ false); + } + __ B(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if already loaded. + const Location entrypoint_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathMIPS); +}; + +// Slow path marking an object reference `ref` during a read barrier, +// and if needed, atomically updating the field `obj.field` in the +// object `obj` holding this reference after marking (contrary to +// ReadBarrierMarkSlowPathMIPS above, which never tries to update +// `obj.field`). +// +// This means that after the execution of this slow path, both `ref` +// and `obj.field` will be up-to-date; i.e., after the flip, both will +// hold the same to-space reference (unless another thread installed +// another object reference (different from `ref`) in `obj.field`). 
+class ReadBarrierMarkAndUpdateFieldSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierMarkAndUpdateFieldSlowPathMIPS(HInstruction* instruction, + Location ref, + Register obj, + Location field_offset, + Register temp1) + : SlowPathCodeMIPS(instruction), + ref_(ref), + obj_(obj), + field_offset_(field_offset), + temp1_(temp1) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "ReadBarrierMarkAndUpdateFieldSlowPathMIPS"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking and field updating slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); + DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + + __ Bind(GetEntryLabel()); + + // Save the old reference. + // Note that we cannot use AT or TMP to save the old reference, as those + // are used by the code that follows, but we need the old reference after + // the call to the ReadBarrierMarkRegX entry point. + DCHECK_NE(temp1_, AT); + DCHECK_NE(temp1_, TMP); + __ Move(temp1_, ref_reg); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T7) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == FP)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this, + /* direct */ false); + + // If the new reference is different from the old reference, + // update the field in the holder (`*(obj_ + field_offset_)`). + // + // Note that this field could also hold a different object, if + // another thread had concurrently changed it. In that case, the + // the compare-and-set (CAS) loop below would abort, leaving the + // field as-is. + MipsLabel done; + __ Beq(temp1_, ref_reg, &done); + + // Update the the holder's field atomically. This may fail if + // mutator updates before us, but it's OK. This is achieved + // using a strong compare-and-set (CAS) operation with relaxed + // memory synchronization ordering, where the expected value is + // the old reference and the desired value is the new reference. + + // Convenience aliases. 
+ Register base = obj_; + // The UnsafeCASObject intrinsic uses a register pair as field + // offset ("long offset"), of which only the low part contains + // data. + Register offset = field_offset_.AsRegisterPairLow<Register>(); + Register expected = temp1_; + Register value = ref_reg; + Register tmp_ptr = TMP; // Pointer to actual memory. + Register tmp = AT; // Value in memory. + + __ Addu(tmp_ptr, base, offset); + + if (kPoisonHeapReferences) { + __ PoisonHeapReference(expected); + // Do not poison `value` if it is the same register as + // `expected`, which has just been poisoned. + if (value != expected) { + __ PoisonHeapReference(value); + } + } + + // do { + // tmp = [r_ptr] - expected; + // } while (tmp == 0 && failure([r_ptr] <- r_new_value)); + + bool is_r6 = mips_codegen->GetInstructionSetFeatures().IsR6(); + MipsLabel loop_head, exit_loop; + __ Bind(&loop_head); + if (is_r6) { + __ LlR6(tmp, tmp_ptr); + } else { + __ LlR2(tmp, tmp_ptr); + } + __ Bne(tmp, expected, &exit_loop); + __ Move(tmp, value); + if (is_r6) { + __ ScR6(tmp, tmp_ptr); + } else { + __ ScR2(tmp, tmp_ptr); + } + __ Beqz(tmp, &loop_head); + __ Bind(&exit_loop); + + if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value` if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } + + __ Bind(&done); + __ B(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + // The register containing the object holding the marked object reference field. + const Register obj_; + // The location of the offset of the marked reference field within `obj_`. + Location field_offset_; + + const Register temp1_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathMIPS); +}; + +// Slow path generating a read barrier for a heap reference. +class ReadBarrierForHeapReferenceSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierForHeapReferenceSlowPathMIPS(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) + : SlowPathCodeMIPS(instruction), + out_(out), + ref_(ref), + obj_(obj), + offset_(offset), + index_(index) { + DCHECK(kEmitCompilerReadBarrier); + // If `obj` is equal to `out` or `ref`, it means the initial object + // has been overwritten by (or after) the heap object reference load + // to be instrumented, e.g.: + // + // __ LoadFromOffset(kLoadWord, out, out, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); + // + // In that case, we have lost the information about the original + // object, and the emitted read barrier cannot work properly. 
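The ll/sc loop above amounts to a strong compare-and-set with relaxed memory ordering: it retries on spurious sc failure and gives up as soon as the loaded value no longer equals the old reference. The same operation expressed with std::atomic (a sketch only; the real code targets a raw heap slot and handles poisoned references):

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    // Swing `field` from the pre-marking reference to the marked one, but only
    // if no other thread changed the field in the meantime.
    bool UpdateFieldIfUnchanged(std::atomic<uint32_t>* field,
                                uint32_t old_ref,
                                uint32_t new_ref) {
      uint32_t expected = old_ref;
      return field->compare_exchange_strong(expected, new_ref,
                                            std::memory_order_relaxed);
    }

    int main() {
      std::atomic<uint32_t> field{0x100};
      std::cout << UpdateFieldIfUnchanged(&field, 0x100, 0x200);  // 1: updated.
      std::cout << UpdateFieldIfUnchanged(&field, 0x100, 0x300);  // 0: field already moved.
      std::cout << '\n';
    }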
+ DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out; + DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + Register reg_out = out_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + // We may have to change the index's value, but as `index_` is a + // constant member (like other "inputs" of this slow path), + // introduce a copy of it, `index`. + Location index = index_; + if (index_.IsValid()) { + // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics. + if (instruction_->IsArrayGet()) { + // Compute the actual memory offset and store it in `index`. + Register index_reg = index_.AsRegister<Register>(); + DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg)); + if (codegen->IsCoreCalleeSaveRegister(index_reg)) { + // We are about to change the value of `index_reg` (see the + // calls to art::mips::MipsAssembler::Sll and + // art::mips::MipsAssembler::Addiu32 below), but it has + // not been saved by the previous call to + // art::SlowPathCode::SaveLiveRegisters, as it is a + // callee-save register -- + // art::SlowPathCode::SaveLiveRegisters does not consider + // callee-save registers, as it has been designed with the + // assumption that callee-save registers are supposed to be + // handled by the called function. So, as a callee-save + // register, `index_reg` _would_ eventually be saved onto + // the stack, but it would be too late: we would have + // changed its value earlier. Therefore, we manually save + // it here into another freely available register, + // `free_reg`, chosen of course among the caller-save + // registers (as a callee-save `free_reg` register would + // exhibit the same problem). + // + // Note we could have requested a temporary register from + // the register allocator instead; but we prefer not to, as + // this is a slow path, and we know we can find a + // caller-save register that is available. + Register free_reg = FindAvailableCallerSaveRegister(codegen); + __ Move(free_reg, index_reg); + index_reg = free_reg; + index = Location::RegisterLocation(index_reg); + } else { + // The initial register stored in `index_` has already been + // saved in the call to art::SlowPathCode::SaveLiveRegisters + // (as it is not a callee-save register), so we can freely + // use it. + } + // Shifting the index value contained in `index_reg` by the scale + // factor (2) cannot overflow in practice, as the runtime is + // unable to allocate object arrays with a size larger than + // 2^26 - 1 (that is, 2^28 - 4 bytes). 
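The instructions that follow fold the index register into a byte offset in place, index_reg = (index_reg << 2) + offset_, and the comment's no-overflow argument can be checked numerically (the data offset below is illustrative):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint32_t kMaxElements = (1u << 26) - 1;  // Runtime object-array size limit.
      const uint32_t kDataOffset = 12;               // Illustrative array header offset.
      // TIMES_4 scaling is a left shift by 2 (4-byte heap references).
      uint64_t max_byte_offset =
          (static_cast<uint64_t>(kMaxElements) << 2) + kDataOffset;
      std::cout << max_byte_offset << " fits in int32: "
                << (max_byte_offset < (1u << 31)) << '\n';  // prints "... fits in int32: 1"
    }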
+ __ Sll(index_reg, index_reg, TIMES_4); + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + __ Addiu32(index_reg, index_reg, offset_); + } else { + // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile + // intrinsics, `index_` is not shifted by a scale factor of 2 + // (as in the case of ArrayGet), as it is actually an offset + // to an object field within an object. + DCHECK(instruction_->IsInvoke()) << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) || + (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)) + << instruction_->AsInvoke()->GetIntrinsic(); + DCHECK_EQ(offset_, 0U); + DCHECK(index_.IsRegisterPair()); + // UnsafeGet's offset location is a register pair, the low + // part contains the correct offset. + index = index_.ToLow(); + } + } + + // We're moving two or three locations to locations that could + // overlap, so we need a parallel move resolver. + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove(ref_, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove(obj_, + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimNot, + nullptr); + if (index.IsValid()) { + parallel_move.AddMove(index, + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimInt, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + } else { + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + __ LoadConst32(calling_convention.GetRegisterAt(2), offset_); + } + mips_codegen->InvokeRuntime(kQuickReadBarrierSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes< + kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>(); + mips_codegen->Move32(out_, calling_convention.GetReturnLocation(Primitive::kPrimNot)); + + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathMIPS"; } + + private: + Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) { + size_t ref = static_cast<int>(ref_.AsRegister<Register>()); + size_t obj = static_cast<int>(obj_.AsRegister<Register>()); + for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) { + if (i != ref && + i != obj && + !codegen->IsCoreCalleeSaveRegister(i) && + !codegen->IsBlockedCoreRegister(i)) { + return static_cast<Register>(i); + } + } + // We shall never fail to find a free caller-save register, as + // there are more than two core caller-save registers on MIPS + // (meaning it is possible to find one which is different from + // `ref` and `obj`). + DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u); + LOG(FATAL) << "Could not find a free caller-save register"; + UNREACHABLE(); + } + + const Location out_; + const Location ref_; + const Location obj_; + const uint32_t offset_; + // An additional location containing an index to an array. + // Only used for HArrayGet and the UnsafeGetObject & + // UnsafeGetObjectVolatile intrinsics. 
+ const Location index_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathMIPS); +}; + +// Slow path generating a read barrier for a GC root. +class ReadBarrierForRootSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierForRootSlowPathMIPS(HInstruction* instruction, Location out, Location root) + : SlowPathCodeMIPS(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register reg_out = out_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + mips_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_); + mips_codegen->InvokeRuntime(kQuickReadBarrierForRootSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>(); + mips_codegen->Move32(out_, calling_convention.GetReturnLocation(Primitive::kPrimNot)); + + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathMIPS"; } + + private: + const Location out_; + const Location root_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathMIPS); +}; + CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, const MipsInstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -1310,10 +1840,26 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, uint32_t dex_pc, SlowPathCode* slow_path) { ValidateInvokeRuntime(entrypoint, instruction, slow_path); + GenerateInvokeRuntime(GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value(), + IsDirectEntrypoint(entrypoint)); + if (EntrypointRequiresStackMap(entrypoint)) { + RecordPcInfo(instruction, dex_pc, slow_path); + } +} + +void CodeGeneratorMIPS::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path, + bool direct) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + GenerateInvokeRuntime(entry_point_offset, direct); +} + +void CodeGeneratorMIPS::GenerateInvokeRuntime(int32_t entry_point_offset, bool direct) { bool reordering = __ SetReorder(false); - __ LoadFromOffset(kLoadWord, T9, TR, GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value()); + __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset); __ Jalr(T9); - if (IsDirectEntrypoint(entrypoint)) { + if (direct) { // Reserve argument space on stack (for $a0-$a3) for // entrypoints that directly reference native implementations. // Called function may use this space to store $a0-$a3 regs. @@ -1323,9 +1869,6 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, __ Nop(); // In delay slot. 
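The InvokeRuntime refactor above extracts the call emission into GenerateInvokeRuntime so that the new slow paths can reach an entrypoint by raw thread offset via InvokeRuntimeWithoutRecordingPcInfo, skipping the stack map that ordinary runtime calls must record. Reduced to plain functions (function names as in the diff, bodies purely illustrative):

    #include <cstdint>
    #include <iostream>

    // Shared emission path: load the entrypoint from the thread register, call it.
    static void GenerateInvokeRuntime(int32_t entry_point_offset, bool direct) {
      std::cout << "lw T9, " << entry_point_offset << "(TR); jalr T9"
                << (direct ? " (direct entrypoint)" : "") << '\n';
    }

    // Ordinary runtime call: may need a stack map recorded after the call.
    void InvokeRuntime(int32_t offset, bool direct, bool requires_stack_map) {
      GenerateInvokeRuntime(offset, direct);
      if (requires_stack_map) {
        std::cout << "RecordPcInfo\n";
      }
    }

    // Read-barrier style call: never records a stack map.
    void InvokeRuntimeWithoutRecordingPcInfo(int32_t offset, bool direct) {
      GenerateInvokeRuntime(offset, direct);
    }

    int main() {
      InvokeRuntime(0x1a0, /* direct */ false, /* requires_stack_map */ true);
      InvokeRuntimeWithoutRecordingPcInfo(0x2b0, /* direct */ false);
    }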
} __ SetReorder(reordering); - if (EntrypointRequiresStackMap(entrypoint)) { - RecordPcInfo(instruction, dex_pc, slow_path); - } } void InstructionCodeGeneratorMIPS::GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path, @@ -1885,14 +2428,31 @@ void InstructionCodeGeneratorMIPS::VisitAnd(HAnd* instruction) { } void LocationsBuilderMIPS::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + bool object_array_get_with_read_barrier = + kEmitCompilerReadBarrier && (type == Primitive::kPrimNot); LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, + object_array_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->GetType())) { + if (Primitive::IsFloatingPointType(type)) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object array get with + // read barriers enabled: we do not want the move to overwrite the + // array's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_array_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier. + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->AddTemp(Location::RequiresRegister()); } } @@ -1905,7 +2465,9 @@ static auto GetImplicitNullChecker(HInstruction* instruction, CodeGeneratorMIPS* void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); + Location out_loc = locations->Out(); Location index = locations->InAt(1); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -1915,7 +2477,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { instruction->IsStringCharAt(); switch (type) { case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1928,7 +2490,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1941,7 +2503,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; 
@@ -1955,7 +2517,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (maybe_compressed_char_at) { uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); __ LoadFromOffset(kLoadWord, TMP, obj, count_offset, null_checker); @@ -2008,10 +2570,9 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { break; } - case Primitive::kPrimInt: - case Primitive::kPrimNot: { + case Primitive::kPrimInt: { DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t)); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -2024,8 +2585,53 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { break; } + case Primitive::kPrimNot: { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ out = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier call. + codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + index, + temp, + /* needs_null_check */ true); + } else { + Register out = out_loc.AsRegister<Register>(); + if (index.IsConstant()) { + size_t offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; + __ LoadFromOffset(kLoadWord, out, obj, offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); + } else { + __ Sll(TMP, index.AsRegister<Register>(), TIMES_4); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadWord, out, TMP, data_offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). 
+ codegen_->MaybeGenerateReadBarrierSlow(instruction, + out_loc, + out_loc, + obj_loc, + data_offset, + index); + } + } + break; + } + case Primitive::kPrimLong: { - Register out = locations->Out().AsRegisterPairLow<Register>(); + Register out = out_loc.AsRegisterPairLow<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -2039,7 +2645,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimFloat: { - FRegister out = locations->Out().AsFpuRegister<FRegister>(); + FRegister out = out_loc.AsFpuRegister<FRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -2053,7 +2659,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimDouble: { - FRegister out = locations->Out().AsFpuRegister<FRegister>(); + FRegister out = out_loc.AsFpuRegister<FRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -2070,11 +2676,6 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { LOG(FATAL) << "Unreachable type " << instruction->GetType(); UNREACHABLE(); } - - if (type == Primitive::kPrimNot) { - Register out = locations->Out().AsRegister<Register>(); - __ MaybeUnpoisonHeapReference(out); - } } void LocationsBuilderMIPS::VisitArrayLength(HArrayLength* instruction) { @@ -2116,23 +2717,28 @@ Location LocationsBuilderMIPS::FpuRegisterOrConstantForStore(HInstruction* instr } void LocationsBuilderMIPS::VisitArraySet(HArraySet* instruction) { - bool needs_runtime_call = instruction->NeedsTypeCheck(); + Primitive::Type value_type = instruction->GetComponentType(); + + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); - if (needs_runtime_call) { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); + may_need_runtime_call_for_type_check ? + LocationSummary::kCallOnSlowPath : + LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { + locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); } else { - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { - locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); - } else { - locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); - } + locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); + } + if (needs_write_barrier) { + // Temporary register for the write barrier. 
+ locations->AddTemp(Location::RequiresRegister()); // Possibly used for ref. poisoning too. } } @@ -2142,7 +2748,7 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { Location index = locations->InAt(1); Location value_location = locations->InAt(2); Primitive::Type value_type = instruction->GetComponentType(); - bool needs_runtime_call = locations->WillCall(); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -2186,9 +2792,27 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { break; } - case Primitive::kPrimInt: + case Primitive::kPrimInt: { + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + } else { + Register value = value_location.AsRegister<Register>(); + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + } + break; + } + case Primitive::kPrimNot: { - if (!needs_runtime_call) { + if (value_location.IsConstant()) { + // Just setting null. uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); if (index.IsConstant()) { data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; @@ -2196,48 +2820,110 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); __ Addu(base_reg, obj, base_reg); } - if (value_location.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); - __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); - DCHECK(!needs_write_barrier); - } else { - Register value = value_location.AsRegister<Register>(); - if (kPoisonHeapReferences && needs_write_barrier) { - // Note that in the case where `value` is a null reference, - // we do not enter this block, as a null reference does not - // need poisoning. - DCHECK_EQ(value_type, Primitive::kPrimNot); - // Use Sw() instead of StoreToOffset() in order to be able to - // hold the poisoned reference in AT and thus avoid allocating - // yet another temporary register. 
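Context for the poisoning in this removed block (and in the replacement code below): when kPoisonHeapReferences is set, ART stores compressed heap references in negated form, so a stray dereference of a poisoned value faults. A hedged sketch, assuming negation is indeed the poisoning function on this target:

    #include <cstdint>

    constexpr bool kPoisonHeapReferences = true;  // assumed build configuration

    // Negation is an involution in two's complement, -(-x) == x, so the same
    // operation both poisons and unpoisons a 32-bit compressed reference.
    inline uint32_t PoisonHeapReference(uint32_t ref) { return 0u - ref; }
    inline uint32_t UnpoisonHeapReference(uint32_t ref) { return 0u - ref; }

    inline uint32_t MaybeUnpoisonHeapReference(uint32_t ref) {
      return kPoisonHeapReferences ? UnpoisonHeapReference(ref) : ref;
    }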
- if (index.IsConstant()) { - if (!IsInt<16>(static_cast<int32_t>(data_offset))) { - int16_t low = Low16Bits(data_offset); - uint32_t high = data_offset - low; - __ Addiu32(TMP, obj, high); - base_reg = TMP; - data_offset = low; - } - } else { - DCHECK(IsInt<16>(static_cast<int32_t>(data_offset))); - } - __ PoisonHeapReference(AT, value); - __ Sw(AT, base_reg, data_offset); - null_checker(); + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + DCHECK_EQ(value, 0); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + DCHECK(!needs_write_barrier); + DCHECK(!may_need_runtime_call_for_type_check); + break; + } + + DCHECK(needs_write_barrier); + Register value = value_location.AsRegister<Register>(); + Register temp1 = locations->GetTemp(0).AsRegister<Register>(); + Register temp2 = TMP; // Doesn't need to survive slow path. + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + MipsLabel done; + SlowPathCodeMIPS* slow_path = nullptr; + + if (may_need_runtime_call_for_type_check) { + slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathMIPS(instruction); + codegen_->AddSlowPath(slow_path); + if (instruction->GetValueCanBeNull()) { + MipsLabel non_zero; + __ Bnez(value, &non_zero); + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); - } - if (needs_write_barrier) { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); } + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + __ B(&done); + __ Bind(&non_zero); } + + // Note that when read barriers are enabled, the type checks + // are performed without read barriers. This is fine, even in + // the case where a class object is in the from-space after + // the flip, as a comparison involving such a type would not + // produce a false positive; it may of course produce a false + // negative, in which case we would take the ArraySet slow + // path. + + // /* HeapReference<Class> */ temp1 = obj->klass_ + __ LoadFromOffset(kLoadWord, temp1, obj, class_offset, null_checker); + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); + // /* HeapReference<Class> */ temp2 = value->klass_ + __ LoadFromOffset(kLoadWord, temp2, value, class_offset); + // If heap poisoning is enabled, no need to unpoison `temp1` + // nor `temp2`, as we are comparing two poisoned references. + + if (instruction->StaticTypeOfArrayIsObjectArray()) { + MipsLabel do_put; + __ Beq(temp1, temp2, &do_put); + // If heap poisoning is enabled, the `temp1` reference has + // not been unpoisoned yet; unpoison it now. + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + // If heap poisoning is enabled, no need to unpoison + // `temp1`, as we are comparing against null below. 
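Lifted out of the assembly, the type check completed just below (the Beq/Bnez against the slow-path label) reduces to the following sketch; the struct layouts are illustrative stand-ins, not ART's mirror types:

    // Illustrative stand-ins for the mirror:: types referenced above.
    struct Class { const Class* super_class; const Class* component_type; };
    struct Object { const Class* klass; };

    // Returns true when the store may proceed without the ArraySet slow path.
    bool ArrayStoreFastPath(const Object* array, const Object* value,
                            bool static_type_is_object_array) {
      const Class* component = array->klass->component_type;   // temp1
      const Class* value_klass = value->klass;                  // temp2
      if (component == value_klass) {
        return true;  // exact match: do the put
      }
      if (static_type_is_object_array) {
        // Object[] case: also accept when the component type's superclass is
        // null, i.e. the component type is java.lang.Object.
        return component->super_class == nullptr;
      }
      return false;  // defer to the slow path
    }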
+ __ Bnez(temp1, slow_path->GetEntryLabel()); + __ Bind(&do_put); + } else { + __ Bne(temp1, temp2, slow_path->GetEntryLabel()); + } + } + + Register source = value; + if (kPoisonHeapReferences) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + __ Move(temp1, value); + __ PoisonHeapReference(temp1); + source = temp1; + } + + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK_EQ(value_type, Primitive::kPrimNot); - // Note: if heap poisoning is enabled, pAputObject takes care - // of poisoning the reference. - codegen_->InvokeRuntime(kQuickAputObject, instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); + } + __ StoreToOffset(kStoreWord, source, base_reg, data_offset); + + if (!may_need_runtime_call_for_type_check) { + codegen_->MaybeRecordImplicitNullCheck(instruction); + } + + codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + + if (done.IsLinked()) { + __ Bind(&done); + } + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); } break; } @@ -2327,6 +3013,23 @@ void InstructionCodeGeneratorMIPS::VisitBoundsCheck(HBoundsCheck* instruction) { __ Bgeu(index, length, slow_path->GetEntryLabel()); } +// Temp is used for read barrier. +static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) { + if (kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + return 1; + } + return 0; +} + +// Extra temp is used for read barrier. +static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { + return 1 + NumberOfInstanceOfTemps(type_check_kind); +} + void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); @@ -2337,7 +3040,7 @@ void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: case TypeCheckKind::kArrayObjectCheck: - call_kind = throws_into_catch + call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. 
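VisitBoundsCheck above gets away with a single Bgeu because an unsigned comparison folds both failure cases into one branch, equivalently:

    #include <cstdint>

    // A negative index wraps to a huge unsigned value, so one unsigned compare
    // covers both index < 0 and index >= length, which is what Bgeu exploits.
    bool IndexOutOfBounds(int32_t index, int32_t length) {
      return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
    }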
break; @@ -2351,15 +3054,20 @@ void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); + locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); - Register temp = locations->GetTemp(0).AsRegister<Register>(); + Location temp_loc = locations->GetTemp(0); + Register temp = temp_loc.AsRegister<Register>(); + const size_t num_temps = NumberOfCheckCastTemps(type_check_kind); + DCHECK_LE(num_temps, 2u); + Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation(); const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -2396,8 +3104,12 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kArrayCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Jump to slow path for throwing the exception or doing a // more involved array check. __ Bne(temp, cls, slow_path->GetEntryLabel()); @@ -2406,15 +3118,22 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. MipsLabel loop; __ Bind(&loop); // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadWord, temp, temp, super_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class reference currently in `temp` is null, jump to the slow path to throw the // exception. __ Beqz(temp, slow_path->GetEntryLabel()); @@ -2425,15 +3144,22 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Walk over the class hierarchy to find a match. 
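The loop emitted next is the assembly form of a plain superclass walk; roughly, with an illustrative stand-in type:

    struct Class { const Class* super_class; };

    // Follow super_class links until `cls` is found or the chain ends in null.
    bool IsSubclassOf(const Class* klass, const Class* cls) {
      for (const Class* k = klass; k != nullptr; k = k->super_class) {
        if (k == cls) {
          return true;  // Beq(temp, cls, &done)
        }
      }
      return false;  // null reached: throw via the slow path
    }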
MipsLabel loop; __ Bind(&loop); __ Beq(temp, cls, &done); // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadWord, temp, temp, super_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class reference currently in `temp` is null, jump to the slow path to throw the // exception. Otherwise, jump to the beginning of the loop. __ Bnez(temp, &loop); @@ -2443,14 +3169,21 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kArrayObjectCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Do an exact check. __ Beq(temp, cls, &done); // Otherwise, we need to check that the object's class is a non-primitive array. // /* HeapReference<Class> */ temp = temp->component_type_ - __ LoadFromOffset(kLoadWord, temp, temp, component_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + component_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the component type is null, jump to the slow path to throw the exception. __ Beqz(temp, slow_path->GetEntryLabel()); // Otherwise, the object is indeed an array, further check that this component @@ -2477,11 +3210,19 @@ void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { // Avoid read barriers to improve performance of the fast path. We can not get false // positives by doing this. // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // /* HeapReference<Class> */ temp = temp->iftable_ - __ LoadFromOffset(kLoadWord, temp, temp, iftable_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + temp_loc, + iftable_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Iftable is never null. __ Lw(TMP, temp, array_length_offset); // Loop through the iftable and check if any class matches. @@ -5032,8 +5773,15 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field Primitive::Type field_type = field_info.GetFieldType(); bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; + bool object_field_get_with_read_barrier = + kEmitCompilerReadBarrier && (field_type == Primitive::kPrimNot); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); + instruction, + generate_volatile + ? LocationSummary::kCallOnMainOnly + : (object_field_get_with_read_barrier + ? 
LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall)); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -5054,7 +5802,18 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field if (Primitive::IsFloatingPointType(instruction->GetType())) { locations->SetOut(Location::RequiresFpuRegister()); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object field get with + // read barriers enabled: we do not want the move to overwrite the + // object's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_field_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); } } } @@ -5064,7 +5823,9 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, uint32_t dex_pc) { Primitive::Type type = field_info.GetFieldType(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); + Location dst_loc = locations->Out(); LoadOperandType load_type = kLoadUnsignedByte; bool is_volatile = field_info.IsVolatile(); uint32_t offset = field_info.GetFieldOffset().Uint32Value(); @@ -5107,40 +5868,61 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, CheckEntrypointTypes<kQuickA64Load, int64_t, volatile const int64_t*>(); if (type == Primitive::kPrimDouble) { // FP results are returned in core registers. Need to move them. - Location out = locations->Out(); - if (out.IsFpuRegister()) { - __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), out.AsFpuRegister<FRegister>()); + if (dst_loc.IsFpuRegister()) { + __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), dst_loc.AsFpuRegister<FRegister>()); __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - out.AsFpuRegister<FRegister>()); + dst_loc.AsFpuRegister<FRegister>()); } else { - DCHECK(out.IsDoubleStackSlot()); + DCHECK(dst_loc.IsDoubleStackSlot()); __ StoreToOffset(kStoreWord, locations->GetTemp(1).AsRegister<Register>(), SP, - out.GetStackIndex()); + dst_loc.GetStackIndex()); __ StoreToOffset(kStoreWord, locations->GetTemp(2).AsRegister<Register>(), SP, - out.GetStackIndex() + 4); + dst_loc.GetStackIndex() + 4); } } } else { - if (!Primitive::IsFloatingPointType(type)) { + if (type == Primitive::kPrimNot) { + // /* HeapReference<Object> */ dst = *(obj + offset) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp_loc = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier call. 
+ codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + dst_loc, + obj, + offset, + temp_loc, + /* needs_null_check */ true); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + } else { + __ LoadFromOffset(kLoadWord, dst_loc.AsRegister<Register>(), obj, offset, null_checker); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, dst_loc, dst_loc, obj_loc, offset); + } + } else if (!Primitive::IsFloatingPointType(type)) { Register dst; if (type == Primitive::kPrimLong) { - DCHECK(locations->Out().IsRegisterPair()); - dst = locations->Out().AsRegisterPairLow<Register>(); + DCHECK(dst_loc.IsRegisterPair()); + dst = dst_loc.AsRegisterPairLow<Register>(); } else { - DCHECK(locations->Out().IsRegister()); - dst = locations->Out().AsRegister<Register>(); + DCHECK(dst_loc.IsRegister()); + dst = dst_loc.AsRegister<Register>(); } __ LoadFromOffset(load_type, dst, obj, offset, null_checker); - if (type == Primitive::kPrimNot) { - __ MaybeUnpoisonHeapReference(dst); - } } else { - DCHECK(locations->Out().IsFpuRegister()); - FRegister dst = locations->Out().AsFpuRegister<FRegister>(); + DCHECK(dst_loc.IsFpuRegister()); + FRegister dst = dst_loc.AsFpuRegister<FRegister>(); if (type == Primitive::kPrimFloat) { __ LoadSFromOffset(dst, obj, offset, null_checker); } else { @@ -5149,7 +5931,9 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, } } - if (is_volatile) { + // Memory barriers, in the case of references, are handled in the + // previous switch statement. + if (is_volatile && (type != Primitive::kPrimNot)) { GenerateMemoryBarrier(MemBarrierKind::kLoadAny); } } @@ -5290,7 +6074,6 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, } } - // TODO: memory barriers? if (needs_write_barrier) { Register src = value_location.AsRegister<Register>(); codegen_->MarkGCCard(obj, src, value_can_be_null); @@ -5320,14 +6103,133 @@ void InstructionCodeGeneratorMIPS::VisitInstanceFieldSet(HInstanceFieldSet* inst instruction->GetValueCanBeNull()); } -void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( - HInstruction* instruction ATTRIBUTE_UNUSED, - Location root, - Register obj, - uint32_t offset) { +void InstructionCodeGeneratorMIPS::GenerateReferenceLoadOneRegister( + HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + Register out_reg = out.AsRegister<Register>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + out_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `maybe_temp` before overwriting it + // in the following move operation, as we will need it for the + // read barrier below. 
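The copy is needed because this load is destructive: `out` serves as both the holder address and the destination. A sketch of the non-Baker path, with stand-in types, a no-op barrier stub, and the field offset fixed at 0 for brevity:

    struct Object { Object* field; };

    // Stand-in for the slow-path barrier; the real code branches to a
    // ReadBarrierForHeapReference slow path that calls artReadBarrierSlow.
    Object* ReadBarrierSlowStub(Object* ref, Object* holder, unsigned offset) {
      (void)holder;
      (void)offset;
      return ref;  // illustrative no-op
    }

    Object* LoadReferenceOneRegister(Object* out) {
      Object* holder = out;  // __ Move(maybe_temp, out_reg)
      out = out->field;      // out <- *(out + offset)
      return ReadBarrierSlowStub(out, holder, /* offset */ 0u);
    }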
+ __ Move(maybe_temp.AsRegister<Register>(), out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS::GenerateReferenceLoadTwoRegisters( + HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + Register out_reg = out.AsRegister<Register>(); + Register obj_reg = obj.AsRegister<Register>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset, + ReadBarrierOption read_barrier_option) { Register root_reg = root.AsRegister<Register>(); - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; + if (read_barrier_option == kWithReadBarrier) { + DCHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barrier are used: + // + // root = obj.field; + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path marking the GC root `root`. + Location temp = Location::RegisterLocation(T9); + SlowPathCodeMIPS* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS( + instruction, + root, + /*entrypoint*/ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. 
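The shape of this root load is worth restating: the per-register mark entrypoint doubles as the is-marking flag, so the null test on the loaded entrypoint replaces a separate GetIsGcMarking check. A sketch with stand-in types:

    struct Object;
    using MarkFn = Object* (*)(Object*);  // stand-in for pReadBarrierMarkRegXX

    Object* LoadGcRoot(Object** root_slot, MarkFn entrypoint /* may be null */) {
      Object* root = *root_slot;    // plain GC root load
      if (entrypoint != nullptr) {  // Bnez(temp, slow_path): GC is marking
        root = entrypoint(root);    // slow path marks/forwards the root
      }
      return root;
    }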
+ __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Bnez(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ Addiu32(root_reg, obj, offset); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } } else { // Plain GC root load with no read barrier. // /* GcRoot<mirror::Object> */ root = *(obj + offset) @@ -5337,6 +6239,226 @@ void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( } } +void CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + ScaleFactor no_scale_factor = TIMES_1; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + offset, + no_index, + no_scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + ScaleFactor scale_factor = TIMES_4; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + data_offset, + index, + scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
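Rendered as C++, the fast path described in that comment looks roughly as follows; the lock-word bit position and the mark entrypoint are assumptions for illustration, not ART's exact definitions:

    #include <atomic>
    #include <cstdint>

    struct Object;
    Object* ReadBarrierMark(Object* ref);  // stand-in for the mark entrypoint

    constexpr uint32_t kReadBarrierStateShift = 28;  // assumed bit position
    constexpr uint32_t kGrayState = 1;

    Object* BakerLoad(const std::atomic<uint32_t>* monitor, Object* const* src) {
      // Read the lock word *before* the reference; the Sync(0) emitted below
      // is the load-load barrier this ordering requires.
      uint32_t rb_state =
          (monitor->load(std::memory_order_relaxed) >> kReadBarrierStateShift) & 1u;
      std::atomic_thread_fence(std::memory_order_acquire);  // models Sync(0)
      Object* ref = *src;                                   // original load
      if (rb_state == kGrayState) {
        ref = ReadBarrierMark(ref);  // runtime slow path marks the reference
      }
      return ref;
    }

    // The emitted code tests "gray" without masking: shift the state bit into
    // the sign bit and branch on less-than-zero (Sll by 31 - shift, then Bltz).
    bool IsGray(uint32_t lock_word) {
      return static_cast<int32_t>(lock_word << (31 - kReadBarrierStateShift)) < 0;
    }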
+ + Register ref_reg = ref.AsRegister<Register>(); + Register temp_reg = temp.AsRegister<Register>(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + __ Sync(0); // Barrier to prevent load-load reordering. + + // The actual reference load. + if (index.IsValid()) { + // Load types involving an "index": ArrayGet, + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics. + // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + if (index.IsConstant()) { + size_t computed_offset = + (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; + __ LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset); + } else { + // Handle the special case of the + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics, which use a register pair as index ("long + // offset"), of which only the low part contains data. + Register index_reg = index.IsRegisterPair() + ? index.AsRegisterPairLow<Register>() + : index.AsRegister<Register>(); + __ Sll(TMP, index_reg, scale_factor); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadWord, ref_reg, TMP, offset); + } + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + __ LoadFromOffset(kLoadWord, ref_reg, obj, offset); + } + + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); + + // Slow path marking the object `ref` when it is gray. + SlowPathCodeMIPS* slow_path; + if (always_update_field) { + // ReadBarrierMarkAndUpdateFieldSlowPathMIPS only supports address + // of the form `obj + field_offset`, where `obj` is a register and + // `field_offset` is a register pair (of which only the lower half + // is used). Thus `offset` and `scale_factor` above are expected + // to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + slow_path = new (GetGraph()->GetArena()) + ReadBarrierMarkAndUpdateFieldSlowPathMIPS(instruction, + ref, + obj, + /* field_offset */ index, + temp_reg); + } else { + slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS(instruction, ref); + } + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit into the sign bit (31) and + // performing a branch on less than zero. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + static_assert(LockWord::kReadBarrierStateSize == 1, "Expecting 1-bit read barrier state size"); + __ Sll(temp_reg, temp_reg, 31 - LockWord::kReadBarrierStateShift); + __ Bltz(temp_reg, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the reference load. 
+ // + // If heap poisoning is enabled, the unpoisoning of the loaded + // reference will be carried out by the runtime within the slow + // path. + // + // Note that `ref` currently does not get unpoisoned (when heap + // poisoning is enabled), which is alright as the `ref` argument is + // not used by the artReadBarrierSlow entry point. + // + // TODO: Unpoison `ref` when it is used by artReadBarrierSlow. + SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) + ReadBarrierForHeapReferenceSlowPathMIPS(instruction, out, ref, obj, offset, index); + AddSlowPath(slow_path); + + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier). + DCHECK(!kUseBakerReadBarrier); + // If heap poisoning is enabled, unpoisoning will be taken care of + // by the runtime within the slow path. + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); + } else if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(out.AsRegister<Register>()); + } +} + +void CodeGeneratorMIPS::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the GC root load. + // + // Note that GC roots are not affected by heap poisoning, so we do + // not need to do anything special for this here. + SlowPathCodeMIPS* slow_path = + new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathMIPS(instruction, out, root); + AddSlowPath(slow_path); + + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); @@ -5345,7 +6467,8 @@ void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: case TypeCheckKind::kArrayObjectCheck: - call_kind = LocationSummary::kNoCall; + call_kind = + kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; break; case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: @@ -5360,14 +6483,20 @@ void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { // The output does overlap inputs. // Note that TypeCheckSlowPathMIPS uses this register too. 
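MaybeGenerateReadBarrierSlow, defined just above, encodes a three-way policy that is easy to lose in the assembly; a compact restatement under assumed configuration flags:

    #include <cstdint>

    constexpr bool kEmitCompilerReadBarrier = false;  // assumed configuration
    constexpr bool kPoisonHeapReferences = true;      // assumed configuration

    uint32_t MaybeReadBarrier(uint32_t loaded_ref) {
      if (kEmitCompilerReadBarrier) {
        // Non-Baker read barriers branch unconditionally to the
        // ReadBarrierForHeapReference slow path; unpoisoning then happens in
        // the runtime.
      } else if (kPoisonHeapReferences) {
        loaded_ref = 0u - loaded_ref;  // just unpoison (negation, see earlier sketch)
      }
      return loaded_ref;
    }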
locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind); + DCHECK_LE(num_temps, 1u); + Location maybe_temp_loc = (num_temps >= 1) ? locations->GetTemp(0) : Location::NoLocation(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -5385,8 +6514,12 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { switch (type_check_kind) { case TypeCheckKind::kExactCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Classes must be equal for the instanceof to succeed. __ Xor(out, out, cls); __ Sltiu(out, out, 1); @@ -5395,15 +6528,22 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. MipsLabel loop; __ Bind(&loop); // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadWord, out, out, super_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If `out` is null, we use it for the result, and jump to `done`. __ Beqz(out, &done); __ Bne(out, cls, &loop); @@ -5413,15 +6553,22 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Walk over the class hierarchy to find a match. MipsLabel loop, success; __ Bind(&loop); __ Beq(out, cls, &success); // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadWord, out, out, super_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); __ Bnez(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. 
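The kExactCheck arm above materializes the boolean without a branch; the Xor/Sltiu pair is equivalent to:

    #include <cstdint>

    // Xor(out, out, cls) leaves 0 iff the operands were equal;
    // Sltiu(out, out, 1) then computes "unsigned less than 1", i.e. "== 0".
    int32_t ClassesEqual(uint32_t out, uint32_t cls) {
      uint32_t x = out ^ cls;
      return (x < 1u) ? 1 : 0;
    }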
__ B(&done); @@ -5432,15 +6579,22 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Do an exact check. MipsLabel success; __ Beq(out, cls, &success); // Otherwise, we need to check that the object's class is a non-primitive array. // /* HeapReference<Class> */ out = out->component_type_ - __ LoadFromOffset(kLoadWord, out, out, component_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + component_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If `out` is null, we use it for the result, and jump to `done`. __ Beqz(out, &done); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); @@ -5455,8 +6609,12 @@ void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayCheck: { // No read barrier since the slow path will retry upon failure. // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kWithoutReadBarrier); DCHECK(locations->OnlyCallsOnSlowPath()); slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction, /* is_fatal */ false); @@ -5627,9 +6785,6 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS* codegen HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization // is incompatible with it. // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods @@ -5665,9 +6820,6 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization // is incompatible with it. bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); @@ -5916,12 +7068,13 @@ void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - Location::RegisterLocation(V0)); + calling_convention.GetReturnLocation(Primitive::kPrimNot)); return; } DCHECK(!cls->NeedsAccessCheck()); - LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage(); + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier) ? 
LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); @@ -5976,6 +7129,9 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; } + const ReadBarrierOption read_barrier_option = cls->IsInBootImage() + ? kWithoutReadBarrier + : kCompilerReadBarrierOption; bool generate_null_check = false; switch (load_kind) { case HLoadClass::LoadKind::kReferrersClass: { @@ -5985,11 +7141,13 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF GenerateGcRootFieldLoad(cls, out_loc, base_or_current_method_reg, - ArtMethod::DeclaringClassOffset().Int32Value()); + ArtMethod::DeclaringClassOffset().Int32Value(), + read_barrier_option); break; } case HLoadClass::LoadKind::kBootImageLinkTimeAddress: DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); __ LoadLiteral(out, base_or_current_method_reg, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), @@ -5997,6 +7155,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); CodeGeneratorMIPS::PcRelativePatchInfo* info = codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); bool reordering = __ SetReorder(false); @@ -6006,7 +7165,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; } case HLoadClass::LoadKind::kBootImageAddress: { - DCHECK(!kEmitCompilerReadBarrier); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); @@ -6020,7 +7179,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex()); bool reordering = __ SetReorder(false); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); __ SetReorder(reordering); generate_null_check = true; break; @@ -6032,7 +7191,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF bool reordering = __ SetReorder(false); __ Bind(&info->high_label); __ Lui(out, /* placeholder */ 0x1234); - GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); __ SetReorder(reordering); break; } @@ -6165,7 +7324,11 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); bool reordering = __ SetReorder(false); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - GenerateGcRootFieldLoad(load, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); __ SetReorder(reordering); SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); codegen_->AddSlowPath(slow_path); @@ -6181,7 +7344,11 @@ void 
InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ bool reordering = __ SetReorder(false); __ Bind(&info->high_label); __ Lui(out, /* placeholder */ 0x1234); - GenerateGcRootFieldLoad(load, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); __ SetReorder(reordering); return; } diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 98fee24a74..3875c4bdba 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -241,6 +241,38 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { uint32_t dex_pc, bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a read barrier and + // shall be a register in that case; it may be an invalid location + // otherwise. + void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a Baker's (fast + // path) read barrier and shall be a register in that case; it may + // be an invalid location otherwise. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a GC root reference load: // // root <- *(obj + offset) @@ -249,7 +281,9 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, Register obj, - uint32_t offset); + uint32_t offset, + ReadBarrierOption read_barrier_option); + void GenerateIntCompare(IfCondition cond, LocationSummary* locations); // When the function returns `false` it means that the condition holds if `dst` is non-zero // and doesn't hold if `dst` is zero. If it returns `true`, the roles of zero and non-zero @@ -353,6 +387,91 @@ class CodeGeneratorMIPS : public CodeGenerator { void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check); + + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. 
+ // + // Load the object reference located at the address + // `obj + offset + (index << scale_factor)`, held by object `obj`, into + // `ref`, and mark it if needed. + // + // If `always_update_field` is true, the value of the reference is + // atomically updated in the holder (`obj`). + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field = false); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. + // + // A read barrier for an object reference read from the heap is + // implemented as a call to the artReadBarrierSlow runtime entry + // point, which is passed the values in locations `ref`, `obj`, and + // `offset`: + // + // mirror::Object* artReadBarrierSlow(mirror::Object* ref, + // mirror::Object* obj, + // uint32_t offset); + // + // The `out` location contains the value returned by + // artReadBarrierSlow. + // + // When `index` is provided (i.e. for array accesses), the offset + // value passed to artReadBarrierSlow is adjusted to take `index` + // into account. + void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. + // + // A read barrier for an object reference GC root is implemented as + // a call to the artReadBarrierForRootSlow runtime entry point, + // which is passed the value in location `root`: + // + // mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root); + // + // The `out` location contains the value returned by + // artReadBarrierForRootSlow. + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); + void MarkGCCard(Register object, Register value, bool value_can_be_null); // Register allocation. @@ -400,6 +519,15 @@ class CodeGeneratorMIPS : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path = nullptr) OVERRIDE; + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. 
+ void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path, + bool direct); + + void GenerateInvokeRuntime(int32_t entry_point_offset, bool direct); + ParallelMoveResolver* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type) const OVERRIDE { diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index c82533bc7d..78b31e9e86 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -407,6 +407,528 @@ class DeoptimizationSlowPathMIPS64 : public SlowPathCodeMIPS64 { DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS64); }; +class ArraySetSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + explicit ArraySetSlowPathMIPS64(HInstruction* instruction) : SlowPathCodeMIPS64(instruction) {} + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove( + locations->InAt(0), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove( + locations->InAt(1), + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimInt, + nullptr); + parallel_move.AddMove( + locations->InAt(2), + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimNot, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(&parallel_move); + + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + mips64_codegen->InvokeRuntime(kQuickAputObject, instruction_, instruction_->GetDexPc(), this); + CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ArraySetSlowPathMIPS64"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathMIPS64); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking (see +// ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. The case where the entrypoint is passed in +// is for the GcRoot read barrier.
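The to-space/from-space behaviour documented above can be modeled outside ART. Below is a minimal standalone sketch (plain C++; the forwarding map is a stand-in for the collector's internal state, and none of these names are ART APIs) showing that marking refreshes `ref` while the holder's field keeps its stale from-space value:

    #include <cassert>
    #include <unordered_map>

    struct Obj { Obj* field = nullptr; };

    // Stand-in for the collector's from-space -> to-space forwarding state.
    std::unordered_map<Obj*, Obj*> forwarding;

    // Toy analogue of a ReadBarrierMark entrypoint: returns the to-space
    // copy of `ref`, but never touches the field the reference came from.
    Obj* Mark(Obj* ref) {
      auto it = forwarding.find(ref);
      return (it != forwarding.end()) ? it->second : ref;
    }

    int main() {
      Obj from_space, to_space, holder;
      holder.field = &from_space;
      forwarding[&from_space] = &to_space;

      Obj* ref = holder.field;                // original reference load
      ref = Mark(ref);                        // slow path: `ref` is now up-to-date...
      assert(ref == &to_space);
      assert(holder.field == &from_space);    // ...but the holder still points to from-space
      return 0;
    }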
+class ReadBarrierMarkSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierMarkSlowPathMIPS64(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeMIPS64(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathMIPS64"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + GpuRegister ref_reg = ref_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T2) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == S8)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + mips64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + DCHECK_EQ(entrypoint_.AsRegister<GpuRegister>(), T9); + __ Jalr(entrypoint_.AsRegister<GpuRegister>()); + __ Nop(); + } else { + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this); + } + __ Bc(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if already loaded. + const Location entrypoint_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathMIPS64); +}; + +// Slow path marking an object reference `ref` during a read barrier, +// and if needed, atomically updating the field `obj.field` in the +// object `obj` holding this reference after marking (contrary to +// ReadBarrierMarkSlowPathMIPS64 above, which never tries to update +// `obj.field`). +// +// This means that after the execution of this slow path, both `ref` +// and `obj.field` will be up-to-date; i.e., after the flip, both will +// hold the same to-space reference (unless another thread installed +// another object reference (different from `ref`) in `obj.field`).
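A portable analogue of this mark-and-update step, with std::atomic standing in for the LL/SC loop the slow path emits (all names here are illustrative, not ART code):

    #include <atomic>
    #include <cassert>

    struct Obj {};

    // Stand-in for the holder's field at `obj + field_offset`.
    std::atomic<Obj*> field;

    // After marking, install the possibly-moved reference, but only if the
    // field still holds the old one; a concurrent store by another thread
    // makes the CAS fail, which is fine (the field is already newer).
    void UpdateFieldAfterMark(Obj* old_ref, Obj* new_ref) {
      if (old_ref == new_ref) {
        return;  // mirrors the Beqc(temp1_, ref_reg, &done) early exit below
      }
      Obj* expected = old_ref;
      // Strong CAS with relaxed ordering, as described above.
      field.compare_exchange_strong(expected, new_ref, std::memory_order_relaxed);
    }

    int main() {
      Obj a, b;
      field.store(&a, std::memory_order_relaxed);
      UpdateFieldAfterMark(&a, &b);
      assert(field.load(std::memory_order_relaxed) == &b);
      return 0;
    }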
+class ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierMarkAndUpdateFieldSlowPathMIPS64(HInstruction* instruction, + Location ref, + GpuRegister obj, + Location field_offset, + GpuRegister temp1) + : SlowPathCodeMIPS64(instruction), + ref_(ref), + obj_(obj), + field_offset_(field_offset), + temp1_(temp1) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "ReadBarrierMarkAndUpdateFieldSlowPathMIPS64"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + GpuRegister ref_reg = ref_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking and field updating slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); + DCHECK(field_offset_.IsRegister()) << field_offset_; + + __ Bind(GetEntryLabel()); + + // Save the old reference. + // Note that we cannot use AT or TMP to save the old reference, as those + // are used by the code that follows, but we need the old reference after + // the call to the ReadBarrierMarkRegX entry point. + DCHECK_NE(temp1_, AT); + DCHECK_NE(temp1_, TMP); + __ Move(temp1_, ref_reg); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T2) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == S8)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this); + + // If the new reference is different from the old reference, + // update the field in the holder (`*(obj_ + field_offset_)`). + // + // Note that this field could also hold a different object, if + // another thread had concurrently changed it. In that case, the + // compare-and-set (CAS) loop below would abort, leaving the + // field as-is. + Mips64Label done; + __ Beqc(temp1_, ref_reg, &done); + + // Update the holder's field atomically. This may fail if another + // thread updates the field before us, but that's OK. This is achieved + // using a strong compare-and-set (CAS) operation with relaxed + // memory synchronization ordering, where the expected value is + // the old reference and the desired value is the new reference. + + // Convenience aliases.
+ GpuRegister base = obj_; + GpuRegister offset = field_offset_.AsRegister<GpuRegister>(); + GpuRegister expected = temp1_; + GpuRegister value = ref_reg; + GpuRegister tmp_ptr = TMP; // Pointer to actual memory. + GpuRegister tmp = AT; // Value in memory. + + __ Daddu(tmp_ptr, base, offset); + + if (kPoisonHeapReferences) { + __ PoisonHeapReference(expected); + // Do not poison `value` if it is the same register as + // `expected`, which has just been poisoned. + if (value != expected) { + __ PoisonHeapReference(value); + } + } + + // do { + // tmp = [r_ptr] - expected; + // } while (tmp == 0 && failure([r_ptr] <- r_new_value)); + + Mips64Label loop_head, exit_loop; + __ Bind(&loop_head); + __ Ll(tmp, tmp_ptr); + // The LL instruction sign-extends the 32-bit value, but + // 32-bit references must be zero-extended. Zero-extend `tmp`. + __ Dext(tmp, tmp, 0, 32); + __ Bnec(tmp, expected, &exit_loop); + __ Move(tmp, value); + __ Sc(tmp, tmp_ptr); + __ Beqzc(tmp, &loop_head); + __ Bind(&exit_loop); + + if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value` if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } + + __ Bind(&done); + __ Bc(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + // The register containing the object holding the marked object reference field. + const GpuRegister obj_; + // The location of the offset of the marked reference field within `obj_`. + Location field_offset_; + + const GpuRegister temp1_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathMIPS64); +}; + +// Slow path generating a read barrier for a heap reference. +class ReadBarrierForHeapReferenceSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierForHeapReferenceSlowPathMIPS64(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) + : SlowPathCodeMIPS64(instruction), + out_(out), + ref_(ref), + obj_(obj), + offset_(offset), + index_(index) { + DCHECK(kEmitCompilerReadBarrier); + // If `obj` is equal to `out` or `ref`, it means the initial object + // has been overwritten by (or after) the heap object reference load + // to be instrumented, e.g.: + // + // __ LoadFromOffset(kLoadWord, out, out, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); + // + // In that case, we have lost the information about the original + // object, and the emitted read barrier cannot work properly. 
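The aliasing hazard behind those constructor checks can be seen with registers modeled as plain variables (illustrative only; not ART code):

    #include <cassert>

    struct Obj { Obj* field = nullptr; };

    int main() {
      Obj holder, target;
      holder.field = &target;

      // Two registers: after `out = obj->field`, `obj` still names the
      // holder, so a slow-path barrier can be fed (ref, obj, offset).
      Obj* obj = &holder;
      Obj* out = obj->field;
      assert(obj == &holder && out == &target);

      // One register: `out = out->field` destroys the holder's address
      // before the barrier can use it; nothing is left to pass as `obj`.
      // This is the pattern the DCHECKs below reject.
      Obj* out_and_obj = &holder;
      out_and_obj = out_and_obj->field;  // holder's address is gone
      assert(out_and_obj == &target);
      return 0;
    }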
+ DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out; + DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + Primitive::Type type = Primitive::kPrimNot; + GpuRegister reg_out = out_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + // We may have to change the index's value, but as `index_` is a + // constant member (like other "inputs" of this slow path), + // introduce a copy of it, `index`. + Location index = index_; + if (index_.IsValid()) { + // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics. + if (instruction_->IsArrayGet()) { + // Compute the actual memory offset and store it in `index`. + GpuRegister index_reg = index_.AsRegister<GpuRegister>(); + DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg)); + if (codegen->IsCoreCalleeSaveRegister(index_reg)) { + // We are about to change the value of `index_reg` (see the + // calls to art::mips64::Mips64Assembler::Sll and + // art::mips64::Mips64Assembler::Addiu32 below), but it has + // not been saved by the previous call to + // art::SlowPathCode::SaveLiveRegisters, as it is a + // callee-save register -- + // art::SlowPathCode::SaveLiveRegisters does not consider + // callee-save registers, as it has been designed with the + // assumption that callee-save registers are supposed to be + // handled by the called function. So, as a callee-save + // register, `index_reg` _would_ eventually be saved onto + // the stack, but it would be too late: we would have + // changed its value earlier. Therefore, we manually save + // it here into another freely available register, + // `free_reg`, chosen of course among the caller-save + // registers (as a callee-save `free_reg` register would + // exhibit the same problem). + // + // Note we could have requested a temporary register from + // the register allocator instead; but we prefer not to, as + // this is a slow path, and we know we can find a + // caller-save register that is available. + GpuRegister free_reg = FindAvailableCallerSaveRegister(codegen); + __ Move(free_reg, index_reg); + index_reg = free_reg; + index = Location::RegisterLocation(index_reg); + } else { + // The initial register stored in `index_` has already been + // saved in the call to art::SlowPathCode::SaveLiveRegisters + // (as it is not a callee-save register), so we can freely + // use it. + } + // Shifting the index value contained in `index_reg` by the scale + // factor (2) cannot overflow in practice, as the runtime is + // unable to allocate object arrays with a size larger than + // 2^26 - 1 (that is, 2^28 - 4 bytes).
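The bound quoted in that comment can be restated as a compile-time check; the constants below are taken from the comment itself, not from the ART headers:

    #include <cstdint>

    constexpr int64_t kMaxObjArrayLength = (INT64_C(1) << 26) - 1;  // 2^26 - 1 elements
    constexpr int64_t kHeapRefSize = 4;                             // 32-bit HeapReference
    constexpr int64_t kMaxDataBytes = kMaxObjArrayLength * kHeapRefSize;

    static_assert(kMaxDataBytes == (INT64_C(1) << 28) - 4,
                  "matches the 2^28 - 4 bytes quoted above");
    // Even after adding a small data offset, the scaled index stays well
    // below 2^31, so the 32-bit Sll + Addiu32 sequence cannot overflow.
    static_assert(kMaxDataBytes + 4096 < (INT64_C(1) << 31), "fits in int32_t");

    int main() { return 0; }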
+ __ Sll(index_reg, index_reg, TIMES_4); + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + __ Addiu32(index_reg, index_reg, offset_); + } else { + // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile + // intrinsics, `index_` is not shifted by a scale factor of 2 + // (as in the case of ArrayGet), as it is actually an offset + // to an object field within an object. + DCHECK(instruction_->IsInvoke()) << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) || + (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)) + << instruction_->AsInvoke()->GetIntrinsic(); + DCHECK_EQ(offset_, 0U); + DCHECK(index_.IsRegister()); + } + } + + // We're moving two or three locations to locations that could + // overlap, so we need a parallel move resolver. + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove(ref_, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove(obj_, + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimNot, + nullptr); + if (index.IsValid()) { + parallel_move.AddMove(index, + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimInt, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(&parallel_move); + } else { + codegen->GetMoveResolver()->EmitNativeCode(&parallel_move); + __ LoadConst32(calling_convention.GetRegisterAt(2), offset_); + } + mips64_codegen->InvokeRuntime(kQuickReadBarrierSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes< + kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>(); + mips64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); + + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { + return "ReadBarrierForHeapReferenceSlowPathMIPS64"; + } + + private: + GpuRegister FindAvailableCallerSaveRegister(CodeGenerator* codegen) { + size_t ref = static_cast<size_t>(ref_.AsRegister<GpuRegister>()); + size_t obj = static_cast<size_t>(obj_.AsRegister<GpuRegister>()); + for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) { + if (i != ref && + i != obj && + !codegen->IsCoreCalleeSaveRegister(i) && + !codegen->IsBlockedCoreRegister(i)) { + return static_cast<GpuRegister>(i); + } + } + // We shall never fail to find a free caller-save register, as + // there are more than two core caller-save registers on MIPS64 + // (meaning it is possible to find one which is different from + // `ref` and `obj`). + DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u); + LOG(FATAL) << "Could not find a free caller-save register"; + UNREACHABLE(); + } + + const Location out_; + const Location ref_; + const Location obj_; + const uint32_t offset_; + // An additional location containing an index to an array. + // Only used for HArrayGet and the UnsafeGetObject & + // UnsafeGetObjectVolatile intrinsics. + const Location index_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathMIPS64); +}; + +// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierForRootSlowPathMIPS64(HInstruction* instruction, Location out, Location root) + : SlowPathCodeMIPS64(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Primitive::Type type = Primitive::kPrimNot; + GpuRegister reg_out = out_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + mips64_codegen->MoveLocation(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + root_, + Primitive::kPrimNot); + mips64_codegen->InvokeRuntime(kQuickReadBarrierForRootSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>(); + mips64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); + + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathMIPS64"; } + + private: + const Location out_; + const Location root_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathMIPS64); +}; + CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, const Mips64InstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -1140,23 +1662,32 @@ void CodeGeneratorMIPS64::InvokeRuntime(QuickEntrypointEnum entrypoint, uint32_t dex_pc, SlowPathCode* slow_path) { ValidateInvokeRuntime(entrypoint, instruction, slow_path); - __ LoadFromOffset(kLoadDoubleword, - T9, - TR, - GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value()); - __ Jalr(T9); - __ Nop(); + GenerateInvokeRuntime(GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value()); if (EntrypointRequiresStackMap(entrypoint)) { RecordPcInfo(instruction, dex_pc, slow_path); } } +void CodeGeneratorMIPS64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + GenerateInvokeRuntime(entry_point_offset); +} + +void CodeGeneratorMIPS64::GenerateInvokeRuntime(int32_t entry_point_offset) { + __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset); + __ Jalr(T9); + __ Nop(); +} + void InstructionCodeGeneratorMIPS64::GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg) { __ LoadFromOffset(kLoadWord, TMP, class_reg, mirror::Class::StatusOffset().Int32Value()); __ LoadConst32(AT, mirror::Class::kStatusInitialized); __ Bltc(TMP, AT, slow_path->GetEntryLabel()); - // TODO: barrier needed? + // Even if the initialized flag is set, we need to ensure consistent memory ordering. 
+ __ Sync(0); __ Bind(slow_path->GetExitLabel()); } @@ -1447,14 +1978,31 @@ void InstructionCodeGeneratorMIPS64::VisitAnd(HAnd* instruction) { } void LocationsBuilderMIPS64::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + bool object_array_get_with_read_barrier = + kEmitCompilerReadBarrier && (type == Primitive::kPrimNot); LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, + object_array_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->GetType())) { + if (Primitive::IsFloatingPointType(type)) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object array get with + // read barriers enabled: we do not want the move to overwrite the + // array's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_array_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier. + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->AddTemp(Location::RequiresRegister()); } } @@ -1467,7 +2015,9 @@ static auto GetImplicitNullChecker(HInstruction* instruction, CodeGeneratorMIPS6 void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); + Location out_loc = locations->Out(); Location index = locations->InAt(1); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -1477,7 +2027,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { instruction->IsStringCharAt(); switch (type) { case Primitive::kPrimBoolean: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1490,7 +2040,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimByte: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1503,7 +2053,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimShort: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; @@ -1517,7 +2067,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } case 
Primitive::kPrimChar: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (maybe_compressed_char_at) { uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); __ LoadFromOffset(kLoadWord, TMP, obj, count_offset, null_checker); @@ -1570,10 +2120,9 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { break; } - case Primitive::kPrimInt: - case Primitive::kPrimNot: { + case Primitive::kPrimInt: { DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t)); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); LoadOperandType load_type = (type == Primitive::kPrimNot) ? kLoadUnsignedWord : kLoadWord; if (index.IsConstant()) { size_t offset = @@ -1587,8 +2136,53 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { break; } + case Primitive::kPrimNot: { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ out = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier call. + codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + index, + temp, + /* needs_null_check */ true); + } else { + GpuRegister out = out_loc.AsRegister<GpuRegister>(); + if (index.IsConstant()) { + size_t offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; + __ LoadFromOffset(kLoadUnsignedWord, out, obj, offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); + } else { + __ Sll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadUnsignedWord, out, TMP, data_offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). 
+ codegen_->MaybeGenerateReadBarrierSlow(instruction, + out_loc, + out_loc, + obj_loc, + data_offset, + index); + } + } + break; + } + case Primitive::kPrimLong: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -1602,7 +2196,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimFloat: { - FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + FpuRegister out = out_loc.AsFpuRegister<FpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -1616,7 +2210,7 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimDouble: { - FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + FpuRegister out = out_loc.AsFpuRegister<FpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -1633,11 +2227,6 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { LOG(FATAL) << "Unreachable type " << instruction->GetType(); UNREACHABLE(); } - - if (type == Primitive::kPrimNot) { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - __ MaybeUnpoisonHeapReference(out); - } } void LocationsBuilderMIPS64::VisitArrayLength(HArrayLength* instruction) { @@ -1679,23 +2268,28 @@ Location LocationsBuilderMIPS64::FpuRegisterOrConstantForStore(HInstruction* ins } void LocationsBuilderMIPS64::VisitArraySet(HArraySet* instruction) { - bool needs_runtime_call = instruction->NeedsTypeCheck(); + Primitive::Type value_type = instruction->GetComponentType(); + + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); - if (needs_runtime_call) { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); + may_need_runtime_call_for_type_check ? + LocationSummary::kCallOnSlowPath : + LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { + locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); } else { - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { - locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); - } else { - locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); - } + locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); + } + if (needs_write_barrier) { + // Temporary register for the write barrier. 
+ locations->AddTemp(Location::RequiresRegister()); // Possibly used for ref. poisoning too. } } @@ -1705,7 +2299,7 @@ void InstructionCodeGeneratorMIPS64::VisitArraySet(HArraySet* instruction) { Location index = locations->InAt(1); Location value_location = locations->InAt(2); Primitive::Type value_type = instruction->GetComponentType(); - bool needs_runtime_call = locations->WillCall(); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -1749,68 +2343,138 @@ void InstructionCodeGeneratorMIPS64::VisitArraySet(HArraySet* instruction) { break; } - case Primitive::kPrimInt: + case Primitive::kPrimInt: { + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + } else { + GpuRegister value = value_location.AsRegister<GpuRegister>(); + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + } + break; + } + case Primitive::kPrimNot: { - if (!needs_runtime_call) { + if (value_location.IsConstant()) { + // Just setting null. uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); if (index.IsConstant()) { data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK(index.IsRegister()) << index; __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); __ Daddu(base_reg, obj, base_reg); } - if (value_location.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); - __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); - DCHECK(!needs_write_barrier); - } else { - GpuRegister value = value_location.AsRegister<GpuRegister>(); - if (kPoisonHeapReferences && needs_write_barrier) { - // Note that in the case where `value` is a null reference, - // we do not enter this block, as a null reference does not - // need poisoning. - DCHECK_EQ(value_type, Primitive::kPrimNot); - // Use Sw() instead of StoreToOffset() in order to be able to - // hold the poisoned reference in AT and thus avoid allocating - // yet another temporary register. - if (index.IsConstant()) { - if (!IsInt<16>(static_cast<int32_t>(data_offset))) { - int16_t low16 = Low16Bits(data_offset); - // For consistency with StoreToOffset() and such treat data_offset as int32_t. - uint64_t high48 = static_cast<uint64_t>(static_cast<int32_t>(data_offset)) - low16; - int16_t upper16 = High16Bits(high48); - // Allow the full [-2GB,+2GB) range in case `low16` is negative and needs a - // compensatory 64KB added, which may push `high48` above 2GB and require - // the dahi instruction. - int16_t higher16 = High32Bits(high48) + ((upper16 < 0) ? 
1 : 0); - __ Daui(TMP, obj, upper16); - if (higher16 != 0) { - __ Dahi(TMP, higher16); - } - base_reg = TMP; - data_offset = low16; - } - } else { - DCHECK(IsInt<16>(static_cast<int32_t>(data_offset))); - } - __ PoisonHeapReference(AT, value); - __ Sw(AT, base_reg, data_offset); - null_checker(); + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + DCHECK_EQ(value, 0); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + DCHECK(!needs_write_barrier); + DCHECK(!may_need_runtime_call_for_type_check); + break; + } + + DCHECK(needs_write_barrier); + GpuRegister value = value_location.AsRegister<GpuRegister>(); + GpuRegister temp1 = locations->GetTemp(0).AsRegister<GpuRegister>(); + GpuRegister temp2 = TMP; // Doesn't need to survive slow path. + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + Mips64Label done; + SlowPathCodeMIPS64* slow_path = nullptr; + + if (may_need_runtime_call_for_type_check) { + slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathMIPS64(instruction); + codegen_->AddSlowPath(slow_path); + if (instruction->GetValueCanBeNull()) { + Mips64Label non_zero; + __ Bnezc(value, &non_zero); + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); - } - if (needs_write_barrier) { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); } + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + __ Bc(&done); + __ Bind(&non_zero); + } + + // Note that when read barriers are enabled, the type checks + // are performed without read barriers. This is fine, even in + // the case where a class object is in the from-space after + // the flip, as a comparison involving such a type would not + // produce a false positive; it may of course produce a false + // negative, in which case we would take the ArraySet slow + // path. + + // /* HeapReference<Class> */ temp1 = obj->klass_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, obj, class_offset, null_checker); + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, temp1, component_offset); + // /* HeapReference<Class> */ temp2 = value->klass_ + __ LoadFromOffset(kLoadUnsignedWord, temp2, value, class_offset); + // If heap poisoning is enabled, no need to unpoison `temp1` + // nor `temp2`, as we are comparing two poisoned references. + + if (instruction->StaticTypeOfArrayIsObjectArray()) { + Mips64Label do_put; + __ Beqc(temp1, temp2, &do_put); + // If heap poisoning is enabled, the `temp1` reference has + // not been unpoisoned yet; unpoison it now. + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, temp1, super_offset); + // If heap poisoning is enabled, no need to unpoison + // `temp1`, as we are comparing against null below. 
+ __ Bnezc(temp1, slow_path->GetEntryLabel()); + __ Bind(&do_put); + } else { + __ Bnec(temp1, temp2, slow_path->GetEntryLabel()); } + } + + GpuRegister source = value; + if (kPoisonHeapReferences) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + __ Move(temp1, value); + __ PoisonHeapReference(temp1); + source = temp1; + } + + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK_EQ(value_type, Primitive::kPrimNot); - // Note: if heap poisoning is enabled, pAputObject takes care - // of poisoning the reference. - codegen_->InvokeRuntime(kQuickAputObject, instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + __ StoreToOffset(kStoreWord, source, base_reg, data_offset); + + if (!may_need_runtime_call_for_type_check) { + codegen_->MaybeRecordImplicitNullCheck(instruction); + } + + codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + + if (done.IsLinked()) { + __ Bind(&done); + } + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); } break; } @@ -1900,6 +2564,23 @@ void InstructionCodeGeneratorMIPS64::VisitBoundsCheck(HBoundsCheck* instruction) __ Bgeuc(index, length, slow_path->GetEntryLabel()); } +// Temp is used for read barrier. +static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) { + if (kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + return 1; + } + return 0; +} + +// Extra temp is used for read barrier. +static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { + return 1 + NumberOfInstanceOfTemps(type_check_kind); +} + void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); @@ -1910,7 +2591,7 @@ void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: case TypeCheckKind::kArrayObjectCheck: - call_kind = throws_into_catch + call_kind = (throws_into_catch || kEmitCompilerReadBarrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. 
break; @@ -1924,15 +2605,20 @@ void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); + locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister temp = locations->GetTemp(0).AsRegister<GpuRegister>(); + Location temp_loc = locations->GetTemp(0); + GpuRegister temp = temp_loc.AsRegister<GpuRegister>(); + const size_t num_temps = NumberOfCheckCastTemps(type_check_kind); + DCHECK_LE(num_temps, 2u); + Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation(); const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -1969,8 +2655,12 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kExactCheck: case TypeCheckKind::kArrayCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Jump to slow path for throwing the exception or doing a // more involved array check. __ Bnec(temp, cls, slow_path->GetEntryLabel()); @@ -1979,15 +2669,22 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. Mips64Label loop; __ Bind(&loop); // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadUnsignedWord, temp, temp, super_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class reference currently in `temp` is null, jump to the slow path to throw the // exception. __ Beqzc(temp, slow_path->GetEntryLabel()); @@ -1998,15 +2695,22 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Walk over the class hierarchy to find a match. 
Mips64Label loop; __ Bind(&loop); __ Beqc(temp, cls, &done); // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadUnsignedWord, temp, temp, super_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the class reference currently in `temp` is null, jump to the slow path to throw the // exception. Otherwise, jump to the beginning of the loop. __ Bnezc(temp, &loop); @@ -2016,14 +2720,21 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kArrayObjectCheck: { // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Do an exact check. __ Beqc(temp, cls, &done); // Otherwise, we need to check that the object's class is a non-primitive array. // /* HeapReference<Class> */ temp = temp->component_type_ - __ LoadFromOffset(kLoadUnsignedWord, temp, temp, component_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + component_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // If the component type is null, jump to the slow path to throw the exception. __ Beqzc(temp, slow_path->GetEntryLabel()); // Otherwise, the object is indeed an array, further check that this component @@ -2050,11 +2761,19 @@ void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { // Avoid read barriers to improve performance of the fast path. We can not get false // positives by doing this. // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, temp, obj, class_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // /* HeapReference<Class> */ temp = temp->iftable_ - __ LoadFromOffset(kLoadUnsignedWord, temp, temp, iftable_offset); - __ MaybeUnpoisonHeapReference(temp); + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + temp_loc, + iftable_offset, + maybe_temp2_loc, + kWithoutReadBarrier); // Iftable is never null. __ Lw(TMP, temp, array_length_offset); // Loop through the iftable and check if any class matches. @@ -3270,14 +3989,31 @@ void CodeGeneratorMIPS64::GenerateNop() { } void LocationsBuilderMIPS64::HandleFieldGet(HInstruction* instruction, - const FieldInfo& field_info ATTRIBUTE_UNUSED) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + const FieldInfo& field_info) { + Primitive::Type field_type = field_info.GetFieldType(); + bool object_field_get_with_read_barrier = + kEmitCompilerReadBarrier && (field_type == Primitive::kPrimNot); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( + instruction, + object_field_get_with_read_barrier + ? 
LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (Primitive::IsFloatingPointType(instruction->GetType())) { locations->SetOut(Location::RequiresFpuRegister()); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object field get with + // read barriers enabled: we do not want the move to overwrite the + // object's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_field_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); } } @@ -3285,8 +4021,11 @@ void InstructionCodeGeneratorMIPS64::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) { Primitive::Type type = field_info.GetFieldType(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); + Location dst_loc = locations->Out(); LoadOperandType load_type = kLoadUnsignedByte; + bool is_volatile = field_info.IsVolatile(); uint32_t offset = field_info.GetFieldOffset().Uint32Value(); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -3319,19 +4058,46 @@ void InstructionCodeGeneratorMIPS64::HandleFieldGet(HInstruction* instruction, UNREACHABLE(); } if (!Primitive::IsFloatingPointType(type)) { - DCHECK(locations->Out().IsRegister()); - GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); - __ LoadFromOffset(load_type, dst, obj, offset, null_checker); + DCHECK(dst_loc.IsRegister()); + GpuRegister dst = dst_loc.AsRegister<GpuRegister>(); + if (type == Primitive::kPrimNot) { + // /* HeapReference<Object> */ dst = *(obj + offset) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp_loc = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier call. + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + dst_loc, + obj, + offset, + temp_loc, + /* needs_null_check */ true); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + } else { + __ LoadFromOffset(kLoadUnsignedWord, dst, obj, offset, null_checker); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, dst_loc, dst_loc, obj_loc, offset); + } + } else { + __ LoadFromOffset(load_type, dst, obj, offset, null_checker); + } } else { - DCHECK(locations->Out().IsFpuRegister()); - FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); + DCHECK(dst_loc.IsFpuRegister()); + FpuRegister dst = dst_loc.AsFpuRegister<FpuRegister>(); __ LoadFpuFromOffset(load_type, dst, obj, offset, null_checker); } - // TODO: memory barrier? 
- if (type == Primitive::kPrimNot) { - GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); - __ MaybeUnpoisonHeapReference(dst); + // Memory barriers, in the case of references, are handled above, + // where the reference itself is loaded. + if (is_volatile && (type != Primitive::kPrimNot)) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); } } @@ -3355,6 +4121,7 @@ void InstructionCodeGeneratorMIPS64::HandleFieldSet(HInstruction* instruction, GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); Location value_location = locations->InAt(1); StoreOperandType store_type = kStoreByte; + bool is_volatile = field_info.IsVolatile(); uint32_t offset = field_info.GetFieldOffset().Uint32Value(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(type, instruction->InputAt(1)); auto null_checker = GetImplicitNullChecker(instruction, codegen_); @@ -3382,6 +4149,10 @@ void InstructionCodeGeneratorMIPS64::HandleFieldSet(HInstruction* instruction, UNREACHABLE(); } + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + } + if (value_location.IsConstant()) { int64_t value = CodeGenerator::GetInt64ValueOf(value_location.GetConstant()); __ StoreConstToOffset(store_type, value, obj, offset, TMP, null_checker); @@ -3405,12 +4176,16 @@ void InstructionCodeGeneratorMIPS64::HandleFieldSet(HInstruction* instruction, __ StoreFpuToOffset(store_type, src, obj, offset, null_checker); } } - // TODO: memory barriers? + if (needs_write_barrier) { DCHECK(value_location.IsRegister()); GpuRegister src = value_location.AsRegister<GpuRegister>(); codegen_->MarkGCCard(obj, src, value_can_be_null); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } } void LocationsBuilderMIPS64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { @@ -3429,14 +4204,134 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceFieldSet(HInstanceFieldSet* in HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull()); } +void InstructionCodeGeneratorMIPS64::GenerateReferenceLoadOneRegister( + HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + GpuRegister out_reg = out.AsRegister<GpuRegister>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + out_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `maybe_temp` before it is + // overwritten by the reference load below, as we will need it + // for the read barrier. + __ Move(maybe_temp.AsRegister<GpuRegister>(), out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, out_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset); + } + } else { + // Plain load with no read barrier.
+ // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, out_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS64::GenerateReferenceLoadTwoRegisters( + HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + GpuRegister out_reg = out.AsRegister<GpuRegister>(); + GpuRegister obj_reg = obj.AsRegister<GpuRegister>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, obj_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, obj_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad( - HInstruction* instruction ATTRIBUTE_UNUSED, + HInstruction* instruction, Location root, GpuRegister obj, - uint32_t offset) { + uint32_t offset, + ReadBarrierOption read_barrier_option) { GpuRegister root_reg = root.AsRegister<GpuRegister>(); - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; + if (read_barrier_option == kWithReadBarrier) { + DCHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barriers are used: + // + // root = obj.field; + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path marking the GC root `root`. + Location temp = Location::RegisterLocation(T9); + SlowPathCodeMIPS64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64( + instruction, + root, + /*entrypoint*/ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking; this saves one load compared to + // checking GetIsGcMarking.
+ __ Bnezc(temp.AsRegister<GpuRegister>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ Daddiu64(root_reg, obj, static_cast<int32_t>(offset)); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } } else { // Plain GC root load with no read barrier. // /* GcRoot<mirror::Object> */ root = *(obj + offset) @@ -3446,6 +4341,219 @@ void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad( } } +void CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + ScaleFactor no_scale_factor = TIMES_1; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + offset, + no_index, + no_scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + ScaleFactor scale_factor = TIMES_4; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + data_offset, + index, + scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
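An aside on the gray-value test this fast path ends with (the Sll/Bltzc pair further below): the lock word's 1-bit read barrier state is shifted into the sign bit so a single branch-on-negative can route gray objects to the mark slow path. A minimal standalone C++ sketch of that test; the shift constant here is an assumed illustrative value standing in for LockWord::kReadBarrierStateShift, which the patch only uses symbolically.

#include <cstdint>
#include <cstdio>

// Assumed for illustration only: a 32-bit lock word with a 1-bit read
// barrier state at this shift (mirrors LockWord::kReadBarrierStateShift).
constexpr uint32_t kReadBarrierStateShift = 28;

// Mirrors the emitted Sll/Bltzc pair: shift the state bit into bit 31,
// then "is negative" stands in for the branch to the mark slow path.
bool IsGray(uint32_t monitor) {
  return static_cast<int32_t>(monitor << (31 - kReadBarrierStateShift)) < 0;
}

int main() {
  // Prints "0 1": a white lock word is not gray, one with the state bit set is.
  std::printf("%d %d\n", IsGray(0u), IsGray(1u << kReadBarrierStateShift));
  return 0;
}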
+ + GpuRegister ref_reg = ref.AsRegister<GpuRegister>(); + GpuRegister temp_reg = temp.AsRegister<GpuRegister>(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + __ Sync(0); // Barrier to prevent load-load reordering. + + // The actual reference load. + if (index.IsValid()) { + // Load types involving an "index": ArrayGet, + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics. + // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + if (index.IsConstant()) { + size_t computed_offset = + (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, obj, computed_offset); + } else { + GpuRegister index_reg = index.AsRegister<GpuRegister>(); + __ Dsll(TMP, index_reg, scale_factor); + __ Daddu(TMP, obj, TMP); + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, TMP, offset); + } + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, obj, offset); + } + + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); + + // Slow path marking the object `ref` when it is gray. + SlowPathCodeMIPS64* slow_path; + if (always_update_field) { + // ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 only supports addresses + // of the form `obj + field_offset`, where `obj` is a register and + // `field_offset` is a register. Thus `offset` is expected to be zero and + // `scale_factor` to be TIMES_1 in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + slow_path = new (GetGraph()->GetArena()) + ReadBarrierMarkAndUpdateFieldSlowPathMIPS64(instruction, + ref, + obj, + /* field_offset */ index, + temp_reg); + } else { + slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64(instruction, ref); + } + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit into the sign bit (31) and + // performing a branch on less than zero. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + static_assert(LockWord::kReadBarrierStateSize == 1, "Expecting 1-bit read barrier state size"); + __ Sll(temp_reg, temp_reg, 31 - LockWord::kReadBarrierStateShift); + __ Bltzc(temp_reg, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS64::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the reference load. + // + // If heap poisoning is enabled, the unpoisoning of the loaded + // reference will be carried out by the runtime within the slow + // path. + // + // Note that `ref` currently does not get unpoisoned (when heap + // poisoning is enabled), which is alright as the `ref` argument is + // not used by the artReadBarrierSlow entry point.
+ // + // TODO: Unpoison `ref` when it is used by artReadBarrierSlow. + SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) + ReadBarrierForHeapReferenceSlowPathMIPS64(instruction, out, ref, obj, offset, index); + AddSlowPath(slow_path); + + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS64::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier). + DCHECK(!kUseBakerReadBarrier); + // If heap poisoning is enabled, unpoisoning will be taken care of + // by the runtime within the slow path. + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); + } else if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(out.AsRegister<GpuRegister>()); + } +} + +void CodeGeneratorMIPS64::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the GC root load. + // + // Note that GC roots are not affected by heap poisoning, so we do + // not need to do anything special for this here. + SlowPathCodeMIPS64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathMIPS64(instruction, out, root); + AddSlowPath(slow_path); + + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); @@ -3454,7 +4562,8 @@ void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: case TypeCheckKind::kClassHierarchyCheck: case TypeCheckKind::kArrayObjectCheck: - call_kind = LocationSummary::kNoCall; + call_kind = + kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; break; case TypeCheckKind::kArrayCheck: case TypeCheckKind::kUnresolvedCheck: @@ -3469,14 +4578,20 @@ void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { // The output does overlap inputs. // Note that TypeCheckSlowPathMIPS64 uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + Location out_loc = locations->Out(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); + const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind); + DCHECK_LE(num_temps, 1u); + Location maybe_temp_loc = (num_temps >= 1) ? 
locations->GetTemp(0) : Location::NoLocation(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -3494,8 +4609,12 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { switch (type_check_kind) { case TypeCheckKind::kExactCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Classes must be equal for the instanceof to succeed. __ Xor(out, out, cls); __ Sltiu(out, out, 1); @@ -3504,15 +4623,22 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. Mips64Label loop; __ Bind(&loop); // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadUnsignedWord, out, out, super_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If `out` is null, we use it for the result, and jump to `done`. __ Beqzc(out, &done); __ Bnec(out, cls, &loop); @@ -3522,15 +4648,22 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Walk over the class hierarchy to find a match. Mips64Label loop, success; __ Bind(&loop); __ Beqc(out, cls, &success); // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadUnsignedWord, out, out, super_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); __ Bnezc(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. __ Bc(&done); @@ -3541,15 +4674,22 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: { // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // Do an exact check. Mips64Label success; __ Beqc(out, cls, &success); // Otherwise, we need to check that the object's class is a non-primitive array. 
// /* HeapReference<Class> */ out = out->component_type_ - __ LoadFromOffset(kLoadUnsignedWord, out, out, component_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadOneRegister(instruction, + out_loc, + component_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); // If `out` is null, we use it for the result, and jump to `done`. __ Beqzc(out, &done); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); @@ -3564,8 +4704,12 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayCheck: { // No read barrier since the slow path will retry upon failure. // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadUnsignedWord, out, obj, class_offset); - __ MaybeUnpoisonHeapReference(out); + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kWithoutReadBarrier); DCHECK(locations->OnlyCallsOnSlowPath()); slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction, /* is_fatal */ false); @@ -3735,9 +4879,6 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS64* codeg HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } bool fallback_load = false; switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimeAddress: @@ -3765,9 +4906,6 @@ HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } bool fallback_load = false; switch (desired_class_load_kind) { case HLoadClass::LoadKind::kInvalid: @@ -3960,7 +5098,8 @@ void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) { } DCHECK(!cls->NeedsAccessCheck()); - LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage(); + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); @@ -3989,6 +5128,9 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S current_method_reg = locations->InAt(0).AsRegister<GpuRegister>(); } + const ReadBarrierOption read_barrier_option = cls->IsInBootImage() + ? 
kWithoutReadBarrier + : kCompilerReadBarrierOption; bool generate_null_check = false; switch (load_kind) { case HLoadClass::LoadKind::kReferrersClass: @@ -3998,10 +5140,12 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S GenerateGcRootFieldLoad(cls, out_loc, current_method_reg, - ArtMethod::DeclaringClassOffset().Int32Value()); + ArtMethod::DeclaringClassOffset().Int32Value(), + read_barrier_option); break; case HLoadClass::LoadKind::kBootImageLinkTimeAddress: DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); __ LoadLiteral(out, kLoadUnsignedWord, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), @@ -4009,6 +5153,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, AT); @@ -4016,7 +5161,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S break; } case HLoadClass::LoadKind::kBootImageAddress: { - DCHECK(!kEmitCompilerReadBarrier); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); @@ -4029,7 +5174,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex()); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out); - GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); generate_null_check = true; break; } @@ -4039,7 +5184,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S codegen_->DeduplicateJitClassLiteral(cls->GetDexFile(), cls->GetTypeIndex(), cls->GetClass())); - GenerateGcRootFieldLoad(cls, out_loc, out, 0); + GenerateGcRootFieldLoad(cls, out_loc, out, 0, read_barrier_option); break; case HLoadClass::LoadKind::kDexCacheViaMethod: case HLoadClass::LoadKind::kInvalid: @@ -4136,7 +5281,11 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out); - GenerateGcRootFieldLoad(load, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); codegen_->AddSlowPath(slow_path); __ Beqzc(out, slow_path->GetEntryLabel()); @@ -4149,7 +5298,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA codegen_->DeduplicateJitStringLiteral(load->GetDexFile(), load->GetStringIndex(), load->GetString())); - GenerateGcRootFieldLoad(load, out_loc, out, 0); + GenerateGcRootFieldLoad(load, out_loc, out, 0, kCompilerReadBarrierOption); return; default: break; diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 
6040dc9492..fd1a174608 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -237,6 +237,38 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { const FieldInfo& field_info, bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a read barrier and + // shall be a register in that case; it may be an invalid location + // otherwise. + void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a Baker's (fast + // path) read barrier and shall be a register in that case; it may + // be an invalid location otherwise. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a GC root reference load: // // root <- *(obj + offset) @@ -245,7 +277,9 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, GpuRegister obj, - uint32_t offset); + uint32_t offset, + ReadBarrierOption read_barrier_option); + void GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, Mips64Label* true_target, @@ -316,6 +350,91 @@ class CodeGeneratorMIPS64 : public CodeGenerator { void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location temp, + bool needs_null_check); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check); + + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. + // + // Load the object reference located at the address + // `obj + offset + (index << scale_factor)`, held by object `obj`, into + // `ref`, and mark it if needed. + // + // If `always_update_field` is true, the value of the reference is + // atomically updated in the holder (`obj`). + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field = false); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. 
+ // + // A read barrier for an object reference read from the heap is + // implemented as a call to the artReadBarrierSlow runtime entry + // point, which is passed the values in locations `ref`, `obj`, and + // `offset`: + // + // mirror::Object* artReadBarrierSlow(mirror::Object* ref, + // mirror::Object* obj, + // uint32_t offset); + // + // The `out` location contains the value returned by + // artReadBarrierSlow. + // + // When `index` is provided (i.e. for array accesses), the offset + // value passed to artReadBarrierSlow is adjusted to take `index` + // into account. + void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. + // + // A read barrier for an object reference GC root is implemented as + // a call to the artReadBarrierForRootSlow runtime entry point, + // which is passed the value in location `root`: + // + // mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root); + // + // The `out` location contains the value returned by + // artReadBarrierForRootSlow. + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); + void MarkGCCard(GpuRegister object, GpuRegister value, bool value_can_be_null); // Register allocation. @@ -366,6 +485,14 @@ class CodeGeneratorMIPS64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path = nullptr) OVERRIDE; + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + + void GenerateInvokeRuntime(int32_t entry_point_offset); + ParallelMoveResolver* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { return false; } diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc new file mode 100644 index 0000000000..ba2b2cb2c9 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm.cc @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_arm.h" + +namespace art { +namespace arm { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderARM::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
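One point the shift stubs below leave implicit: HVecShr and HVecUShr stay distinct because arithmetic and logical right shifts disagree on negative lanes (the ARM64 backend further down maps them to Sshr and Ushr respectively). A minimal scalar sketch of that per-lane difference, using hypothetical helper names rather than anything from the patch:

#include <cstdint>
#include <cstdio>

// One-lane model: HVecShr maps to an arithmetic shift (sign bits shift in),
// HVecUShr to a logical shift (zero bits shift in). Helper names are made up.
int8_t ShrLane(int8_t lane, int amount) {   // Sshr-style
  return static_cast<int8_t>(lane >> amount);
}
int8_t UShrLane(int8_t lane, int amount) {  // Ushr-style
  return static_cast<int8_t>(static_cast<uint8_t>(lane) >> amount);
}

int main() {
  // Prints "-4 60": the same bit pattern 0xF0 shifted right by 2 both ways.
  std::printf("%d %d\n", ShrLane(-16, 2), UShrLane(-16, 2));
  return 0;
}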
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc new file mode 100644 index 0000000000..96d00210b8 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -0,0 +1,641 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "code_generator_arm64.h" +#include "mirror/array-inl.h" + +using namespace vixl::aarch64; // NOLINT(build/namespaces) + +namespace art { +namespace arm64 { + +using helpers::DRegisterFrom; +using helpers::HeapOperand; +using helpers::InputRegisterAt; +using helpers::Int64ConstantFrom; +using helpers::XRegisterFrom; + +#define __ GetVIXLAssembler()-> + +void LocationsBuilderARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Dup(dst.V8B(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Dup(dst.V4H(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Dup(dst.V2S(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Dup(dst.V2S(), DRegisterFrom(locations->InAt(0)).V2S(), 0); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), + instruction->IsVecNot() ? 
Location::kOutputOverlap + : Location::kNoOutputOverlap); + break; + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Scvtf(dst.V2S(), src.V2S()); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderARM64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Neg(dst.V8B(), src.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Neg(dst.V4H(), src.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Neg(dst.V2S(), src.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fneg(dst.V2S(), src.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: // special case boolean-not + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Movi(dst.V8B(), 1); + __ Eor(dst.V8B(), dst.V8B(), src.V8B()); + break; + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + __ Not(dst.V8B(), src.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. 
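An aside on the boolean case of VisitVecNot above: NEON has no boolean negation, so the Movi/Eor pair flips each 0/1 lane by XOR-ing it with 1. A one-lane scalar sketch of the same identity, with a hypothetical helper name:

#include <cstdint>
#include <cstdio>

// One-lane model of the boolean VecNot: the Movi materializes the vector of
// 1s, the Eor applies the XOR that flips 0 <-> 1.
uint8_t BoolNotLane(uint8_t lane) {
  return lane ^ 1u;
}

int main() {
  std::printf("%u %u\n", BoolNotLane(0), BoolNotLane(1));  // prints "1 0"
  return 0;
}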
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Add(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Add(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Add(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fadd(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sub(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sub(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sub(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fsub(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mul(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, 
instruction->GetVectorLength()); + __ Mul(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mul(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fmul(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fdiv(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ And(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void LocationsBuilderARM64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ Orr(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case 
Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ Eor(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. +static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Shl(dst.V8B(), lhs.V8B(), value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Shl(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Shl(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sshr(dst.V8B(), lhs.V8B(), value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sshr(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sshr(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ushr(dst.V8B(), lhs.V8B(), value); + 
break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ushr(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ushr(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters( + HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Register base = InputRegisterAt(instruction, 0); + Location index = locations->InAt(1); + *reg_loc = is_load ? locations->Out() : locations->InAt(2); + + Primitive::Type packed_type = instruction->GetPackedType(); + uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t shift = Primitive::ComponentSizeShift(packed_type); + + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register temp = temps.AcquireSameSizeAs(base); + if (index.IsConstant()) { + offset += Int64ConstantFrom(index) << shift; + __ Add(temp, base, offset); + } else { + if (instruction->InputAt(0)->IsIntermediateAddress()) { + temp = base; + } else { + __ Add(temp, base, offset); + } + __ Add(temp.X(), temp.X(), Operand(XRegisterFrom(index), LSL, shift)); + } + return HeapOperand(temp); +} + +void LocationsBuilderARM64::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true); + FPRegister reg = DRegisterFrom(reg_loc); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ld1(reg.V8B(), mem); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ld1(reg.V4H(), mem); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ld1(reg.V2S(), mem); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + MemOperand mem =
CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false); + FPRegister reg = DRegisterFrom(reg_loc); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ St1(reg.V8B(), mem); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ St1(reg.V4H(), mem); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ St1(reg.V2S(), mem); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace arm64 +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc new file mode 100644 index 0000000000..171198902d --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_arm_vixl.h" + +namespace art { +namespace arm { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ reinterpret_cast<ArmVIXLAssembler*>(GetAssembler())->GetVIXLAssembler()-> // NOLINT + +void LocationsBuilderARMVIXL::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations.
+static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. +static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << 
instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. +static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc new file mode 100644 index 0000000000..6f5fe0d2a4 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "code_generator_mips.h" + +namespace art { +namespace mips { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderMIPS::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace mips +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc new file mode 100644 index 0000000000..2ee7ac91cf --- /dev/null +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_mips64.h" + +namespace art { +namespace mips64 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace mips64 +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc new file mode 100644 index 0000000000..4f3988ee2e --- /dev/null +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -0,0 +1,767 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_x86.h" +#include "mirror/array-inl.h" + +namespace art { +namespace x86 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to load the register pair. 
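+// (On 32-bit x86 a long occupies a core register pair; each 32-bit half is moved into an XMM register with movd and the halves are then interleaved, so replication needs the extra XMM temporary requested here.)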
+ locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ punpcklbw(reg, reg); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); + __ punpckldq(reg, tmp); + __ punpcklqdq(reg, reg); + break; + } + case Primitive::kPrimFloat: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ shufps(reg, reg, Immediate(0)); + break; + case Primitive::kPrimDouble: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ shufpd(reg, reg, Immediate(0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. 
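+// (The summary built here tells the register allocator that a unary operation keeps both its packed input and its packed result in XMM registers.)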
+static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ cvtdq2ps(dst, src); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderX86::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, dst); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, dst); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Boolean-not requires a temporary to construct the 16 x one. 
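+// (Packed booleans hold 0 or 1 in each byte lane, so boolean-not is computed as x ^ 1 per lane; the vector of sixteen 1s is formed as 0 - (all ones) via pxor/pcmpeqb/psubb before the final pxor with the source.)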
+ if (instruction->GetPackedType() == Primitive::kPrimBoolean) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: { // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ pxor(dst, dst); + __ pcmpeqb(tmp, tmp); // all ones + __ psubb(dst, tmp); // 16 x one + __ pxor(dst, src); + break; + } + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pcmpeqb(dst, dst); // all ones + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. +static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ paddb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ paddw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ paddd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ paddq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ addps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ addpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) { + 
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pmullw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pmulld(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ mulps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ mulpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ divps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ divpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case 
Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pand(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAndNot(HVecAndNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pandn(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andnps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andnpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ por(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ orps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ orpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + 
DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. +static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psllw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pslld(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psllq(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psraw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrad(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case 
Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psrlw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrld(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psrlq(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Location base = locations->InAt(0); + Location index = locations->InAt(1); + *reg_loc = is_load ? locations->Out() : locations->InAt(2); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); + ScaleFactor scale = TIMES_1; + switch (size) { + case 2: scale = TIMES_2; break; + case 4: scale = TIMES_4; break; + case 8: scale = TIMES_8; break; + default: break; + } + return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset); +} + +void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(reg, address) : __ movups(reg, address); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? __ movapd(reg, address) : __ movupd(reg, address); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(address, reg) : __ movdqu(address, reg); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(address, reg) : __ movups(address, reg); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? __ movapd(address, reg) : __ movupd(address, reg); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace x86 +} // namespace art
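The x86 vector load and store above choose between the aligned and unaligned packed-move encodings from the instruction's 16-byte alignment guarantee. A rough standalone model of that mnemonic selection (illustrative only; the strings stand in for the assembler calls emitted through the __ macro):

  #include <cstdio>
  #include <string>

  enum class PackedKind { kIntegral, kFloat, kDouble };

  // Mirrors the ternaries in VisitVecLoad/VisitVecStore: the aligned forms
  // (movdqa/movaps/movapd) fault on an address that is not 16-byte aligned
  // but can be cheaper, while the unaligned forms are always safe.
  std::string PackedMoveMnemonic(PackedKind kind, bool aligned16) {
    switch (kind) {
      case PackedKind::kFloat:  return aligned16 ? "movaps" : "movups";
      case PackedKind::kDouble: return aligned16 ? "movapd" : "movupd";
      default:                  return aligned16 ? "movdqa" : "movdqu";
    }
  }

  int main() {
    std::printf("%s\n", PackedMoveMnemonic(PackedKind::kFloat, false).c_str());  // movups
    return 0;
  }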
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc new file mode 100644 index 0000000000..b1c1494f6b --- /dev/null +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -0,0 +1,760 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_x86_64.h" +#include "mirror/array-inl.h" + +namespace art { +namespace x86_64 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
+#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklbw(reg, reg); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + __ punpcklqdq(reg, reg); + break; + case Primitive::kPrimFloat: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ shufps(reg, reg, Immediate(0)); + break; + case Primitive::kPrimDouble: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ shufpd(reg, reg, Immediate(0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. 
+static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ cvtdq2ps(dst, src); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderX86_64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, dst); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, dst); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Boolean-not requires a temporary to construct the 16 x one. 
+ if (instruction->GetPackedType() == Primitive::kPrimBoolean) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: { // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ pxor(dst, dst); + __ pcmpeqb(tmp, tmp); // all ones + __ psubb(dst, tmp); // 16 x one + __ pxor(dst, src); + break; + } + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pcmpeqb(dst, dst); // all ones + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. +static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ paddb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ paddw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ paddd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ paddq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ addps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ addpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) { + 
CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pmullw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pmulld(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ mulps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ mulpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ divps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ divpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + 
case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pand(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAndNot(HVecAndNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pandn(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andnps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andnpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ por(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ orps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ orpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + 
DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. +static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psllw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pslld(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psllq(dst, Immediate(static_cast<int8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psraw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrad(dst, Immediate(static_cast<int8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case 
Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psrlw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrld(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psrlq(dst, Immediate(static_cast<int8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Location base = locations->InAt(0); + Location index = locations->InAt(1); + *reg_loc = is_load ? locations->Out() : locations->InAt(2); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); + ScaleFactor scale = TIMES_1; + switch (size) { + case 2: scale = TIMES_2; break; + case 4: scale = TIMES_4; break; + case 8: scale = TIMES_8; break; + default: break; + } + return CodeGeneratorX86_64::ArrayAddress(base.AsRegister<CpuRegister>(), index, scale, offset); +} + +void LocationsBuilderX86_64::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(reg, address) : __ movups(reg, address); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ?
__ movapd(reg, address) : __ movupd(reg, address); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorX86_64::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(address, reg) : __ movdqu(address, reg); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(address, reg) : __ movups(address, reg); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? __ movapd(address, reg) : __ movupd(address, reg); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace x86_64 +} // namespace art diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 4db4796985..80776e8b78 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -723,7 +723,7 @@ class ReadBarrierForHeapReferenceSlowPathX86 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 2ffc398287..49f099f6a9 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -744,7 +744,7 @@ class ReadBarrierForHeapReferenceSlowPathX86_64 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -3660,7 +3660,7 @@ void InstructionCodeGeneratorX86_64::GenerateDivRemWithAnyConstant(HBinaryOperat void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); - DCHECK(type == Primitive::kPrimInt || Primitive::kPrimLong); + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); bool is_div = instruction->IsDiv(); LocationSummary* locations = instruction->GetLocations(); diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index cd954043f5..31cd204c9f 100644 ---
a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -74,7 +74,6 @@ class CodegenTargetConfig { } private: - CodegenTargetConfig() {} InstructionSet isa_; CreateCodegenFn create_codegen_; }; diff --git a/compiler/optimizing/common_arm.h b/compiler/optimizing/common_arm.h index e184745520..01304ac35b 100644 --- a/compiler/optimizing/common_arm.h +++ b/compiler/optimizing/common_arm.h @@ -66,6 +66,11 @@ inline vixl::aarch32::SRegister LowSRegisterFrom(Location location) { return vixl::aarch32::SRegister(location.AsFpuRegisterPairLow<vixl::aarch32::SRegister>()); } +inline vixl::aarch32::SRegister HighSRegisterFrom(Location location) { + DCHECK(location.IsFpuRegisterPair()) << location; + return vixl::aarch32::SRegister(location.AsFpuRegisterPairHigh<vixl::aarch32::SRegister>()); +} + inline vixl::aarch32::Register RegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; return vixl::aarch32::Register(location.reg()); diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 0dfae11465..cc3c143b15 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -505,6 +505,10 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("kind") << (try_boundary->IsEntry() ? "entry" : "exit"); } + void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE { + StartAttributeStream("kind") << deoptimize->GetKind(); + } + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetOpKind(); diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc index 82ee93d5c2..9516ccb385 100644 --- a/compiler/optimizing/induction_var_analysis_test.cc +++ b/compiler/optimizing/induction_var_analysis_test.cc @@ -29,7 +29,21 @@ namespace art { */ class InductionVarAnalysisTest : public CommonCompilerTest { public: - InductionVarAnalysisTest() : pool_(), allocator_(&pool_) { + InductionVarAnalysisTest() + : pool_(), + allocator_(&pool_), + iva_(nullptr), + entry_(nullptr), + return_(nullptr), + exit_(nullptr), + parameter_(nullptr), + constant0_(nullptr), + constant1_(nullptr), + constant2_(nullptr), + constant7_(nullptr), + constant100_(nullptr), + constantm1_(nullptr), + float_constant0_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index f7331452c6..79cd7048a5 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -63,7 +63,7 @@ static constexpr size_t kMaximumNumberOfCumulatedDexRegisters = 64; static constexpr size_t kMaximumNumberOfRecursiveCalls = 4; // Controls the use of inline caches in AOT mode. -static constexpr bool kUseAOTInlineCaches = false; +static constexpr bool kUseAOTInlineCaches = true; // We check for line numbers to make sure the DepthString implementation // aligns the output nicely. 
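
The inliner hunks that follow add ResolveMethodFromInlineCache, which guards the newly enabled AOT inline caches against untrusted profile data. A standalone sketch of that guard, using hypothetical Class/Method stand-ins for mirror::Class and ArtMethod:

    #include <cassert>

    // A cached receiver class taken from a profile must be a subtype of the
    // declaring class of the method named in the bytecode; otherwise the cache
    // entry is bogus and must not be used to devirtualize.
    struct Class {
      const Class* super = nullptr;
      bool IsAssignableFrom(const Class* k) const {
        for (const Class* c = k; c != nullptr; c = c->super) {
          if (c == this) return true;
        }
        return false;
      }
    };

    struct Method { const Class* declaring_class = nullptr; };

    const Method* ResolveFromCache(const Class* cached_klass, const Method* resolved) {
      if (!resolved->declaring_class->IsAssignableFrom(cached_klass)) {
        return nullptr;  // corrupt or stale profile entry: caller bails out
      }
      return resolved;   // the real code then does the virtual/interface lookup
    }

    int main() {
      Class base, derived, unrelated;
      derived.super = &base;
      Method m{&base};
      assert(ResolveFromCache(&derived, &m) == &m);
      assert(ResolveFromCache(&unrelated, &m) == nullptr);
      return 0;
    }
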
@@ -672,6 +672,32 @@ HInstanceFieldGet* HInliner::BuildGetReceiverClass(ClassLinker* class_linker, return result; } +static ArtMethod* ResolveMethodFromInlineCache(Handle<mirror::Class> klass, + ArtMethod* resolved_method, + HInstruction* invoke_instruction, + PointerSize pointer_size) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (Runtime::Current()->IsAotCompiler()) { + // We can get unrelated types when working with profiles (corruption, + // system updates, or anyone being able to write to it). So first check if the class + // actually implements the declaring class of the method that is being + // called in bytecode. + // Note: the lookup methods used below require assignable types. + if (!resolved_method->GetDeclaringClass()->IsAssignableFrom(klass.Get())) { + return nullptr; + } + } + + if (invoke_instruction->IsInvokeInterface()) { + resolved_method = klass->FindVirtualMethodForInterface(resolved_method, pointer_size); + } else { + DCHECK(invoke_instruction->IsInvokeVirtual()); + resolved_method = klass->FindVirtualMethodForVirtual(resolved_method, pointer_size); + } + DCHECK(resolved_method != nullptr); + return resolved_method; +} + bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, ArtMethod* resolved_method, Handle<mirror::ObjectArray<mirror::Class>> classes) { @@ -690,20 +716,20 @@ bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); PointerSize pointer_size = class_linker->GetImagePointerSize(); - if (invoke_instruction->IsInvokeInterface()) { - resolved_method = GetMonomorphicType(classes)->FindVirtualMethodForInterface( - resolved_method, pointer_size); - } else { - DCHECK(invoke_instruction->IsInvokeVirtual()); - resolved_method = GetMonomorphicType(classes)->FindVirtualMethodForVirtual( - resolved_method, pointer_size); - } + Handle<mirror::Class> monomorphic_type = handles_->NewHandle(GetMonomorphicType(classes)); + resolved_method = ResolveMethodFromInlineCache( + monomorphic_type, resolved_method, invoke_instruction, pointer_size); + LOG_NOTE() << "Try inline monomorphic call to " << resolved_method->PrettyMethod(); - DCHECK(resolved_method != nullptr); + if (resolved_method == nullptr) { + // Bogus AOT profile, bail.
+ DCHECK(Runtime::Current()->IsAotCompiler()); + return false; + } + HInstruction* receiver = invoke_instruction->InputAt(0); HInstruction* cursor = invoke_instruction->GetPrevious(); HBasicBlock* bb_cursor = invoke_instruction->GetBlock(); - Handle<mirror::Class> monomorphic_type = handles_->NewHandle(GetMonomorphicType(classes)); if (!TryInlineAndReplace(invoke_instruction, resolved_method, ReferenceTypeInfo::Create(monomorphic_type, /* is_exact */ true), @@ -742,7 +768,8 @@ void HInliner::AddCHAGuard(HInstruction* invoke_instruction, HShouldDeoptimizeFlag(graph_->GetArena(), dex_pc); HInstruction* compare = new (graph_->GetArena()) HNotEqual( deopt_flag, graph_->GetIntConstant(0, dex_pc)); - HInstruction* deopt = new (graph_->GetArena()) HDeoptimize(compare, dex_pc); + HInstruction* deopt = new (graph_->GetArena()) HDeoptimize( + graph_->GetArena(), compare, HDeoptimize::Kind::kInline, dex_pc); if (cursor != nullptr) { bb_cursor->InsertInstructionAfter(deopt_flag, cursor); @@ -806,9 +833,16 @@ HInstruction* HInliner::AddTypeGuard(HInstruction* receiver, bb_cursor->InsertInstructionAfter(compare, load_class); if (with_deoptimization) { HDeoptimize* deoptimize = new (graph_->GetArena()) HDeoptimize( - compare, invoke_instruction->GetDexPc()); + graph_->GetArena(), + compare, + receiver, + HDeoptimize::Kind::kInline, + invoke_instruction->GetDexPc()); bb_cursor->InsertInstructionAfter(deoptimize, compare); deoptimize->CopyEnvironmentFrom(invoke_instruction->GetEnvironment()); + DCHECK_EQ(invoke_instruction->InputAt(0), receiver); + receiver->ReplaceUsesDominatedBy(deoptimize, deoptimize); + deoptimize->SetReferenceTypeInfo(receiver->GetReferenceTypeInfo()); } return compare; } @@ -835,11 +869,14 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, ArtMethod* method = nullptr; Handle<mirror::Class> handle = handles_->NewHandle(classes->Get(i)); - if (invoke_instruction->IsInvokeInterface()) { - method = handle->FindVirtualMethodForInterface(resolved_method, pointer_size); - } else { - DCHECK(invoke_instruction->IsInvokeVirtual()); - method = handle->FindVirtualMethodForVirtual(resolved_method, pointer_size); + method = ResolveMethodFromInlineCache( + handle, resolved_method, invoke_instruction, pointer_size); + if (method == nullptr) { + DCHECK(Runtime::Current()->IsAotCompiler()); + // AOT profile is bogus. This loop expects to iterate over all entries, + // so just continue. + all_targets_inlined = false; + continue; } HInstruction* receiver = invoke_instruction->InputAt(0); @@ -884,7 +921,7 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } invoke_instruction->GetBlock()->RemoveInstruction(invoke_instruction); // Because the inline cache data can be populated concurrently, we force the end of the - iteration. Otherhwise, we could see a new receiver type. + iteration. Otherwise, we could see a new receiver type.
break; } else { CreateDiamondPatternForPolymorphicInline(compare, return_replacement, invoke_instruction); @@ -1083,13 +1120,19 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget( CreateDiamondPatternForPolymorphicInline(compare, return_replacement, invoke_instruction); } else { HDeoptimize* deoptimize = new (graph_->GetArena()) HDeoptimize( - compare, invoke_instruction->GetDexPc()); + graph_->GetArena(), + compare, + receiver, + HDeoptimize::Kind::kInline, + invoke_instruction->GetDexPc()); bb_cursor->InsertInstructionAfter(deoptimize, compare); deoptimize->CopyEnvironmentFrom(invoke_instruction->GetEnvironment()); if (return_replacement != nullptr) { invoke_instruction->ReplaceWith(return_replacement); } + receiver->ReplaceUsesDominatedBy(deoptimize, deoptimize); invoke_instruction->GetBlock()->RemoveInstruction(invoke_instruction); + deoptimize->SetReferenceTypeInfo(receiver->GetReferenceTypeInfo()); } // Run type propagation to get the guard typed. diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index 17421fc364..60790e5b84 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -2132,6 +2132,9 @@ void InstructionSimplifierVisitor::VisitDeoptimize(HDeoptimize* deoptimize) { if (cond->IsConstant()) { if (cond->AsIntConstant()->IsFalse()) { // Never deopt: instruction can be removed. + if (deoptimize->GuardsAnInput()) { + deoptimize->ReplaceWith(deoptimize->GuardedInput()); + } deoptimize->GetBlock()->RemoveInstruction(deoptimize); } else { // Always deopt. diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index b25bad7170..0d933eaf82 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -39,6 +39,7 @@ using helpers::Int32ConstantFrom; using helpers::LocationFrom; using helpers::LowRegisterFrom; using helpers::LowSRegisterFrom; +using helpers::HighSRegisterFrom; using helpers::OutputDRegister; using helpers::OutputSRegister; using helpers::OutputRegister; @@ -794,6 +795,58 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathRint(HInvoke* invoke) { __ Vrintn(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0)); } +void IntrinsicLocationsBuilderARMVIXL::VisitMathRoundFloat(HInvoke* invoke) { + if (features_.HasARMv8AInstructions()) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +void IntrinsicCodeGeneratorARMVIXL::VisitMathRoundFloat(HInvoke* invoke) { + DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions()); + + ArmVIXLAssembler* assembler = GetAssembler(); + vixl32::SRegister in_reg = InputSRegisterAt(invoke, 0); + vixl32::Register out_reg = OutputRegister(invoke); + vixl32::SRegister temp1 = LowSRegisterFrom(invoke->GetLocations()->GetTemp(0)); + vixl32::SRegister temp2 = HighSRegisterFrom(invoke->GetLocations()->GetTemp(0)); + vixl32::Label done; + vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done); + + // Round to nearest integer, ties away from zero. + __ Vcvta(S32, F32, temp1, in_reg); + __ Vmov(out_reg, temp1); + + // For positive, zero or NaN inputs, rounding is done. + __ Cmp(out_reg, 0); + __ B(ge, final_label, /* far_target */ false); + + // Handle input < 0 cases. 
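
Worked through in scalar form, the negative-case fixup that the next comments and instructions implement looks like this (a hypothetical model, not ART code; NaN handling is omitted since the ge-branch above already dispatched it):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Models Vcvta/Vrinta (round to nearest, ties away from zero) plus the
    // IT-block fixup: Java's Math.round rounds ties toward positive infinity.
    int32_t RoundFloat(float in) {
      float rounded = std::roundf(in);              // vrinta: ties away from zero
      int32_t out = static_cast<int32_t>(rounded);  // vcvta + vmov
      if (out >= 0) return out;                     // positive/zero: done
      if (in - rounded == 0.5f) out += 1;           // negative tie: round up
      return out;
    }

    int main() {
      assert(RoundFloat(2.5f) == 3);    // ties-away == ties-up for positives
      assert(RoundFloat(-2.5f) == -2);  // vcvta alone would give -3
      assert(RoundFloat(-2.4f) == -2);
      assert(RoundFloat(-2.6f) == -3);
      return 0;
    }
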
+ // If input is negative but not a tie, previous result (round to nearest) is valid. + // If input is a negative tie, change rounding direction to positive infinity, out_reg += 1. + __ Vrinta(F32, F32, temp1, in_reg); + __ Vmov(temp2, 0.5); + __ Vsub(F32, temp1, in_reg, temp1); + __ Vcmp(F32, temp1, temp2); + __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); + { + // Use ExactAssemblyScope here because we are using IT. + ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(), + 2 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + __ it(eq); + __ add(eq, out_reg, out_reg, 1); + } + + if (done.IsReferenced()) { + __ Bind(&done); + } +} + void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekByte(HInvoke* invoke) { CreateIntToIntLocations(arena_, invoke); } @@ -3100,7 +3153,6 @@ void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { } UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble) // Could be done by changing rounding mode, maybe? -UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundFloat) // Could be done by changing rounding mode, maybe? UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong) // High register pressure. UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyChar) UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerHighestOneBit) diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index bf85b1989e..b67793c4ed 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -1514,21 +1514,31 @@ void IntrinsicCodeGeneratorMIPS::VisitThreadCurrentThread(HInvoke* invoke) { Thread::PeerOffset<kMipsPointerSize>().Int32Value()); } -static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { - bool can_call = - invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || - invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile; +static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + Primitive::Type type) { + bool can_call = kEmitCompilerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || + invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); LocationSummary* locations = new (arena) LocationSummary(invoke, - can_call ? - LocationSummary::kCallOnSlowPath : - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + locations->SetOut(Location::RequiresRegister(), + (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap)); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in InstructionCodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); + } } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafeGet(HInvoke* invoke, Primitive::Type type, bool is_volatile, @@ -1539,49 +1549,109 @@ static void GenUnsafeGet(HInvoke* invoke, (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)) << type; MipsAssembler* assembler = codegen->GetAssembler(); + // Target register.
+ Location trg_loc = locations->Out(); // Object pointer. - Register base = locations->InAt(1).AsRegister<Register>(); + Location base_loc = locations->InAt(1); + Register base = base_loc.AsRegister<Register>(); // The "offset" argument is passed as a "long". Since this code is for // a 32-bit processor, we can only use 32-bit addresses, so we only // need the low 32-bits of offset. - Register offset_lo = invoke->GetLocations()->InAt(2).AsRegisterPairLow<Register>(); + Location offset_loc = locations->InAt(2); + Register offset_lo = offset_loc.AsRegisterPairLow<Register>(); - __ Addu(TMP, base, offset_lo); - if (is_volatile) { - __ Sync(0); + if (!(kEmitCompilerReadBarrier && kUseBakerReadBarrier && (type == Primitive::kPrimNot))) { + __ Addu(TMP, base, offset_lo); } - if (type == Primitive::kPrimLong) { - Register trg_lo = locations->Out().AsRegisterPairLow<Register>(); - Register trg_hi = locations->Out().AsRegisterPairHigh<Register>(); - if (is_R6) { - __ Lw(trg_lo, TMP, 0); - __ Lw(trg_hi, TMP, 4); - } else { - __ Lwr(trg_lo, TMP, 0); - __ Lwl(trg_lo, TMP, 3); - __ Lwr(trg_hi, TMP, 4); - __ Lwl(trg_hi, TMP, 7); + switch (type) { + case Primitive::kPrimLong: { + Register trg_lo = trg_loc.AsRegisterPairLow<Register>(); + Register trg_hi = trg_loc.AsRegisterPairHigh<Register>(); + CHECK(!is_volatile); // TODO: support atomic 8-byte volatile loads. + if (is_R6) { + __ Lw(trg_lo, TMP, 0); + __ Lw(trg_hi, TMP, 4); + } else { + __ Lwr(trg_lo, TMP, 0); + __ Lwl(trg_lo, TMP, 3); + __ Lwr(trg_hi, TMP, 4); + __ Lwl(trg_hi, TMP, 7); + } + break; } - } else { - Register trg = locations->Out().AsRegister<Register>(); - if (is_R6) { - __ Lw(trg, TMP, 0); - } else { - __ Lwr(trg, TMP, 0); - __ Lwl(trg, TMP, 3); + case Primitive::kPrimInt: { + Register trg = trg_loc.AsRegister<Register>(); + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + break; } - if (type == Primitive::kPrimNot) { - __ MaybeUnpoisonHeapReference(trg); + case Primitive::kPrimNot: { + Register trg = trg_loc.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, + trg_loc, + base, + /* offset */ 0U, + /* index */ offset_loc, + TIMES_1, + temp, + /* needs_null_check */ false); + if (is_volatile) { + __ Sync(0); + } + } else { + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + codegen->GenerateReadBarrierSlow(invoke, + trg_loc, + trg_loc, + base_loc, + /* offset */ 0U, + /* index */ offset_loc); + } + } else { + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + __ MaybeUnpoisonHeapReference(trg); + } + break; } + + default: + LOG(FATAL) << "Unexpected type " << type; + UNREACHABLE(); } } // int sun.misc.Unsafe.getInt(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGet(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGet(HInvoke* invoke) { @@ -1590,7 +1660,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGet(HInvoke* invoke) { // int sun.misc.Unsafe.getIntVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { - 
CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { @@ -1599,25 +1669,16 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { // long sun.misc.Unsafe.getLong(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetLong(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetLong(HInvoke* invoke) { GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ false, IsR6(), codegen_); } -// long sun.misc.Unsafe.getLongVolatile(Object o, long offset) -void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ true, IsR6(), codegen_); -} - // Object sun.misc.Unsafe.getObject(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetObject(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObject(HInvoke* invoke) { @@ -1626,7 +1687,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObject(HInvoke* invoke) { // Object sun.misc.Unsafe.getObjectVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { @@ -1643,6 +1704,8 @@ static void CreateIntIntIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* in locations->SetInAt(3, Location::RequiresRegister()); } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile, @@ -1681,7 +1744,7 @@ static void GenUnsafePut(LocationSummary* locations, } else { Register value_lo = locations->InAt(3).AsRegisterPairLow<Register>(); Register value_hi = locations->InAt(3).AsRegisterPairHigh<Register>(); - + CHECK(!is_volatile); // TODO: support atomic 8-byte volatile stores. 
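
The CHECK just above exists because a 64-bit access on MIPS32 is emitted as two 32-bit words, which cannot be made atomic, so the volatile long variants are routed to the unimplemented list instead. Relatedly, the pre-R6 paths in these hunks pair Lwr/Lwl (offsets 0 and 3) to read a word that may be unaligned; a rough little-endian model, hypothetical and only for intuition:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Models "Lwr trg, addr, 0; Lwl trg, addr, 3" on little-endian MIPS:
    // the pair fills one register from four possibly unaligned bytes.
    uint32_t LoadWordUnaligned(const uint8_t* addr) {
      uint32_t value;
      std::memcpy(&value, addr, sizeof(value));  // byte-wise, no alignment needed
      return value;
    }

    int main() {
      uint8_t buf[8] = {0x00, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00};
      assert(LoadWordUnaligned(buf + 1) == 0x12345678u);  // misaligned by one
      return 0;
    }
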
if (is_R6) { __ Sw(value_lo, TMP, 0); __ Sw(value_hi, TMP, 4); @@ -1815,50 +1878,71 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafePutLongOrdered(HInvoke* invoke) { codegen_); } -// void sun.misc.Unsafe.putLongVolatile(Object o, long offset, long x) -void IntrinsicLocationsBuilderMIPS::VisitUnsafePutLongVolatile(HInvoke* invoke) { - CreateIntIntIntIntToVoidLocations(arena_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitUnsafePutLongVolatile(HInvoke* invoke) { - GenUnsafePut(invoke->GetLocations(), - Primitive::kPrimLong, - /* is_volatile */ true, - /* is_ordered */ false, - IsR6(), - codegen_); -} - -static void CreateIntIntIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* arena, HInvoke* invoke) { + bool can_call = kEmitCompilerReadBarrier && + kUseBakerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister()); + + // Temporary register used in CAS by (Baker) read barrier. + if (can_call) { + locations->AddTemp(Location::RequiresRegister()); + } } -static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorMIPS* codegen) { +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). +static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorMIPS* codegen) { MipsAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); bool isR6 = codegen->GetInstructionSetFeatures().IsR6(); Register base = locations->InAt(1).AsRegister<Register>(); - Register offset_lo = locations->InAt(2).AsRegisterPairLow<Register>(); + Location offset_loc = locations->InAt(2); + Register offset_lo = offset_loc.AsRegisterPairLow<Register>(); Register expected = locations->InAt(3).AsRegister<Register>(); Register value = locations->InAt(4).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); DCHECK_NE(base, out); DCHECK_NE(offset_lo, out); DCHECK_NE(expected, out); if (type == Primitive::kPrimNot) { - // Mark card for object assuming new value is stored. + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + // Mark card for object assuming new value is stored. Worst case we will mark an unchanged + // object and scan the receiver at the next GC for nothing. bool value_can_be_null = true; // TODO: Worth finding out this information? codegen->MarkGCCard(base, value, value_can_be_null); + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Need to make sure the reference stored in the field is a to-space + // one before attempting the CAS or the CAS could fail incorrectly. 
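
To make the cited failure mode concrete: under a concurrent copying collector, the field can still hold a from-space pointer while the caller's expected value is the to-space copy, so a raw CAS on logically equal references would fail. A schematic sketch (hypothetical Obj/forwarding model, not ART's actual lock-word encoding):

    #include <atomic>
    #include <cassert>

    struct Obj { Obj* forward = nullptr; };  // set once GC copies the object

    // Heal the field to the to-space reference, then attempt the real CAS.
    bool CasWithBarrier(std::atomic<Obj*>& field, Obj* expected, Obj* value) {
      Obj* cur = field.load();
      if (cur != nullptr && cur->forward != nullptr) {
        field.compare_exchange_strong(cur, cur->forward);  // from- to to-space
      }
      return field.compare_exchange_strong(expected, value);
    }

    int main() {
      Obj from_space, to_space, replacement;
      from_space.forward = &to_space;        // GC moved the object
      std::atomic<Obj*> field(&from_space);  // field not yet updated
      // A raw CAS with expected == &to_space would fail here; the healing
      // step rewrites the field first so the CAS succeeds.
      assert(CasWithBarrier(field, &to_space, &replacement));
      return 0;
    }
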
+ codegen->GenerateReferenceLoadWithBakerReadBarrier( + invoke, + out_loc, // Unused, used only as a "temporary" within the read barrier. + base, + /* offset */ 0u, + /* index */ offset_loc, + ScaleFactor::TIMES_1, + temp, + /* needs_null_check */ false, + /* always_update_field */ true); + } } MipsLabel loop_head, exit_loop; @@ -1926,20 +2010,30 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // boolean sun.misc.Unsafe.compareAndSwapInt(Object o, long offset, int expected, int x) void IntrinsicLocationsBuilderMIPS::VisitUnsafeCASInt(HInvoke* invoke) { - CreateIntIntIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASInt(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_); + GenCas(invoke, Primitive::kPrimInt, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapObject(Object o, long offset, Object expected, Object x) void IntrinsicLocationsBuilderMIPS::VisitUnsafeCASObject(HInvoke* invoke) { - CreateIntIntIntIntIntToIntLocations(arena_, invoke); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { + return; + } + + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASObject(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + GenCas(invoke, Primitive::kPrimNot, codegen_); } // int java.lang.String.compareTo(String anotherString) @@ -2664,6 +2758,8 @@ UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil) UNIMPLEMENTED_INTRINSIC(MIPS, MathFloor) UNIMPLEMENTED_INTRINSIC(MIPS, MathRint) UNIMPLEMENTED_INTRINSIC(MIPS, MathRoundDouble) +UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetLongVolatile); +UNIMPLEMENTED_INTRINSIC(MIPS, UnsafePutLongVolatile); UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeCASLong) UNIMPLEMENTED_INTRINSIC(MIPS, ReferenceGetReferent) diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 1ee89cf127..6098767aae 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -1151,16 +1151,31 @@ void IntrinsicCodeGeneratorMIPS64::VisitThreadCurrentThread(HInvoke* invoke) { Thread::PeerOffset<kMips64PointerSize>().Int32Value()); } -static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + Primitive::Type type) { + bool can_call = kEmitCompilerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || + invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + locations->SetOut(Location::RequiresRegister(), + (can_call ? 
Location::kOutputOverlap : Location::kNoOutputOverlap)); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in InstructionCodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); + } } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafeGet(HInvoke* invoke, Primitive::Type type, bool is_volatile, @@ -1168,30 +1183,71 @@ static void GenUnsafeGet(HInvoke* invoke, LocationSummary* locations = invoke->GetLocations(); DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || - (type == Primitive::kPrimNot)); + (type == Primitive::kPrimNot)) << type; Mips64Assembler* assembler = codegen->GetAssembler(); + // Target register. + Location trg_loc = locations->Out(); + GpuRegister trg = trg_loc.AsRegister<GpuRegister>(); // Object pointer. - GpuRegister base = locations->InAt(1).AsRegister<GpuRegister>(); + Location base_loc = locations->InAt(1); + GpuRegister base = base_loc.AsRegister<GpuRegister>(); // Long offset. - GpuRegister offset = locations->InAt(2).AsRegister<GpuRegister>(); - GpuRegister trg = locations->Out().AsRegister<GpuRegister>(); + Location offset_loc = locations->InAt(2); + GpuRegister offset = offset_loc.AsRegister<GpuRegister>(); - __ Daddu(TMP, base, offset); - if (is_volatile) { - __ Sync(0); + if (!(kEmitCompilerReadBarrier && kUseBakerReadBarrier && (type == Primitive::kPrimNot))) { + __ Daddu(TMP, base, offset); } + switch (type) { + case Primitive::kPrimLong: + __ Ld(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + break; + case Primitive::kPrimInt: __ Lw(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } break; case Primitive::kPrimNot: - __ Lwu(trg, TMP, 0); - __ MaybeUnpoisonHeapReference(trg); - break; - - case Primitive::kPrimLong: - __ Ld(trg, TMP, 0); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, + trg_loc, + base, + /* offset */ 0U, + /* index */ offset_loc, + TIMES_1, + temp, + /* needs_null_check */ false); + if (is_volatile) { + __ Sync(0); + } + } else { + __ Lwu(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + codegen->GenerateReadBarrierSlow(invoke, + trg_loc, + trg_loc, + base_loc, + /* offset */ 0U, + /* index */ offset_loc); + } + } else { + __ Lwu(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + __ MaybeUnpoisonHeapReference(trg); + } break; default: @@ -1202,7 +1258,7 @@ static void GenUnsafeGet(HInvoke* invoke, // int sun.misc.Unsafe.getInt(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGet(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGet(HInvoke* invoke) { @@ -1211,7 +1267,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGet(HInvoke* invoke) { // int sun.misc.Unsafe.getIntVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { @@ -1220,7 +1276,7 @@ void 
IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { // long sun.misc.Unsafe.getLong(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { @@ -1229,7 +1285,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { // long sun.misc.Unsafe.getLongVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { @@ -1238,7 +1294,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { // Object sun.misc.Unsafe.getObject(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { @@ -1247,7 +1303,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { // Object sun.misc.Unsafe.getObjectVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { @@ -1264,6 +1320,8 @@ static void CreateIntIntIntIntToVoid(ArenaAllocator* arena, HInvoke* invoke) { locations->SetInAt(3, Location::RequiresRegister()); } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile, @@ -1429,35 +1487,70 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafePutLongVolatile(HInvoke* invoke) { codegen_); } -static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* arena, HInvoke* invoke) { + bool can_call = kEmitCompilerReadBarrier && + kUseBakerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister()); + + // Temporary register used in CAS by (Baker) read barrier. + if (can_call) { + locations->AddTemp(Location::RequiresRegister()); + } } -static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorMIPS64* codegen) { +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). 
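
GenCas, rewritten below to take the HInvoke, emits an LL/SC retry loop around the comparison. A scalar model of that loop using std::atomic, where compare_exchange_weak's allowed spurious failure plays the role of a failing SC (hypothetical, for intuition only):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    // Shape of the emitted loop:
    //   loop_head: ll   out, [addr]            // load-linked
    //              bne  out, expected, exit    // genuine mismatch
    //              sc   value, [addr]          // store-conditional
    //              beqz ..., loop_head         // SC failed: retry
    bool CasInt(std::atomic<int32_t>* addr, int32_t expected, int32_t value) {
      int32_t observed = expected;
      while (!addr->compare_exchange_weak(observed, value)) {
        if (observed != expected) return false;  // bne taken
        observed = expected;                     // spurious SC failure: retry
      }
      return true;
    }

    int main() {
      std::atomic<int32_t> x(41);
      assert(CasInt(&x, 41, 42) && x.load() == 42);
      assert(!CasInt(&x, 41, 43) && x.load() == 42);
      return 0;
    }
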
+static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorMIPS64* codegen) { Mips64Assembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); GpuRegister base = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister offset = locations->InAt(2).AsRegister<GpuRegister>(); + Location offset_loc = locations->InAt(2); + GpuRegister offset = offset_loc.AsRegister<GpuRegister>(); GpuRegister expected = locations->InAt(3).AsRegister<GpuRegister>(); GpuRegister value = locations->InAt(4).AsRegister<GpuRegister>(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + Location out_loc = locations->Out(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); DCHECK_NE(base, out); DCHECK_NE(offset, out); DCHECK_NE(expected, out); if (type == Primitive::kPrimNot) { - // Mark card for object assuming new value is stored. + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + // Mark card for object assuming new value is stored. Worst case we will mark an unchanged + // object and scan the receiver at the next GC for nothing. bool value_can_be_null = true; // TODO: Worth finding out this information? codegen->MarkGCCard(base, value, value_can_be_null); + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Need to make sure the reference stored in the field is a to-space + // one before attempting the CAS or the CAS could fail incorrectly. + codegen->GenerateReferenceLoadWithBakerReadBarrier( + invoke, + out_loc, // Unused, used only as a "temporary" within the read barrier. + base, + /* offset */ 0u, + /* index */ offset_loc, + ScaleFactor::TIMES_1, + temp, + /* needs_null_check */ false, + /* always_update_field */ true); + } } Mips64Label loop_head, exit_loop; @@ -1521,29 +1614,39 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // boolean sun.misc.Unsafe.compareAndSwapInt(Object o, long offset, int expected, int x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASInt(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASInt(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_); + GenCas(invoke, Primitive::kPrimInt, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapLong(Object o, long offset, long expected, long x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASLong(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASLong(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimLong, codegen_); + GenCas(invoke, Primitive::kPrimLong, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapObject(Object o, long offset, Object expected, Object x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. 
+ if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { + return; + } + + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + GenCas(invoke, Primitive::kPrimNot, codegen_); } // int java.lang.String.compareTo(String anotherString) diff --git a/compiler/optimizing/licm_test.cc b/compiler/optimizing/licm_test.cc index 5bcfa4c98b..8d15f78cce 100644 --- a/compiler/optimizing/licm_test.cc +++ b/compiler/optimizing/licm_test.cc @@ -28,7 +28,18 @@ namespace art { */ class LICMTest : public CommonCompilerTest { public: - LICMTest() : pool_(), allocator_(&pool_) { + LICMTest() + : pool_(), + allocator_(&pool_), + entry_(nullptr), + loop_preheader_(nullptr), + loop_header_(nullptr), + loop_body_(nullptr), + return_(nullptr), + exit_(nullptr), + parameter_(nullptr), + int_constant_(nullptr), + float_constant_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 8df513f410..42ed04dfa3 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -16,11 +16,21 @@ #include "loop_optimization.h" +#include "arch/instruction_set.h" +#include "arch/arm/instruction_set_features_arm.h" +#include "arch/arm64/instruction_set_features_arm64.h" +#include "arch/mips/instruction_set_features_mips.h" +#include "arch/mips64/instruction_set_features_mips64.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "arch/x86_64/instruction_set_features_x86_64.h" #include "driver/compiler_driver.h" #include "linear_order.h" namespace art { +// Enables vectorization (SIMDization) in the loop optimizer. +static constexpr bool kEnableVectorization = true; + // Remove the instruction from the graph. A bit more elaborate than the usual // instruction removal, since there may be a cycle in the use structure. static void RemoveFromCycle(HInstruction* instruction) { @@ -53,6 +63,19 @@ static bool IsEarlyExit(HLoopInformation* loop_info) { return false; } +// Test vector restrictions. +static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { + return (restrictions & tested) != 0; +} + +// Inserts an instruction. +static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) { + DCHECK(block != nullptr); + DCHECK(instruction != nullptr); + block->InsertInstructionBefore(instruction, block->GetLastInstruction()); + return instruction; +} + // // Class methods. // @@ -64,11 +87,15 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, compiler_driver_(compiler_driver), induction_range_(induction_analysis), loop_allocator_(nullptr), + global_allocator_(graph_->GetArena()), top_loop_(nullptr), last_loop_(nullptr), iset_(nullptr), induction_simplication_count_(0), - simplified_(false) { + simplified_(false), + vector_length_(0), + vector_refs_(nullptr), + vector_map_(nullptr) { } void HLoopOptimization::Run() { @@ -81,15 +108,13 @@ void HLoopOptimization::Run() { // Phase-local allocator that draws from the global pool. Since the allocator // itself resides on the stack, it is destructed on exiting Run(), which // implies its underlying memory is released immediately. 
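
The comment above describes the allocator split that the hunk below tightens: scratch structures go in a stack-scoped, phase-local arena, while anything that outlives the pass (new HIR) must come from the global arena. A minimal sketch of the pattern with hypothetical types:

    #include <cstddef>
    #include <vector>

    struct ArenaPool { /* owns reusable memory chunks shared by all passes */ };

    class Arena {
     public:
      explicit Arena(ArenaPool* pool) : pool_(pool) {}
      ~Arena() {}  // would return every chunk to pool_ in one shot
      void* Alloc(size_t n) {
        chunks_.emplace_back(n);
        return chunks_.back().data();
      }
     private:
      ArenaPool* pool_;
      std::vector<std::vector<char>> chunks_;  // stand-in for bump allocation
    };

    void RunPass(ArenaPool* shared_pool, Arena* global_arena) {
      Arena phase_local(shared_pool);         // destroyed when the pass returns
      void* scratch = phase_local.Alloc(64);  // temporary sets/maps live here
      void* hir = global_arena->Alloc(32);    // new IR must outlive the pass
      (void)scratch; (void)hir;
    }
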
-  ArenaAllocator allocator(graph_->GetArena()->GetArenaPool());
+  ArenaAllocator allocator(global_allocator_->GetArenaPool());
   loop_allocator_ = &allocator;
   // Perform loop optimizations.
   LocalRun();
   if (top_loop_ == nullptr) {
-    // All loops have been eliminated.
-    graph_->SetHasLoops(false);
+    graph_->SetHasLoops(false);  // no more loops
   }
   // Detach.
@@ -111,18 +136,29 @@ void HLoopOptimization::LocalRun() {
   }
   // Traverse the loop hierarchy inner-to-outer and optimize. Traversal can use
-  // a temporary set that stores instructions using the phase-local allocator.
+  // temporary data structures using the phase-local allocator. All new HIR
+  // should use the global allocator.
   if (top_loop_ != nullptr) {
     ArenaSet<HInstruction*> iset(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    ArenaSafeMap<HInstruction*, HInstruction*> map(
+        std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    // Attach.
     iset_ = &iset;
+    vector_refs_ = &refs;
+    vector_map_ = &map;
+    // Traverse.
     TraverseLoopsInnerToOuter(top_loop_);
-    iset_ = nullptr;  // detach
+    // Detach.
+    iset_ = nullptr;
+    vector_refs_ = nullptr;
+    vector_map_ = nullptr;
   }
 }

 void HLoopOptimization::AddLoop(HLoopInformation* loop_info) {
   DCHECK(loop_info != nullptr);
-  LoopNode* node = new (loop_allocator_) LoopNode(loop_info);  // phase-local allocator
+  LoopNode* node = new (loop_allocator_) LoopNode(loop_info);
   if (last_loop_ == nullptr) {
     // First loop.
     DCHECK(top_loop_ == nullptr);
@@ -170,7 +206,7 @@ void HLoopOptimization::RemoveLoop(LoopNode* node) {
 void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
   for ( ; node != nullptr; node = node->next) {
     // Visit inner loops first.
-    int current_induction_simplification_count = induction_simplication_count_;
+    uint32_t current_induction_simplification_count = induction_simplication_count_;
     if (node->inner != nullptr) {
       TraverseLoopsInnerToOuter(node->inner);
     }
@@ -179,7 +215,7 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
     if (current_induction_simplification_count != induction_simplication_count_) {
       induction_range_.ReVisit(node->loop_info);
     }
-    // Repeat simplifications in the body of this loop until no more changes occur.
+    // Repeat simplifications in the loop-body until no more changes occur.
     // Note that since each simplification consists of eliminating code (without
     // introducing new code), this process is always finite.
     do {
@@ -187,13 +223,17 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
       SimplifyInduction(node);
       SimplifyBlocks(node);
     } while (simplified_);
-    // Simplify inner loop.
+    // Optimize inner loop.
     if (node->inner == nullptr) {
-      SimplifyInnerLoop(node);
+      OptimizeInnerLoop(node);
     }
   }
 }

+//
+// Optimization.
+//
+
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -204,13 +244,9 @@ void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   // for (int i = 0; i < 10; i++, k++) { .... no k .... } return k;
   for (HInstructionIterator it(header->GetPhis()); !it.Done(); it.Advance()) {
     HPhi* phi = it.Current()->AsPhi();
-    iset_->clear();
-    int32_t use_count = 0;
-    if (IsPhiInduction(phi) &&
-        IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ false, &use_count) &&
-        // No uses, or no early-exit with proper replacement.
- (use_count == 0 || - (!IsEarlyExit(node->loop_info) && TryReplaceWithLastValue(phi, preheader)))) { + iset_->clear(); // prepare phi induction + if (TrySetPhiInduction(phi, /*restrict_uses*/ true) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ false)) { for (HInstruction* i : *iset_) { RemoveFromCycle(i); } @@ -256,49 +292,47 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) { } } -bool HLoopOptimization::SimplifyInnerLoop(LoopNode* node) { +void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); // Ensure loop header logic is finite. - int64_t tc = 0; - if (!induction_range_.IsFinite(node->loop_info, &tc)) { - return false; + int64_t trip_count = 0; + if (!induction_range_.IsFinite(node->loop_info, &trip_count)) { + return; } + // Ensure there is only a single loop-body (besides the header). HBasicBlock* body = nullptr; for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) { if (it.Current() != header) { if (body != nullptr) { - return false; + return; } body = it.Current(); } } // Ensure there is only a single exit point. if (header->GetSuccessors().size() != 2) { - return false; + return; } HBasicBlock* exit = (header->GetSuccessors()[0] == body) ? header->GetSuccessors()[1] : header->GetSuccessors()[0]; // Ensure exit can only be reached by exiting loop. if (exit->GetPredecessors().size() != 1) { - return false; + return; } // Detect either an empty loop (no side effects other than plain iteration) or // a trivial loop (just iterating once). Replace subsequent index uses, if any, // with the last value and remove the loop, possibly after unrolling its body. HInstruction* phi = header->GetFirstPhi(); - iset_->clear(); - int32_t use_count = 0; - if (IsEmptyHeader(header)) { + iset_->clear(); // prepare phi induction + if (TrySetSimpleLoopHeader(header)) { bool is_empty = IsEmptyBody(body); - if ((is_empty || tc == 1) && - IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ true, &use_count) && - // No uses, or proper replacement. - (use_count == 0 || TryReplaceWithLastValue(phi, preheader))) { + if ((is_empty || trip_count == 1) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { if (!is_empty) { - // Unroll the loop body, which sees initial value of the index. + // Unroll the loop-body, which sees initial value of the index. phi->ReplaceWith(phi->InputAt(0)); preheader->MergeInstructionsWith(body); } @@ -308,28 +342,649 @@ bool HLoopOptimization::SimplifyInnerLoop(LoopNode* node) { header->RemoveDominatedBlock(exit); header->DisconnectAndDelete(); preheader->AddSuccessor(exit); - preheader->AddInstruction(new (graph_->GetArena()) HGoto()); // global allocator + preheader->AddInstruction(new (global_allocator_) HGoto()); preheader->AddDominatedBlock(exit); exit->SetDominator(preheader); RemoveLoop(node); // update hierarchy + return; + } + } + + // Vectorize loop, if possible and valid. + if (kEnableVectorization) { + iset_->clear(); // prepare phi induction + if (TrySetSimpleLoopHeader(header) && + CanVectorize(node, body, trip_count) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { + Vectorize(node, body, exit, trip_count); + graph_->SetHasSIMD(true); // flag SIMD usage + return; + } + } +} + +// +// Loop vectorization. The implementation is based on the book by Aart J.C. Bik: +// "The Software Vectorization Handbook. 
Applying Multimedia Extensions for Maximum Performance."
+// Intel Press, June, 2004 (http://www.aartbik.com/).
+//
+
+bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+  // Reset vector bookkeeping.
+  vector_length_ = 0;
+  vector_refs_->clear();
+  vector_runtime_test_a_ =
+  vector_runtime_test_b_ = nullptr;
+
+  // Phis in the loop-body prevent vectorization.
+  if (!block->GetPhis().IsEmpty()) {
+    return false;
+  }
+
+  // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side
+  // occurrence, which allows passing attributes down the use tree.
+  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) {
+      return false;  // failure to vectorize a left-hand-side
+    }
+  }
+
+  // Heuristics. Does vectorization seem profitable?
+  // TODO: refine
+  if (vector_length_ == 0) {
+    return false;  // nothing found
+  } else if (0 < trip_count && trip_count < vector_length_) {
+    return false;  // insufficient iterations
+  }
+
+  // Data dependence analysis. Find each pair of references with same type, where
+  // at least one is a write. Each such pair denotes a possible data dependence.
+  // This analysis exploits the property that differently typed arrays cannot be
+  // aliased, as well as the property that references either point to the same
+  // array or to two completely disjoint arrays, i.e., no partial aliasing.
+  // Other than a few simple heuristics, no detailed subscript analysis is done.
+  for (auto i = vector_refs_->begin(); i != vector_refs_->end(); ++i) {
+    for (auto j = i; ++j != vector_refs_->end(); ) {
+      if (i->type == j->type && (i->lhs || j->lhs)) {
+        // Found same-typed a[i+x] vs. b[i+y], where at least one is a write.
+        HInstruction* a = i->base;
+        HInstruction* b = j->base;
+        HInstruction* x = i->offset;
+        HInstruction* y = j->offset;
+        if (a == b) {
+          // Found a[i+x] vs. a[i+y]. Accept if x == y (loop-independent data dependence).
+          // Conservatively assume a loop-carried data dependence otherwise, and reject.
+          if (x != y) {
+            return false;
+          }
+        } else {
+          // Found a[i+x] vs. b[i+y]. Accept if x == y (at worst loop-independent data dependence).
+          // Conservatively assume a potential loop-carried data dependence otherwise, avoided by
+          // generating an explicit a != b disambiguation runtime test on the two references.
+          if (x != y) {
+            // For now, we reject after one test to avoid excessive overhead.
+            if (vector_runtime_test_a_ != nullptr) {
+              return false;
+            }
+            vector_runtime_test_a_ = a;
+            vector_runtime_test_b_ = b;
+          }
+        }
+      }
+    }
+  }
+
+  // Success!
+  return true;
+}
+
+void HLoopOptimization::Vectorize(LoopNode* node,
+                                  HBasicBlock* block,
+                                  HBasicBlock* exit,
+                                  int64_t trip_count) {
+  Primitive::Type induc_type = Primitive::kPrimInt;
+  HBasicBlock* header = node->loop_info->GetHeader();
+  HBasicBlock* preheader = node->loop_info->GetPreHeader();
+
+  // A cleanup is needed for any unknown trip count or for a known trip count
+  // with remainder iterations after vectorization.
+  bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+
+  // Adjust vector bookkeeping.
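+  // Worked example of the cleanup decision (a sketch, not code in the patch):
+  //   trip_count = 10, vector_length_ = 4:
+  //     10 % 4 = 2 != 0, so needs_cleanup is true;
+  //     the vector loop below covers iterations [0,8), the cleanup loop [8,10).
+  //   trip_count = 0 (statically unknown): a cleanup loop is always generated.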
+  iset_->clear();  // prepare phi induction
+  bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
+  DCHECK(is_simple_loop_header);
+
+  // Generate preheader:
+  // stc = <trip-count>;
+  // vtc = stc - stc % VL;
+  HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
+  HInstruction* vtc = stc;
+  if (needs_cleanup) {
+    DCHECK(IsPowerOfTwo(vector_length_));
+    HInstruction* rem = Insert(
+        preheader, new (global_allocator_) HAnd(induc_type,
+                                                stc,
+                                                graph_->GetIntConstant(vector_length_ - 1)));
+    vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
+  }
+
+  // Generate runtime disambiguation test:
+  // vtc = a != b ? vtc : 0;
+  if (vector_runtime_test_a_ != nullptr) {
+    HInstruction* rt = Insert(
+        preheader,
+        new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
+    vtc = Insert(preheader,
+                 new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+    needs_cleanup = true;
+  }
+
+  // Generate vector loop:
+  // for (i = 0; i < vtc; i += VL)
+  //    <vectorized-loop-body>
+  vector_mode_ = kVector;
+  GenerateNewLoop(node,
+                  block,
+                  graph_->TransformLoopForVectorization(header, block, exit),
+                  graph_->GetIntConstant(0),
+                  vtc,
+                  graph_->GetIntConstant(vector_length_));
+  HLoopInformation* vloop = vector_header_->GetLoopInformation();
+
+  // Generate cleanup loop, if needed:
+  // for ( ; i < stc; i += 1)
+  //    <loop-body>
+  if (needs_cleanup) {
+    vector_mode_ = kSequential;
+    GenerateNewLoop(node,
+                    block,
+                    graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                    vector_phi_,
+                    stc,
+                    graph_->GetIntConstant(1));
+  }
+
+  // Remove the original loop by disconnecting the body block
+  // and removing all instructions from the header.
+  block->DisconnectAndDelete();
+  while (!header->GetFirstInstruction()->IsGoto()) {
+    header->RemoveInstruction(header->GetFirstInstruction());
+  }
+  // Update loop hierarchy: the old header now resides in the
+  // same outer loop as the old preheader.
+  header->SetLoopInformation(preheader->GetLoopInformation());  // outward
+  node->loop_info = vloop;
+}
+
+void HLoopOptimization::GenerateNewLoop(LoopNode* node,
+                                        HBasicBlock* block,
+                                        HBasicBlock* new_preheader,
+                                        HInstruction* lo,
+                                        HInstruction* hi,
+                                        HInstruction* step) {
+  Primitive::Type induc_type = Primitive::kPrimInt;
+  // Prepare new loop.
+  vector_map_->clear();
+  vector_preheader_ = new_preheader;
+  vector_header_ = vector_preheader_->GetSingleSuccessor();
+  vector_body_ = vector_header_->GetSuccessors()[1];
+  vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
+                                             kNoRegNumber,
+                                             0,
+                                             HPhi::ToPhiType(induc_type));
+  // Generate header.
+  // for (i = lo; i < hi; i += step)
+  //    <loop-body>
+  HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
+  vector_header_->AddPhi(vector_phi_);
+  vector_header_->AddInstruction(cond);
+  vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
+  // Suspend check and environment.
+  HInstruction* suspend = vector_header_->GetFirstInstruction();
+  suspend->CopyEnvironmentFromWithLoopPhiAdjustment(
+      node->loop_info->GetSuspendCheck()->GetEnvironment(), vector_header_);
+  // Generate body.
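+  // The header synthesized above gives the new loop this shape (sketch):
+  //
+  //     i = lo
+  //   header:
+  //     suspend_check
+  //     if (i >= hi) goto exit    // HAboveOrEqual: unsigned compare, safe since
+  //   body:                       // i counts up from a non-negative lo
+  //     <instructions filled in below>
+  //     i += step; goto header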
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); // lays out in original order + if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment( + suspend->GetEnvironment(), vector_header_); + } + } + } + // Finalize increment and phi. + HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step); + vector_phi_->AddInput(lo); + vector_phi_->AddInput(Insert(vector_body_, inc)); +} + +// TODO: accept reductions at left-hand-side, mixed-type store idioms, etc. +bool HLoopOptimization::VectorizeDef(LoopNode* node, + HInstruction* instruction, + bool generate_code) { + // Accept a left-hand-side array base[index] for + // (1) supported vector type, + // (2) loop-invariant base, + // (3) unit stride index, + // (4) vectorizable right-hand-side value. + uint64_t restrictions = kNone; + if (instruction->IsArraySet()) { + Primitive::Type type = instruction->AsArraySet()->GetComponentType(); + HInstruction* base = instruction->InputAt(0); + HInstruction* index = instruction->InputAt(1); + HInstruction* value = instruction->InputAt(2); + HInstruction* offset = nullptr; + if (TrySetVectorType(type, &restrictions) && + node->loop_info->IsDefinedOutOfTheLoop(base) && + induction_range_.IsUnitStride(index, &offset) && + VectorizeUse(node, value, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecSub(index, offset); + GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type); + } else { + vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true)); + } return true; } + return false; + } + // Branch back okay. + if (instruction->IsGoto()) { + return true; + } + // Otherwise accept only expressions with no effects outside the immediate loop-body. + // Note that actual uses are inspected during right-hand-side tree traversal. + return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite(); +} + +// TODO: more operations and intrinsics, detect saturation arithmetic, etc. +bool HLoopOptimization::VectorizeUse(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions) { + // Accept anything for which code has already been generated. + if (generate_code) { + if (vector_map_->find(instruction) != vector_map_->end()) { + return true; + } + } + // Continue the right-hand-side tree traversal, passing in proper + // types and vector restrictions along the way. During code generation, + // all new nodes are drawn from the global allocator. + if (node->loop_info->IsDefinedOutOfTheLoop(instruction)) { + // Accept invariant use, using scalar expansion. + if (generate_code) { + GenerateVecInv(instruction, type); + } + return true; + } else if (instruction->IsArrayGet()) { + // Accept a right-hand-side array base[index] for + // (1) exact matching vector type, + // (2) loop-invariant base, + // (3) unit stride index, + // (4) vectorizable right-hand-side value. 
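+  // For example (sketch), in
+  //   for (int i = 0; i < n; i++) { a[i] = b[i] + 1; }
+  // VectorizeDef accepts the store a[i] = ..., and VectorizeUse then walks the
+  // right-hand side, accepting the load b[i] here and the loop-invariant
+  // constant 1 through scalar expansion (GenerateVecInv).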
+ HInstruction* base = instruction->InputAt(0); + HInstruction* index = instruction->InputAt(1); + HInstruction* offset = nullptr; + if (type == instruction->GetType() && + node->loop_info->IsDefinedOutOfTheLoop(base) && + induction_range_.IsUnitStride(index, &offset)) { + if (generate_code) { + GenerateVecSub(index, offset); + GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type); + } else { + vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false)); + } + return true; + } + } else if (instruction->IsTypeConversion()) { + // Accept particular type conversions. + HTypeConversion* conversion = instruction->AsTypeConversion(); + HInstruction* opa = conversion->InputAt(0); + Primitive::Type from = conversion->GetInputType(); + Primitive::Type to = conversion->GetResultType(); + if ((to == Primitive::kPrimByte || + to == Primitive::kPrimChar || + to == Primitive::kPrimShort) && from == Primitive::kPrimInt) { + // Accept a "narrowing" type conversion from a "wider" computation for + // (1) conversion into final required type, + // (2) vectorizable operand, + // (3) "wider" operations cannot bring in higher order bits. + if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) { + if (generate_code) { + if (vector_mode_ == kVector) { + vector_map_->Put(instruction, vector_map_->Get(opa)); // operand pass-through + } else { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + } + return true; + } + } else if (to == Primitive::kPrimFloat && from == Primitive::kPrimInt) { + DCHECK_EQ(to, type); + // Accept int to float conversion for + // (1) supported int, + // (2) vectorizable operand. + if (TrySetVectorType(from, &restrictions) && + VectorizeUse(node, opa, generate_code, from, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + return true; + } + } + return false; + } else if (instruction->IsNeg() || instruction->IsNot() || instruction->IsBooleanNot()) { + // Accept unary operator for vectorizable operand. + HInstruction* opa = instruction->InputAt(0); + if (VectorizeUse(node, opa, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + return true; + } + } else if (instruction->IsAdd() || instruction->IsSub() || + instruction->IsMul() || instruction->IsDiv() || + instruction->IsAnd() || instruction->IsOr() || instruction->IsXor()) { + // Deal with vector restrictions. + if ((instruction->IsMul() && HasVectorRestrictions(restrictions, kNoMul)) || + (instruction->IsDiv() && HasVectorRestrictions(restrictions, kNoDiv))) { + return false; + } + // Accept binary operator for vectorizable operands. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + if (VectorizeUse(node, opa, generate_code, type, restrictions) && + VectorizeUse(node, opb, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), vector_map_->Get(opb), type); + } + return true; + } + } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) { + // Deal with vector restrictions. 
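+  // For instance (sketch): for byte data reached through a narrowing conversion,
+  // kNoHiBits is set, and an arithmetic shift such as
+  //   (byte) ((b[i] + c[i]) >> 1)
+  // is rejected below: a byte lane no longer holds the high-order bits of the
+  // wider int sum that the shift would bring down.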
+ if ((HasVectorRestrictions(restrictions, kNoShift)) || + (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) { + return false; // unsupported instruction + } else if ((instruction->IsShr() || instruction->IsUShr()) && + HasVectorRestrictions(restrictions, kNoHiBits)) { + return false; // hibits may impact lobits; TODO: we can do better! + } + // Accept shift operator for vectorizable/invariant operands. + // TODO: accept symbolic, albeit loop invariant shift factors. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + if (VectorizeUse(node, opa, generate_code, type, restrictions) && opb->IsIntConstant()) { + if (generate_code) { + // Make sure shift factor only looks at lower bits, as defined for sequential shifts. + // Note that even the narrower SIMD shifts do the right thing after that. + int32_t mask = (instruction->GetType() == Primitive::kPrimLong) + ? kMaxLongShiftDistance + : kMaxIntShiftDistance; + HInstruction* s = graph_->GetIntConstant(opb->AsIntConstant()->GetValue() & mask); + GenerateVecOp(instruction, vector_map_->Get(opa), s, type); + } + return true; + } + } else if (instruction->IsInvokeStaticOrDirect()) { + // TODO: coming soon. + return false; } return false; } -bool HLoopOptimization::IsPhiInduction(HPhi* phi) { +bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restrictions) { + const InstructionSetFeatures* features = compiler_driver_->GetInstructionSetFeatures(); + switch (compiler_driver_->GetInstructionSet()) { + case kArm: + case kThumb2: + return false; + case kArm64: + // Allow vectorization for all ARM devices, because Android assumes that + // ARMv8 AArch64 always supports advanced SIMD. For now, only D registers + // (64-bit vectors) not Q registers (128-bit vectors). + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + *restrictions |= kNoDiv; + return TrySetVectorLength(8); + case Primitive::kPrimChar: + case Primitive::kPrimShort: + *restrictions |= kNoDiv; + return TrySetVectorLength(4); + case Primitive::kPrimInt: + *restrictions |= kNoDiv; + return TrySetVectorLength(2); + case Primitive::kPrimFloat: + return TrySetVectorLength(2); + default: + return false; + } + case kX86: + case kX86_64: + // Allow vectorization for SSE4-enabled X86 devices only (128-bit vectors). + if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) { + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + *restrictions |= kNoMul | kNoDiv | kNoShift; + return TrySetVectorLength(16); + case Primitive::kPrimChar: + case Primitive::kPrimShort: + *restrictions |= kNoDiv; + return TrySetVectorLength(8); + case Primitive::kPrimInt: + *restrictions |= kNoDiv; + return TrySetVectorLength(4); + case Primitive::kPrimLong: + *restrictions |= kNoMul | kNoDiv | kNoShr; + return TrySetVectorLength(2); + case Primitive::kPrimFloat: + return TrySetVectorLength(4); + case Primitive::kPrimDouble: + return TrySetVectorLength(2); + default: + break; + } // switch type + } + return false; + case kMips: + case kMips64: + // TODO: implement MIPS SIMD. + return false; + default: + return false; + } // switch instruction set +} + +bool HLoopOptimization::TrySetVectorLength(uint32_t length) { + DCHECK(IsPowerOfTwo(length) && length >= 2u); + // First time set? 
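+  // The lane counts chosen above follow bytes-per-vector-register divided by
+  // component size (sketch):
+  //   ARM64 D register,  8 bytes:  8 byte lanes, 4 char/short lanes, 2 int/float lanes
+  //   x86 SSE4.1 XMM,   16 bytes: 16 byte lanes, 8 char/short lanes,
+  //                               4 int/float lanes, 2 long/double lanes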
+ if (vector_length_ == 0) { + vector_length_ = length; + } + // Different types are acceptable within a loop-body, as long as all the corresponding vector + // lengths match exactly to obtain a uniform traversal through the vector iteration space + // (idiomatic exceptions to this rule can be handled by further unrolling sub-expressions). + return vector_length_ == length; +} + +void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) { + if (vector_map_->find(org) == vector_map_->end()) { + // In scalar code, just use a self pass-through for scalar invariants + // (viz. expression remains itself). + if (vector_mode_ == kSequential) { + vector_map_->Put(org, org); + return; + } + // In vector code, explicit scalar expansion is needed. + HInstruction* vector = new (global_allocator_) HVecReplicateScalar( + global_allocator_, org, type, vector_length_); + vector_map_->Put(org, Insert(vector_preheader_, vector)); + } +} + +void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) { + if (vector_map_->find(org) == vector_map_->end()) { + HInstruction* subscript = vector_phi_; + if (offset != nullptr) { + subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset); + if (org->IsPhi()) { + Insert(vector_body_, subscript); // lacks layout placeholder + } + } + vector_map_->Put(org, subscript); + } +} + +void HLoopOptimization::GenerateVecMem(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type) { + HInstruction* vector = nullptr; + if (vector_mode_ == kVector) { + // Vector store or load. + if (opb != nullptr) { + vector = new (global_allocator_) HVecStore( + global_allocator_, org->InputAt(0), opa, opb, type, vector_length_); + } else { + vector = new (global_allocator_) HVecLoad( + global_allocator_, org->InputAt(0), opa, type, vector_length_); + } + } else { + // Scalar store or load. + DCHECK(vector_mode_ == kSequential); + if (opb != nullptr) { + vector = new (global_allocator_) HArraySet(org->InputAt(0), opa, opb, type, kNoDexPc); + } else { + vector = new (global_allocator_) HArrayGet(org->InputAt(0), opa, type, kNoDexPc); + } + } + vector_map_->Put(org, vector); +} + +#define GENERATE_VEC(x, y) \ + if (vector_mode_ == kVector) { \ + vector = (x); \ + } else { \ + DCHECK(vector_mode_ == kSequential); \ + vector = (y); \ + } \ + break; + +void HLoopOptimization::GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type) { + if (vector_mode_ == kSequential) { + // Scalar code follows implicit integral promotion. 
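+    // E.g. (sketch): in the sequential cleanup loop, a byte-typed addition is
+    // recreated as an int HAdd, matching the Java rule that byte + byte is
+    // computed in int; only the store narrows the result back.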
+ if (type == Primitive::kPrimBoolean || + type == Primitive::kPrimByte || + type == Primitive::kPrimChar || + type == Primitive::kPrimShort) { + type = Primitive::kPrimInt; + } + } + HInstruction* vector = nullptr; + switch (org->GetKind()) { + case HInstruction::kNeg: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HNeg(type, opa)); + case HInstruction::kNot: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HNot(type, opa)); + case HInstruction::kBooleanNot: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HBooleanNot(opa)); + case HInstruction::kTypeConversion: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HTypeConversion(type, opa, kNoDexPc)); + case HInstruction::kAdd: + GENERATE_VEC( + new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HAdd(type, opa, opb)); + case HInstruction::kSub: + GENERATE_VEC( + new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HSub(type, opa, opb)); + case HInstruction::kMul: + GENERATE_VEC( + new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HMul(type, opa, opb)); + case HInstruction::kDiv: + GENERATE_VEC( + new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HDiv(type, opa, opb, kNoDexPc)); + case HInstruction::kAnd: + GENERATE_VEC( + new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HAnd(type, opa, opb)); + case HInstruction::kOr: + GENERATE_VEC( + new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HOr(type, opa, opb)); + case HInstruction::kXor: + GENERATE_VEC( + new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HXor(type, opa, opb)); + case HInstruction::kShl: + GENERATE_VEC( + new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HShl(type, opa, opb)); + case HInstruction::kShr: + GENERATE_VEC( + new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HShr(type, opa, opb)); + case HInstruction::kUShr: + GENERATE_VEC( + new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HUShr(type, opa, opb)); + case HInstruction::kInvokeStaticOrDirect: { + // TODO: coming soon. + break; + } + default: + break; + } // switch + CHECK(vector != nullptr) << "Unsupported SIMD operator"; + vector_map_->Put(org, vector); +} + +#undef GENERATE_VEC + +// +// Helpers. +// + +bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) { + DCHECK(iset_->empty()); ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi); if (set != nullptr) { - DCHECK(iset_->empty()); for (HInstruction* i : *set) { // Check that, other than instructions that are no longer in the graph (removed earlier) - // each instruction is removable and, other than the phi, uses are contained in the cycle. 
+ // each instruction is removable and, when restrict uses are requested, other than for phi, + // all uses are contained within the cycle. if (!i->IsInBlock()) { continue; } else if (!i->IsRemovable()) { return false; - } else if (i != phi) { + } else if (i != phi && restrict_uses) { for (const HUseListNode<HInstruction*>& use : i->GetUses()) { if (set->find(use.GetUser()) == set->end()) { return false; @@ -348,10 +1003,12 @@ bool HLoopOptimization::IsPhiInduction(HPhi* phi) { // c: Condition(phi, bound) // i: If(c) // TODO: Find a less pattern matching approach? -bool HLoopOptimization::IsEmptyHeader(HBasicBlock* block) { +bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block) { DCHECK(iset_->empty()); HInstruction* phi = block->GetFirstPhi(); - if (phi != nullptr && phi->GetNext() == nullptr && IsPhiInduction(phi->AsPhi())) { + if (phi != nullptr && + phi->GetNext() == nullptr && + TrySetPhiInduction(phi->AsPhi(), /*restrict_uses*/ false)) { HInstruction* s = block->GetFirstInstruction(); if (s != nullptr && s->IsSuspendCheck()) { HInstruction* c = s->GetNext(); @@ -369,14 +1026,24 @@ bool HLoopOptimization::IsEmptyHeader(HBasicBlock* block) { } bool HLoopOptimization::IsEmptyBody(HBasicBlock* block) { - if (block->GetFirstPhi() == nullptr) { - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - HInstruction* instruction = it.Current(); - if (!instruction->IsGoto() && iset_->find(instruction) == iset_->end()) { - return false; - } + if (!block->GetPhis().IsEmpty()) { + return false; + } + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (!instruction->IsGoto() && iset_->find(instruction) == iset_->end()) { + return false; + } + } + return true; +} + +bool HLoopOptimization::IsUsedOutsideLoop(HLoopInformation* loop_info, + HInstruction* instruction) { + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + if (use.GetUser()->GetBlock()->GetLoopInformation() != loop_info) { + return true; } - return true; } return false; } @@ -438,6 +1105,19 @@ bool HLoopOptimization::TryReplaceWithLastValue(HInstruction* instruction, HBasi return false; } +bool HLoopOptimization::TryAssignLastValue(HLoopInformation* loop_info, + HInstruction* instruction, + HBasicBlock* block, + bool collect_loop_uses) { + // Assigning the last value is always successful if there are no uses. + // Otherwise, it succeeds in a no early-exit loop by generating the + // proper last value assignment. + int32_t use_count = 0; + return IsOnlyUsedAfterLoop(loop_info, instruction, collect_loop_uses, &use_count) && + (use_count == 0 || + (!IsEarlyExit(loop_info) && TryReplaceWithLastValue(instruction, block))); +} + void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) { for (HBackwardInstructionIterator i(list); !i.Done(); i.Advance()) { HInstruction* instruction = i.Current(); diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0b798fc7a9..16f7691af2 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -27,7 +27,8 @@ class CompilerDriver; /** * Loop optimizations. Builds a loop hierarchy and applies optimizations to - * the detected nested loops, such as removal of dead induction and empty loops. + * the detected nested loops, such as removal of dead induction and empty loops + * and inner loop vectorization. 
*/ class HLoopOptimization : public HOptimization { public: @@ -50,34 +51,105 @@ class HLoopOptimization : public HOptimization { inner(nullptr), previous(nullptr), next(nullptr) {} - HLoopInformation* const loop_info; + HLoopInformation* loop_info; LoopNode* outer; LoopNode* inner; LoopNode* previous; LoopNode* next; }; - void LocalRun(); + /* + * Vectorization restrictions (bit mask). + */ + enum VectorRestrictions { + kNone = 0, // no restrictions + kNoMul = 1, // no multiplication + kNoDiv = 2, // no division + kNoShift = 4, // no shift + kNoShr = 8, // no arithmetic shift right + kNoHiBits = 16, // "wider" operations cannot bring in higher order bits + }; + + /* + * Vectorization mode during synthesis + * (sequential peeling/cleanup loop or vector loop). + */ + enum VectorMode { + kSequential, + kVector + }; + + /* + * Representation of a unit-stride array reference. + */ + struct ArrayReference { + ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l) + : base(b), offset(o), type(t), lhs(l) { } + bool operator<(const ArrayReference& other) const { + return + (base < other.base) || + (base == other.base && + (offset < other.offset || (offset == other.offset && + (type < other.type || + (type == other.type && lhs < other.lhs))))); + } + HInstruction* base; // base address + HInstruction* offset; // offset + i + Primitive::Type type; // component type + bool lhs; // def/use + }; + // Loop setup and traversal. + void LocalRun(); void AddLoop(HLoopInformation* loop_info); void RemoveLoop(LoopNode* node); - void TraverseLoopsInnerToOuter(LoopNode* node); - // Simplification. + // Optimization. void SimplifyInduction(LoopNode* node); void SimplifyBlocks(LoopNode* node); - bool SimplifyInnerLoop(LoopNode* node); + void OptimizeInnerLoop(LoopNode* node); + + // Vectorization analysis and synthesis. + bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); + void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); + void GenerateNewLoop(LoopNode* node, + HBasicBlock* block, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step); + bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); + bool VectorizeUse(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions); + bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions); + bool TrySetVectorLength(uint32_t length); + void GenerateVecInv(HInstruction* org, Primitive::Type type); + void GenerateVecSub(HInstruction* org, HInstruction* off); + void GenerateVecMem(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type); + void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); // Helpers. 
-  bool IsPhiInduction(HPhi* phi);
-  bool IsEmptyHeader(HBasicBlock* block);
+  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
+  bool TrySetSimpleLoopHeader(HBasicBlock* block);
   bool IsEmptyBody(HBasicBlock* block);
   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                            HInstruction* instruction,
                            bool collect_loop_uses,
                            /*out*/ int32_t* use_count);
+  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
+                         HInstruction* instruction);
   bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block);
+  bool TryAssignLastValue(HLoopInformation* loop_info,
+                          HInstruction* instruction,
+                          HBasicBlock* block,
+                          bool collect_loop_uses);
   void RemoveDeadInstructions(const HInstructionList& list);

   // Compiler driver (to query ISA features).
@@ -90,6 +162,9 @@ class HLoopOptimization : public HOptimization {
   // through this allocator is immediately released when the loop optimizer is done.
   ArenaAllocator* loop_allocator_;

+  // Global heap memory allocator. Used to build HIR.
+  ArenaAllocator* global_allocator_;
+
   // Entries into the loop hierarchy representation. The hierarchy resides
   // in phase-local heap memory.
   LoopNode* top_loop_;
@@ -102,11 +177,33 @@ class HLoopOptimization : public HOptimization {
   // Counter that tracks how many induction cycles have been simplified. Useful
   // to trigger incremental updates of induction variable analysis of outer loops
   // when the induction of inner loops has changed.
-  int32_t induction_simplication_count_;
+  uint32_t induction_simplication_count_;

   // Flag that tracks if any simplifications have occurred.
   bool simplified_;

+  // Number of "lanes" for selected packed type.
+  uint32_t vector_length_;
+
+  // Set of array references in the vector loop.
+  // Contents reside in phase-local heap memory.
+  ArenaSet<ArrayReference>* vector_refs_;
+
+  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
+  // loop (vector_mode_ is kSequential) and the actual vector loop (vector_mode_ is
+  // kVector). The data structure maps original instructions into the new instructions.
+  // Contents reside in phase-local heap memory.
+  ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
+
+  // Temporary vectorization bookkeeping.
+  HBasicBlock* vector_preheader_;  // preheader of the new loop
+  HBasicBlock* vector_header_;  // header of the new loop
+  HBasicBlock* vector_body_;  // body of the new loop
+  HInstruction* vector_runtime_test_a_;
+  HInstruction* vector_runtime_test_b_;  // defines a != b runtime test
+  HPhi* vector_phi_;  // the Phi representing the normalized loop index
+  VectorMode vector_mode_;  // selects synthesis mode
+
   friend class LoopOptimizationTest;

   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index ec706e6694..5617e4bfcb 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1088,6 +1088,19 @@ void HInstruction::ReplaceWith(HInstruction* other) {
   DCHECK(env_uses_.empty());
 }

+void HInstruction::ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement) {
+  const HUseList<HInstruction*>& uses = GetUses();
+  for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+    HInstruction* user = it->GetUser();
+    size_t index = it->GetIndex();
+    // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+ ++it; + if (dominator->StrictlyDominates(user)) { + user->ReplaceInput(replacement, index); + } + } +} + void HInstruction::ReplaceInput(HInstruction* replacement, size_t index) { HUserRecord<HInstruction*> input_use = InputRecordAt(index); if (input_use.GetInstruction() == replacement) { @@ -1323,6 +1336,18 @@ std::ostream& operator<<(std::ostream& os, const ComparisonBias& rhs) { } } +std::ostream& operator<<(std::ostream& os, const HDeoptimize::Kind& rhs) { + switch (rhs) { + case HDeoptimize::Kind::kBCE: + return os << "bce"; + case HDeoptimize::Kind::kInline: + return os << "inline"; + default: + LOG(FATAL) << "Unknown Deoptimization kind: " << static_cast<int>(rhs); + UNREACHABLE(); + } +} + bool HCondition::IsBeforeWhenDisregardMoves(HInstruction* instruction) const { return this == instruction->GetPreviousDisregardingMoves(); } @@ -2315,6 +2340,66 @@ void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) { new_pre_header, old_pre_header, /* replace_if_back_edge */ false); } +HBasicBlock* HGraph::TransformLoopForVectorization(HBasicBlock* header, + HBasicBlock* body, + HBasicBlock* exit) { + DCHECK(header->IsLoopHeader()); + HLoopInformation* loop = header->GetLoopInformation(); + + // Add new loop blocks. + HBasicBlock* new_pre_header = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* new_header = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* new_body = new (arena_) HBasicBlock(this, header->GetDexPc()); + AddBlock(new_pre_header); + AddBlock(new_header); + AddBlock(new_body); + + // Set up control flow. + header->ReplaceSuccessor(exit, new_pre_header); + new_pre_header->AddSuccessor(new_header); + new_header->AddSuccessor(exit); + new_header->AddSuccessor(new_body); + new_body->AddSuccessor(new_header); + + // Set up dominators. + header->ReplaceDominatedBlock(exit, new_pre_header); + new_pre_header->SetDominator(header); + new_pre_header->dominated_blocks_.push_back(new_header); + new_header->SetDominator(new_pre_header); + new_header->dominated_blocks_.push_back(new_body); + new_body->SetDominator(new_header); + new_header->dominated_blocks_.push_back(exit); + exit->SetDominator(new_header); + + // Fix reverse post order. + size_t index_of_header = IndexOfElement(reverse_post_order_, header); + MakeRoomFor(&reverse_post_order_, 2, index_of_header); + reverse_post_order_[++index_of_header] = new_pre_header; + reverse_post_order_[++index_of_header] = new_header; + size_t index_of_body = IndexOfElement(reverse_post_order_, body); + MakeRoomFor(&reverse_post_order_, 1, index_of_body - 1); + reverse_post_order_[index_of_body] = new_body; + + // Add gotos and suspend check (client must add conditional in header and copy environment). + new_pre_header->AddInstruction(new (arena_) HGoto()); + HSuspendCheck* suspend_check = new (arena_) HSuspendCheck(header->GetDexPc()); + new_header->AddInstruction(suspend_check); + new_body->AddInstruction(new (arena_) HGoto()); + + // Update loop information. 
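+  // The control flow built above (sketch):
+  //
+  //   header --> new_pre_header --> new_header --> exit
+  //                                    |     ^
+  //                                    v     |
+  //                                    new_body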
+ new_header->AddBackEdge(new_body); + new_header->GetLoopInformation()->SetSuspendCheck(suspend_check); + new_header->GetLoopInformation()->Populate(); + new_pre_header->SetLoopInformation(loop->GetPreHeader()->GetLoopInformation()); // outward + HLoopInformationOutwardIterator it(*new_header); + for (it.Advance(); !it.Done(); it.Advance()) { + it.Current()->Add(new_pre_header); + it.Current()->Add(new_header); + it.Current()->Add(new_body); + } + return new_pre_header; +} + static void CheckAgainstUpperBound(ReferenceTypeInfo rti, ReferenceTypeInfo upper_bound_rti) REQUIRES_SHARED(Locks::mutator_lock_) { if (rti.IsValid()) { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index fb0c889792..52a02c2285 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -400,6 +400,12 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { // put deoptimization instructions, etc. void TransformLoopHeaderForBCE(HBasicBlock* header); + // Adds a new loop directly after the loop with the given header and exit. + // Returns the new preheader. + HBasicBlock* TransformLoopForVectorization(HBasicBlock* header, + HBasicBlock* body, + HBasicBlock* exit); + // Removes `block` from the graph. Assumes `block` has been disconnected from // other blocks and has no instructions or phis. void DeleteDeadEmptyBlock(HBasicBlock* block); @@ -1363,6 +1369,25 @@ class HLoopInformationOutwardIterator : public ValueObject { M(TypeConversion, Instruction) \ M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ + M(VecReplicateScalar, VecUnaryOperation) \ + M(VecSetScalars, VecUnaryOperation) \ + M(VecSumReduce, VecUnaryOperation) \ + M(VecCnv, VecUnaryOperation) \ + M(VecNeg, VecUnaryOperation) \ + M(VecNot, VecUnaryOperation) \ + M(VecAdd, VecBinaryOperation) \ + M(VecSub, VecBinaryOperation) \ + M(VecMul, VecBinaryOperation) \ + M(VecDiv, VecBinaryOperation) \ + M(VecAnd, VecBinaryOperation) \ + M(VecAndNot, VecBinaryOperation) \ + M(VecOr, VecBinaryOperation) \ + M(VecXor, VecBinaryOperation) \ + M(VecShl, VecBinaryOperation) \ + M(VecShr, VecBinaryOperation) \ + M(VecUShr, VecBinaryOperation) \ + M(VecLoad, VecMemoryOperation) \ + M(VecStore, VecMemoryOperation) \ /* * Instructions, shared across several (not all) architectures. @@ -1424,7 +1449,11 @@ class HLoopInformationOutwardIterator : public ValueObject { M(Constant, Instruction) \ M(UnaryOperation, Instruction) \ M(BinaryOperation, Instruction) \ - M(Invoke, Instruction) + M(Invoke, Instruction) \ + M(VecOperation, Instruction) \ + M(VecUnaryOperation, VecOperation) \ + M(VecBinaryOperation, VecOperation) \ + M(VecMemoryOperation, VecOperation) #define FOR_EACH_INSTRUCTION(M) \ FOR_EACH_CONCRETE_INSTRUCTION(M) \ @@ -2081,6 +2110,7 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { void SetLocations(LocationSummary* locations) { locations_ = locations; } void ReplaceWith(HInstruction* instruction); + void ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); void ReplaceInput(HInstruction* replacement, size_t index); // This is almost the same as doing `ReplaceWith()`. But in this helper, the @@ -2944,28 +2974,97 @@ class HTryBoundary FINAL : public HTemplateInstruction<0> { }; // Deoptimize to interpreter, upon checking a condition. 
-class HDeoptimize FINAL : public HTemplateInstruction<1> { +class HDeoptimize FINAL : public HVariableInputSizeInstruction { public: + enum class Kind { + kBCE, + kInline, + kLast = kInline + }; + + // Use this constructor when the `HDeoptimize` acts as a barrier, where no code can move + // across. + HDeoptimize(ArenaAllocator* arena, HInstruction* cond, Kind kind, uint32_t dex_pc) + : HVariableInputSizeInstruction( + SideEffects::All(), + dex_pc, + arena, + /* number_of_inputs */ 1, + kArenaAllocMisc) { + SetPackedFlag<kFieldCanBeMoved>(false); + SetPackedField<DeoptimizeKindField>(kind); + SetRawInputAt(0, cond); + } + + // Use this constructor when the `HDeoptimize` guards an instruction, and any user + // that relies on the deoptimization to pass should have its input be the `HDeoptimize` + // instead of `guard`. // We set CanTriggerGC to prevent any intermediate address to be live // at the point of the `HDeoptimize`. - HDeoptimize(HInstruction* cond, uint32_t dex_pc) - : HTemplateInstruction(SideEffects::CanTriggerGC(), dex_pc) { + HDeoptimize(ArenaAllocator* arena, + HInstruction* cond, + HInstruction* guard, + Kind kind, + uint32_t dex_pc) + : HVariableInputSizeInstruction( + SideEffects::CanTriggerGC(), + dex_pc, + arena, + /* number_of_inputs */ 2, + kArenaAllocMisc) { + SetPackedFlag<kFieldCanBeMoved>(true); + SetPackedField<DeoptimizeKindField>(kind); SetRawInputAt(0, cond); + SetRawInputAt(1, guard); } - bool CanBeMoved() const OVERRIDE { return true; } - bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { - return true; + bool CanBeMoved() const OVERRIDE { return GetPackedFlag<kFieldCanBeMoved>(); } + + bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { + return (other->CanBeMoved() == CanBeMoved()) && (other->AsDeoptimize()->GetKind() == GetKind()); } + bool NeedsEnvironment() const OVERRIDE { return true; } + bool CanThrow() const OVERRIDE { return true; } + Kind GetKind() const { return GetPackedField<DeoptimizeKindField>(); } + + Primitive::Type GetType() const OVERRIDE { + return GuardsAnInput() ? GuardedInput()->GetType() : Primitive::kPrimVoid; + } + + bool GuardsAnInput() const { + return InputCount() == 2; + } + + HInstruction* GuardedInput() const { + DCHECK(GuardsAnInput()); + return InputAt(1); + } + + void RemoveGuard() { + RemoveInputAt(1); + } + DECLARE_INSTRUCTION(Deoptimize); private: + static constexpr size_t kFieldCanBeMoved = kNumberOfGenericPackedBits; + static constexpr size_t kFieldDeoptimizeKind = kNumberOfGenericPackedBits + 1; + static constexpr size_t kFieldDeoptimizeKindSize = + MinimumBitsToStore(static_cast<size_t>(Kind::kLast)); + static constexpr size_t kNumberOfDeoptimizePackedBits = + kFieldDeoptimizeKind + kFieldDeoptimizeKindSize; + static_assert(kNumberOfDeoptimizePackedBits <= kMaxNumberOfPackedBits, + "Too many packed fields."); + using DeoptimizeKindField = BitField<Kind, kFieldDeoptimizeKind, kFieldDeoptimizeKindSize>; + DISALLOW_COPY_AND_ASSIGN(HDeoptimize); }; +std::ostream& operator<<(std::ostream& os, const HDeoptimize::Kind& rhs); + // Represents a should_deoptimize flag. Currently used for CHA-based devirtualization. // The compiled code checks this flag value in a guard before devirtualized call and // if it's true, starts to do deoptimization. 
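+// Usage sketch for the guarding form of HDeoptimize above (`condition`, `guard`
+// and `deopt` are hypothetical names, not part of this change): users that rely
+// on the deoptimization take the HDeoptimize itself as input, since it forwards
+// the guarded value's type:
+//
+//   HDeoptimize* deopt = new (arena) HDeoptimize(
+//       arena, condition, guard, HDeoptimize::Kind::kInline, dex_pc);
+//   block->InsertInstructionBefore(deopt, use_site);
+//   guard->ReplaceUsesDominatedBy(deopt, deopt);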
@@ -6619,6 +6718,8 @@ class HParallelMove FINAL : public HTemplateInstruction<0> { } // namespace art +#include "nodes_vector.h" + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) #include "nodes_shared.h" #endif diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h new file mode 100644 index 0000000000..9f9b918f17 --- /dev/null +++ b/compiler/optimizing/nodes_vector.h @@ -0,0 +1,585 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ +#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ + +// This #include should never be used by compilation, because this header file (nodes_vector.h) +// is included in the header file nodes.h itself. However it gives editing tools better context. +#include "nodes.h" + +namespace art { + +// Memory alignment, represented as an offset relative to a base, where 0 <= offset < base, +// and base is a power of two. For example, the value Alignment(16, 0) means memory is +// perfectly aligned at a 16-byte boundary, whereas the value Alignment(16, 4) means +// memory is always exactly 4 bytes above such a boundary. +class Alignment { + public: + Alignment(size_t base, size_t offset) : base_(base), offset_(offset) { + DCHECK_LT(offset, base); + DCHECK(IsPowerOfTwo(base)); + } + + // Returns true if memory is "at least" aligned at the given boundary. + // Assumes requested base is power of two. + bool IsAlignedAt(size_t base) const { + DCHECK_NE(0u, base); + DCHECK(IsPowerOfTwo(base)); + return ((offset_ | base_) & (base - 1u)) == 0; + } + + std::string ToString() const { + return "ALIGN(" + std::to_string(base_) + "," + std::to_string(offset_) + ")"; + } + + private: + size_t base_; + size_t offset_; +}; + +// +// Definitions of abstract vector operations in HIR. +// + +// Abstraction of a vector operation, i.e., an operation that performs +// GetVectorLength() x GetPackedType() operations simultaneously. +class HVecOperation : public HVariableInputSizeInstruction { + public: + HVecOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + SideEffects side_effects, + size_t number_of_inputs, + size_t vector_length, + uint32_t dex_pc) + : HVariableInputSizeInstruction(side_effects, + dex_pc, + arena, + number_of_inputs, + kArenaAllocVectorNode), + vector_length_(vector_length) { + SetPackedField<TypeField>(packed_type); + DCHECK_LT(1u, vector_length); + } + + // Returns the number of elements packed in a vector. + size_t GetVectorLength() const { + return vector_length_; + } + + // Returns the number of bytes in a full vector. + size_t GetVectorNumberOfBytes() const { + return vector_length_ * Primitive::ComponentSize(GetPackedType()); + } + + // Returns the type of the vector operation: a SIMD operation looks like a FPU location. + // TODO: we could introduce SIMD types in HIR. 
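+  // E.g. (sketch): a 4 x int32 vector occupies one 128-bit SIMD register; the
+  // register allocator models that as a floating-point location, so returning
+  // kPrimDouble below keeps vector values out of core registers.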
+ Primitive::Type GetType() const OVERRIDE { + return Primitive::kPrimDouble; + } + + // Returns the true component type packed in a vector. + Primitive::Type GetPackedType() const { + return GetPackedField<TypeField>(); + } + + DECLARE_ABSTRACT_INSTRUCTION(VecOperation); + + private: + // Additional packed bits. + static constexpr size_t kFieldType = HInstruction::kNumberOfGenericPackedBits; + static constexpr size_t kFieldTypeSize = + MinimumBitsToStore(static_cast<size_t>(Primitive::kPrimLast)); + static constexpr size_t kNumberOfVectorOpPackedBits = kFieldType + kFieldTypeSize; + static_assert(kNumberOfVectorOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); + using TypeField = BitField<Primitive::Type, kFieldType, kFieldTypeSize>; + + const size_t vector_length_; + + DISALLOW_COPY_AND_ASSIGN(HVecOperation); +}; + +// Abstraction of a unary vector operation. +class HVecUnaryOperation : public HVecOperation { + public: + HVecUnaryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, + packed_type, + SideEffects::None(), + /*number_of_inputs*/ 1, + vector_length, + dex_pc) { } + DECLARE_ABSTRACT_INSTRUCTION(VecUnaryOperation); + private: + DISALLOW_COPY_AND_ASSIGN(HVecUnaryOperation); +}; + +// Abstraction of a binary vector operation. +class HVecBinaryOperation : public HVecOperation { + public: + HVecBinaryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, + packed_type, + SideEffects::None(), + /*number_of_inputs*/ 2, + vector_length, + dex_pc) { } + DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation); + private: + DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation); +}; + +// Abstraction of a vector operation that references memory, with an alignment. +// The Android runtime guarantees at least "component size" alignment for array +// elements and, thus, vectors. +class HVecMemoryOperation : public HVecOperation { + public: + HVecMemoryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + SideEffects side_effects, + size_t number_of_inputs, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), + alignment_(Primitive::ComponentSize(packed_type), 0) { } + + void SetAlignment(Alignment alignment) { alignment_ = alignment; } + + Alignment GetAlignment() const { return alignment_; } + + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); + + private: + Alignment alignment_; + + DISALLOW_COPY_AND_ASSIGN(HVecMemoryOperation); +}; + +// +// Definitions of concrete vector operations in HIR. +// + +// Replicates the given scalar into a vector, +// viz. replicate(x) = [ x, .. , x ]. +class HVecReplicateScalar FINAL : public HVecUnaryOperation { + public: + HVecReplicateScalar(ArenaAllocator* arena, + HInstruction* scalar, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + SetRawInputAt(0, scalar); + } + DECLARE_INSTRUCTION(VecReplicateScalar); + private: + DISALLOW_COPY_AND_ASSIGN(HVecReplicateScalar); +}; + +// Assigns the given scalar elements to a vector, +// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ]. 
+class HVecSetScalars FINAL : public HVecUnaryOperation {
+ public:
+  HVecSetScalars(ArenaAllocator* arena,
+                 HInstruction** scalars,  // array
+                 Primitive::Type packed_type,
+                 size_t vector_length,
+                 uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) {
+    for (size_t i = 0; i < vector_length; i++) {
+      SetRawInputAt(i, scalars[i]);  // the i-th scalar becomes the i-th input
+    }
+  }
+  DECLARE_INSTRUCTION(VecSetScalars);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
+};
+
+// Sum-reduces the given vector into a shorter vector (m < n) or scalar (m = 1),
+// viz. sum-reduce[ x1, .. , xn ] = [ y1, .., ym ], where yi = sum_j x_j.
+class HVecSumReduce FINAL : public HVecUnaryOperation {
+ public:
+  HVecSumReduce(ArenaAllocator* arena,
+                HInstruction* input,
+                Primitive::Type packed_type,
+                size_t vector_length,
+                uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(input->IsVecOperation());
+    DCHECK_EQ(input->AsVecOperation()->GetPackedType(), packed_type);
+    SetRawInputAt(0, input);
+  }
+
+  // TODO: probably integral promotion
+  Primitive::Type GetType() const OVERRIDE { return GetPackedType(); }
+
+  DECLARE_INSTRUCTION(VecSumReduce);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecSumReduce);
+};
+
+// Converts every component in the vector,
+// viz. cnv[ x1, .. , xn ] = [ cnv(x1), .. , cnv(xn) ].
+class HVecCnv FINAL : public HVecUnaryOperation {
+ public:
+  HVecCnv(ArenaAllocator* arena,
+          HInstruction* input,
+          Primitive::Type packed_type,
+          size_t vector_length,
+          uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(input->IsVecOperation());
+    DCHECK_NE(input->AsVecOperation()->GetPackedType(), packed_type);  // actual convert
+    SetRawInputAt(0, input);
+  }
+
+  Primitive::Type GetInputType() const { return InputAt(0)->AsVecOperation()->GetPackedType(); }
+  Primitive::Type GetResultType() const { return GetPackedType(); }
+
+  DECLARE_INSTRUCTION(VecCnv);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecCnv);
+};
+
+// Negates every component in the vector,
+// viz. neg[ x1, .. , xn ] = [ -x1, .. , -xn ].
+class HVecNeg FINAL : public HVecUnaryOperation {
+ public:
+  HVecNeg(ArenaAllocator* arena,
+          HInstruction* input,
+          Primitive::Type packed_type,
+          size_t vector_length,
+          uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(input->IsVecOperation());
+    DCHECK_EQ(input->AsVecOperation()->GetPackedType(), packed_type);
+    SetRawInputAt(0, input);
+  }
+  DECLARE_INSTRUCTION(VecNeg);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecNeg);
+};
+
+// Bitwise- or boolean-nots every component in the vector,
+// viz. not[ x1, .. , xn ] = [ ~x1, .. , ~xn ], or
+//      not[ x1, .. , xn ] = [ !x1, .. , !xn ] for boolean.
+class HVecNot FINAL : public HVecUnaryOperation {
+ public:
+  HVecNot(ArenaAllocator* arena,
+          HInstruction* input,
+          Primitive::Type packed_type,
+          size_t vector_length,
+          uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(input->IsVecOperation());
+    SetRawInputAt(0, input);
+  }
+  DECLARE_INSTRUCTION(VecNot);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecNot);
+};
+
+// Adds every component in the two vectors,
+// viz. [ x1, .. , xn ] + [ y1, .. , yn ] = [ x1 + y1, .. , xn + yn ].
+class HVecAdd FINAL : public HVecBinaryOperation { + public: + HVecAdd(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAdd); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAdd); +}; + +// Subtracts every component in the two vectors, +// viz. [ x1, .. , xn ] - [ y1, .. , yn ] = [ x1 - y1, .. , xn - yn ]. +class HVecSub FINAL : public HVecBinaryOperation { + public: + HVecSub(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecSub); + private: + DISALLOW_COPY_AND_ASSIGN(HVecSub); +}; + +// Multiplies every component in the two vectors, +// viz. [ x1, .. , xn ] * [ y1, .. , yn ] = [ x1 * y1, .. , xn * yn ]. +class HVecMul FINAL : public HVecBinaryOperation { + public: + HVecMul(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecMul); + private: + DISALLOW_COPY_AND_ASSIGN(HVecMul); +}; + +// Divides every component in the two vectors, +// viz. [ x1, .. , xn ] / [ y1, .. , yn ] = [ x1 / y1, .. , xn / yn ]. +class HVecDiv FINAL : public HVecBinaryOperation { + public: + HVecDiv(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecDiv); + private: + DISALLOW_COPY_AND_ASSIGN(HVecDiv); +}; + +// Bitwise-ands every component in the two vectors, +// viz. [ x1, .. , xn ] & [ y1, .. , yn ] = [ x1 & y1, .. , xn & yn ]. 
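+// For example, [ 0b1100, 0b1010 ] & [ 0b1010, 0b0110 ] = [ 0b1000, 0b0010 ].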
+class HVecAnd FINAL : public HVecBinaryOperation { + public: + HVecAnd(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAnd); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAnd); +}; + +// Bitwise-and-nots every component in the two vectors, +// viz. [ x1, .. , xn ] and-not [ y1, .. , yn ] = [ ~x1 & y1, .. , ~xn & yn ]. +class HVecAndNot FINAL : public HVecBinaryOperation { + public: + HVecAndNot(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAndNot); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAndNot); +}; + +// Bitwise-ors every component in the two vectors, +// viz. [ x1, .. , xn ] | [ y1, .. , yn ] = [ x1 | y1, .. , xn | yn ]. +class HVecOr FINAL : public HVecBinaryOperation { + public: + HVecOr(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecOr); + private: + DISALLOW_COPY_AND_ASSIGN(HVecOr); +}; + +// Bitwise-xors every component in the two vectors, +// viz. [ x1, .. , xn ] ^ [ y1, .. , yn ] = [ x1 ^ y1, .. , xn ^ yn ]. +class HVecXor FINAL : public HVecBinaryOperation { + public: + HVecXor(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecXor); + private: + DISALLOW_COPY_AND_ASSIGN(HVecXor); +}; + +// Logically shifts every component in the vector left by the given distance, +// viz. [ x1, .. , xn ] << d = [ x1 << d, .. , xn << d ]. +class HVecShl FINAL : public HVecBinaryOperation { + public: + HVecShl(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecShl); + private: + DISALLOW_COPY_AND_ASSIGN(HVecShl); +}; + +// Arithmetically shifts every component in the vector right by the given distance, +// viz. [ x1, .. , xn ] >> d = [ x1 >> d, .. , xn >> d ]. 
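+// For example, [ -8, 8, -1, 1 ] >> 1 = [ -4, 4, -1, 0 ], since the sign bit is replicated.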
+class HVecShr FINAL : public HVecBinaryOperation {
+ public:
+  HVecShr(ArenaAllocator* arena,
+          HInstruction* left,
+          HInstruction* right,
+          Primitive::Type packed_type,
+          size_t vector_length,
+          uint32_t dex_pc = kNoDexPc)
+      : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(left->IsVecOperation());
+    DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type);
+    SetRawInputAt(0, left);
+    SetRawInputAt(1, right);
+  }
+  DECLARE_INSTRUCTION(VecShr);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecShr);
+};
+
+// Logically shifts every component in the vector right by the given distance,
+// viz. [ x1, .. , xn ] >>> d = [ x1 >>> d, .. , xn >>> d ].
+class HVecUShr FINAL : public HVecBinaryOperation {
+ public:
+  HVecUShr(ArenaAllocator* arena,
+           HInstruction* left,
+           HInstruction* right,
+           Primitive::Type packed_type,
+           size_t vector_length,
+           uint32_t dex_pc = kNoDexPc)
+      : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) {
+    DCHECK(left->IsVecOperation());
+    DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type);
+    SetRawInputAt(0, left);
+    SetRawInputAt(1, right);
+  }
+  DECLARE_INSTRUCTION(VecUShr);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecUShr);
+};
+
+// Loads a vector from memory, viz. load(mem, 1)
+// yields the vector [ mem(1), .. , mem(n) ].
+class HVecLoad FINAL : public HVecMemoryOperation {
+ public:
+  HVecLoad(ArenaAllocator* arena,
+           HInstruction* base,
+           HInstruction* index,
+           Primitive::Type packed_type,
+           size_t vector_length,
+           uint32_t dex_pc = kNoDexPc)
+      : HVecMemoryOperation(arena,
+                            packed_type,
+                            SideEffects::ArrayReadOfType(packed_type),
+                            /*number_of_inputs*/ 2,
+                            vector_length,
+                            dex_pc) {
+    SetRawInputAt(0, base);
+    SetRawInputAt(1, index);
+  }
+  DECLARE_INSTRUCTION(VecLoad);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecLoad);
+};
+
+// Stores a vector to memory, viz. store(mem, 1, [ x1, .. , xn ])
+// sets mem(1) = x1, .. , mem(n) = xn.
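+// For example, store(mem, 1, [ 5, 6, 7, 8 ]) leaves mem(1) = 5, .. , mem(4) = 8.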
+class HVecStore FINAL : public HVecMemoryOperation { + public: + HVecStore(ArenaAllocator* arena, + HInstruction* base, + HInstruction* index, + HInstruction* value, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecMemoryOperation(arena, + packed_type, + SideEffects::ArrayWriteOfType(packed_type), + /*number_of_inputs*/ 3, + vector_length, + dex_pc) { + DCHECK(value->IsVecOperation()); + DCHECK_EQ(value->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, base); + SetRawInputAt(1, index); + SetRawInputAt(2, value); + } + DECLARE_INSTRUCTION(VecStore); + private: + DISALLOW_COPY_AND_ASSIGN(HVecStore); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 3c6d2d64a9..eb88fdee84 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -454,6 +454,8 @@ static bool IsInstructionSetSupported(InstructionSet instruction_set) { static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) { return instruction_set == kArm64 || instruction_set == kThumb2 + || instruction_set == kMips + || instruction_set == kMips64 || instruction_set == kX86 || instruction_set == kX86_64; } diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc index efbaf6c221..66bfea9860 100644 --- a/compiler/optimizing/prepare_for_register_allocation.cc +++ b/compiler/optimizing/prepare_for_register_allocation.cc @@ -40,6 +40,14 @@ void PrepareForRegisterAllocation::VisitDivZeroCheck(HDivZeroCheck* check) { check->ReplaceWith(check->InputAt(0)); } +void PrepareForRegisterAllocation::VisitDeoptimize(HDeoptimize* deoptimize) { + if (deoptimize->GuardsAnInput()) { + // Replace the uses with the actual guarded instruction. 
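+    // After this point the deoptimize node only consumes its condition; the
+    // value it used to guard flows to its former users directly.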
+ deoptimize->ReplaceWith(deoptimize->GuardedInput()); + deoptimize->RemoveGuard(); + } +} + void PrepareForRegisterAllocation::VisitBoundsCheck(HBoundsCheck* check) { check->ReplaceWith(check->InputAt(0)); if (check->IsStringCharAt()) { diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h index c128227654..7ffbe44ef6 100644 --- a/compiler/optimizing/prepare_for_register_allocation.h +++ b/compiler/optimizing/prepare_for_register_allocation.h @@ -44,6 +44,7 @@ class PrepareForRegisterAllocation : public HGraphDelegateVisitor { void VisitClinitCheck(HClinitCheck* check) OVERRIDE; void VisitCondition(HCondition* condition) OVERRIDE; void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE; + void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE; bool CanMoveClinitCheck(HInstruction* input, HInstruction* user) const; bool CanEmitConditionAt(HCondition* condition, HInstruction* user) const; diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc index 6e332ca59b..d5637b9b75 100644 --- a/compiler/optimizing/reference_type_propagation.cc +++ b/compiler/optimizing/reference_type_propagation.cc @@ -310,8 +310,8 @@ static void BoundTypeForClassCheck(HInstruction* check) { BoundTypeIn(receiver, trueBlock, /* start_instruction */ nullptr, class_rti); } else { DCHECK(check->IsDeoptimize()); - if (compare->IsEqual()) { - BoundTypeIn(receiver, check->GetBlock(), check, class_rti); + if (compare->IsEqual() && check->AsDeoptimize()->GuardsAnInput()) { + check->SetReferenceTypeInfo(class_rti); } } } diff --git a/compiler/optimizing/reference_type_propagation_test.cc b/compiler/optimizing/reference_type_propagation_test.cc index 84a4bab1a9..0b49ce1a4c 100644 --- a/compiler/optimizing/reference_type_propagation_test.cc +++ b/compiler/optimizing/reference_type_propagation_test.cc @@ -29,7 +29,7 @@ namespace art { */ class ReferenceTypePropagationTest : public CommonCompilerTest { public: - ReferenceTypePropagationTest() : pool_(), allocator_(&pool_) { + ReferenceTypePropagationTest() : pool_(), allocator_(&pool_), propagation_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index ab0dad4300..9236a0e4fa 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -315,7 +315,10 @@ class SchedulingLatencyVisitor : public HGraphDelegateVisitor { // This class and its sub-classes will never be used to drive a visit of an // `HGraph` but only to visit `HInstructions` one at a time, so we do not need // to pass a valid graph to `HGraphDelegateVisitor()`. - SchedulingLatencyVisitor() : HGraphDelegateVisitor(nullptr) {} + SchedulingLatencyVisitor() + : HGraphDelegateVisitor(nullptr), + last_visited_latency_(0), + last_visited_internal_latency_(0) {} void VisitInstruction(HInstruction* instruction) OVERRIDE { LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". 
" @@ -413,6 +416,7 @@ class HScheduler { selector_(selector), only_optimize_loop_blocks_(true), scheduling_graph_(this, arena), + cursor_(nullptr), candidates_(arena_->Adapter(kArenaAllocScheduler)) {} virtual ~HScheduler() {} diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index 36ee5a903a..b538a89a06 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -470,7 +470,12 @@ bool LiveInterval::SameRegisterKind(Location other) const { } size_t LiveInterval::NumberOfSpillSlotsNeeded() const { - // TODO: detect vector operation. + // For a SIMD operation, compute the number of needed spill slots. + // TODO: do through vector type? + HInstruction* definition = GetParent()->GetDefinedBy(); + if (definition != nullptr && definition->IsVecOperation()) { + return definition->AsVecOperation()->GetVectorNumberOfBytes() / kVRegSize; + } // Return number of needed spill slots based on type. return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1; } diff --git a/compiler/optimizing/ssa_liveness_analysis_test.cc b/compiler/optimizing/ssa_liveness_analysis_test.cc index 1916c73ca4..a1016d1d47 100644 --- a/compiler/optimizing/ssa_liveness_analysis_test.cc +++ b/compiler/optimizing/ssa_liveness_analysis_test.cc @@ -189,13 +189,14 @@ TEST_F(SsaLivenessAnalysisTest, TestDeoptimize) { // Use HAboveOrEqual+HDeoptimize as the bounds check. HInstruction* ae = new (&allocator_) HAboveOrEqual(index, length); block->AddInstruction(ae); - HInstruction* deoptimize = new(&allocator_) HDeoptimize(ae, /* dex_pc */ 0u); + HInstruction* deoptimize = + new(&allocator_) HDeoptimize(&allocator_, ae, HDeoptimize::Kind::kBCE, /* dex_pc */ 0u); block->AddInstruction(deoptimize); HEnvironment* deoptimize_env = new (&allocator_) HEnvironment(&allocator_, - /* number_of_vregs */ 5, - /* method */ nullptr, - /* dex_pc */ 0u, - deoptimize); + /* number_of_vregs */ 5, + /* method */ nullptr, + /* dex_pc */ 0u, + deoptimize); deoptimize_env->CopyFrom(args); deoptimize->SetRawEnvironment(deoptimize_env); HInstruction* array_set = diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h index d265a44092..f655994bd3 100644 --- a/compiler/utils/assembler_test.h +++ b/compiler/utils/assembler_test.h @@ -309,7 +309,7 @@ class AssemblerTest : public testing::Test { template <typename RegType, typename ImmType> std::string RepeatTemplatedRegisterImmBits(void (Ass::*f)(RegType, ImmType), int imm_bits, - const std::vector<Reg*> registers, + const std::vector<RegType*> registers, std::string (AssemblerTest::*GetName)(const RegType&), const std::string& fmt, int bias) { @@ -573,6 +573,19 @@ class AssemblerTest : public testing::Test { } template <typename ImmType> + std::string RepeatVIb(void (Ass::*f)(VecReg, ImmType), + int imm_bits, + std::string fmt, + int bias = 0) { + return RepeatTemplatedRegisterImmBits<VecReg, ImmType>(f, + imm_bits, + GetVectorRegisters(), + &AssemblerTest::GetVecRegName, + fmt, + bias); + } + + template <typename ImmType> std::string RepeatVRIb(void (Ass::*f)(VecReg, Reg, ImmType), int imm_bits, const std::string& fmt, diff --git a/compiler/utils/atomic_method_ref_map-inl.h b/compiler/utils/atomic_method_ref_map-inl.h index d71c2fe997..ad3a099eb6 100644 --- a/compiler/utils/atomic_method_ref_map-inl.h +++ b/compiler/utils/atomic_method_ref_map-inl.h @@ -42,7 +42,7 @@ template <typename T> inline bool AtomicMethodRefMap<T>::Get(MethodReference ref, T* out) const 
{ const ElementArray* const array = GetArray(ref.dex_file); if (array == nullptr) { - return kInsertResultInvalidDexFile; + return false; } *out = (*array)[ref.dex_method_index].LoadRelaxed(); return true; diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index 8a5ae754df..0cff44d830 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -252,6 +252,22 @@ void Mips64Assembler::EmitMsaMI10(int s10, Emit(encoding); } +void Mips64Assembler::EmitMsaI10(int operation, + int df, + int i10, + VectorRegister wd, + int minor_opcode) { + CHECK_NE(wd, kNoVectorRegister); + CHECK(IsUint<10>(i10)) << i10; + uint32_t encoding = static_cast<uint32_t>(kMsaMajorOpcode) << kOpcodeShift | + operation << kMsaOperationShift | + df << kDfShift | + i10 << kI10Shift | + static_cast<uint32_t>(wd) << kWdShift | + minor_opcode; + Emit(encoding); +} + void Mips64Assembler::EmitMsa2R(int operation, int df, VectorRegister ws, @@ -1581,6 +1597,30 @@ void Mips64Assembler::FillD(VectorRegister wd, GpuRegister rs) { EmitMsa2R(0xc0, 0x3, static_cast<VectorRegister>(rs), wd, 0x1e); } +void Mips64Assembler::LdiB(VectorRegister wd, int imm8) { + CHECK(HasMsa()); + CHECK(IsInt<8>(imm8)) << imm8; + EmitMsaI10(0x6, 0x0, imm8 & kMsaS10Mask, wd, 0x7); +} + +void Mips64Assembler::LdiH(VectorRegister wd, int imm10) { + CHECK(HasMsa()); + CHECK(IsInt<10>(imm10)) << imm10; + EmitMsaI10(0x6, 0x1, imm10 & kMsaS10Mask, wd, 0x7); +} + +void Mips64Assembler::LdiW(VectorRegister wd, int imm10) { + CHECK(HasMsa()); + CHECK(IsInt<10>(imm10)) << imm10; + EmitMsaI10(0x6, 0x2, imm10 & kMsaS10Mask, wd, 0x7); +} + +void Mips64Assembler::LdiD(VectorRegister wd, int imm10) { + CHECK(HasMsa()); + CHECK(IsInt<10>(imm10)) << imm10; + EmitMsaI10(0x6, 0x3, imm10 & kMsaS10Mask, wd, 0x7); +} + void Mips64Assembler::LdB(VectorRegister wd, GpuRegister rs, int offset) { CHECK(HasMsa()); CHECK(IsInt<10>(offset)) << offset; @@ -1661,6 +1701,7 @@ void Mips64Assembler::Addiu32(GpuRegister rt, GpuRegister rs, int32_t value) { } } +// TODO: don't use rtmp, use daui, dahi, dati. 
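+// (On MIPS64R6, daui, dahi and dati add an immediate at bit positions 16, 32
+// and 48 respectively, which should allow materializing a 64-bit constant
+// without a scratch register.)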
void Mips64Assembler::Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp) { if (IsInt<16>(value)) { Daddiu(rt, rs, value); diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h index a8035b6da4..666c6935a1 100644 --- a/compiler/utils/mips64/assembler_mips64.h +++ b/compiler/utils/mips64/assembler_mips64.h @@ -734,6 +734,10 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer void FillW(VectorRegister wd, GpuRegister rs); void FillD(VectorRegister wd, GpuRegister rs); + void LdiB(VectorRegister wd, int imm8); + void LdiH(VectorRegister wd, int imm10); + void LdiW(VectorRegister wd, int imm10); + void LdiD(VectorRegister wd, int imm10); void LdB(VectorRegister wd, GpuRegister rs, int offset); void LdH(VectorRegister wd, GpuRegister rs, int offset); void LdW(VectorRegister wd, GpuRegister rs, int offset); @@ -1457,6 +1461,7 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer void EmitMsaBIT(int operation, int df_m, VectorRegister ws, VectorRegister wd, int minor_opcode); void EmitMsaELM(int operation, int df_n, VectorRegister ws, VectorRegister wd, int minor_opcode); void EmitMsaMI10(int s10, GpuRegister rs, VectorRegister wd, int minor_opcode, int df); + void EmitMsaI10(int operation, int df, int i10, VectorRegister wd, int minor_opcode); void EmitMsa2R(int operation, int df, VectorRegister ws, VectorRegister wd, int minor_opcode); void EmitMsa2RF(int operation, int df, VectorRegister ws, VectorRegister wd, int minor_opcode); diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc index cadbe27819..f2e3b1610c 100644 --- a/compiler/utils/mips64/assembler_mips64_test.cc +++ b/compiler/utils/mips64/assembler_mips64_test.cc @@ -2836,6 +2836,22 @@ TEST_F(AssemblerMIPS64Test, FillD) { DriverStr(RepeatVR(&mips64::Mips64Assembler::FillD, "fill.d ${reg1}, ${reg2}"), "fill.d"); } +TEST_F(AssemblerMIPS64Test, LdiB) { + DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b"); +} + +TEST_F(AssemblerMIPS64Test, LdiH) { + DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiH, -10, "ldi.h ${reg}, {imm}"), "ldi.h"); +} + +TEST_F(AssemblerMIPS64Test, LdiW) { + DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiW, -10, "ldi.w ${reg}, {imm}"), "ldi.w"); +} + +TEST_F(AssemblerMIPS64Test, LdiD) { + DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiD, -10, "ldi.d ${reg}, {imm}"), "ldi.d"); +} + TEST_F(AssemblerMIPS64Test, LdB) { DriverStr(RepeatVRIb(&mips64::Mips64Assembler::LdB, -10, "ld.b ${reg1}, {imm}(${reg2})"), "ld.b"); } diff --git a/compiler/utils/mips64/constants_mips64.h b/compiler/utils/mips64/constants_mips64.h index 5ae9c73589..bc8e40b437 100644 --- a/compiler/utils/mips64/constants_mips64.h +++ b/compiler/utils/mips64/constants_mips64.h @@ -66,6 +66,7 @@ enum InstructionFields { kWdShift = 6, kWdBits = 5, kS10Shift = 16, + kI10Shift = 11, kS10MinorShift = 2, kBranchOffsetMask = 0x0000ffff, diff --git a/compiler/utils/mips64/managed_register_mips64.cc b/compiler/utils/mips64/managed_register_mips64.cc index dea396e4a7..42d061ec15 100644 --- a/compiler/utils/mips64/managed_register_mips64.cc +++ b/compiler/utils/mips64/managed_register_mips64.cc @@ -26,6 +26,11 @@ bool Mips64ManagedRegister::Overlaps(const Mips64ManagedRegister& other) const { CHECK(IsValidManagedRegister()); CHECK(other.IsValidManagedRegister()); if (Equals(other)) return true; + if (IsFpuRegister() && other.IsVectorRegister()) { 
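+    // With MSA, FPU register Fn is the low 64 bits of vector register Wn,
+    // so the two overlap exactly when their indices match.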
+ return (AsFpuRegister() == other.AsOverlappingFpuRegister()); + } else if (IsVectorRegister() && other.IsFpuRegister()) { + return (AsVectorRegister() == other.AsOverlappingVectorRegister()); + } return false; } @@ -36,6 +41,8 @@ void Mips64ManagedRegister::Print(std::ostream& os) const { os << "GPU: " << static_cast<int>(AsGpuRegister()); } else if (IsFpuRegister()) { os << "FpuRegister: " << static_cast<int>(AsFpuRegister()); + } else if (IsVectorRegister()) { + os << "VectorRegister: " << static_cast<int>(AsVectorRegister()); } else { os << "??: " << RegId(); } diff --git a/compiler/utils/mips64/managed_register_mips64.h b/compiler/utils/mips64/managed_register_mips64.h index c9f95569cf..3980199b1e 100644 --- a/compiler/utils/mips64/managed_register_mips64.h +++ b/compiler/utils/mips64/managed_register_mips64.h @@ -30,11 +30,27 @@ const int kNumberOfGpuAllocIds = kNumberOfGpuRegisters; const int kNumberOfFpuRegIds = kNumberOfFpuRegisters; const int kNumberOfFpuAllocIds = kNumberOfFpuRegisters; -const int kNumberOfRegIds = kNumberOfGpuRegIds + kNumberOfFpuRegIds; -const int kNumberOfAllocIds = kNumberOfGpuAllocIds + kNumberOfFpuAllocIds; - -// An instance of class 'ManagedRegister' represents a single GPU register (enum -// Register) or a double precision FP register (enum FpuRegister) +const int kNumberOfVecRegIds = kNumberOfVectorRegisters; +const int kNumberOfVecAllocIds = kNumberOfVectorRegisters; + +const int kNumberOfRegIds = kNumberOfGpuRegIds + kNumberOfFpuRegIds + kNumberOfVecRegIds; +const int kNumberOfAllocIds = kNumberOfGpuAllocIds + kNumberOfFpuAllocIds + kNumberOfVecAllocIds; + +// Register ids map: +// [0..R[ core registers (enum GpuRegister) +// [R..F[ floating-point registers (enum FpuRegister) +// [F..W[ MSA vector registers (enum VectorRegister) +// where +// R = kNumberOfGpuRegIds +// F = R + kNumberOfFpuRegIds +// W = F + kNumberOfVecRegIds + +// An instance of class 'ManagedRegister' represents a single Mips64 register. +// A register can be one of the following: +// * core register (enum GpuRegister) +// * floating-point register (enum FpuRegister) +// * MSA vector register (enum VectorRegister) +// // 'ManagedRegister::NoRegister()' provides an invalid register. // There is a one-to-one mapping between ManagedRegister and register id. class Mips64ManagedRegister : public ManagedRegister { @@ -49,6 +65,21 @@ class Mips64ManagedRegister : public ManagedRegister { return static_cast<FpuRegister>(id_ - kNumberOfGpuRegIds); } + constexpr VectorRegister AsVectorRegister() const { + CHECK(IsVectorRegister()); + return static_cast<VectorRegister>(id_ - (kNumberOfGpuRegIds + kNumberOfFpuRegisters)); + } + + constexpr FpuRegister AsOverlappingFpuRegister() const { + CHECK(IsValidManagedRegister()); + return static_cast<FpuRegister>(AsVectorRegister()); + } + + constexpr VectorRegister AsOverlappingVectorRegister() const { + CHECK(IsValidManagedRegister()); + return static_cast<VectorRegister>(AsFpuRegister()); + } + constexpr bool IsGpuRegister() const { CHECK(IsValidManagedRegister()); return (0 <= id_) && (id_ < kNumberOfGpuRegIds); @@ -60,6 +91,12 @@ class Mips64ManagedRegister : public ManagedRegister { return (0 <= test) && (test < kNumberOfFpuRegIds); } + constexpr bool IsVectorRegister() const { + CHECK(IsValidManagedRegister()); + const int test = id_ - (kNumberOfGpuRegIds + kNumberOfFpuRegIds); + return (0 <= test) && (test < kNumberOfVecRegIds); + } + void Print(std::ostream& os) const; // Returns true if the two managed-registers ('this' and 'other') overlap. 
@@ -77,6 +114,11 @@ class Mips64ManagedRegister : public ManagedRegister { return FromRegId(r + kNumberOfGpuRegIds); } + static constexpr Mips64ManagedRegister FromVectorRegister(VectorRegister r) { + CHECK_NE(r, kNoVectorRegister); + return FromRegId(r + kNumberOfGpuRegIds + kNumberOfFpuRegIds); + } + private: constexpr bool IsValidManagedRegister() const { return (0 <= id_) && (id_ < kNumberOfRegIds); diff --git a/compiler/utils/mips64/managed_register_mips64_test.cc b/compiler/utils/mips64/managed_register_mips64_test.cc new file mode 100644 index 0000000000..8b72d7e61d --- /dev/null +++ b/compiler/utils/mips64/managed_register_mips64_test.cc @@ -0,0 +1,480 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "managed_register_mips64.h" +#include "globals.h" +#include "gtest/gtest.h" + +namespace art { +namespace mips64 { + +TEST(Mips64ManagedRegister, NoRegister) { + Mips64ManagedRegister reg = ManagedRegister::NoRegister().AsMips64(); + EXPECT_TRUE(reg.IsNoRegister()); + EXPECT_FALSE(reg.Overlaps(reg)); +} + +TEST(Mips64ManagedRegister, GpuRegister) { + Mips64ManagedRegister reg = Mips64ManagedRegister::FromGpuRegister(ZERO); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(ZERO, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(AT); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(AT, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(V0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(V0, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(A0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(A0, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(A7); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(A7, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(T0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(T0, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(T3); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(T3, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(S0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + 
EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(S0, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(GP); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(GP, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(SP); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(SP, reg.AsGpuRegister()); + + reg = Mips64ManagedRegister::FromGpuRegister(RA); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_TRUE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_EQ(RA, reg.AsGpuRegister()); +} + +TEST(Mips64ManagedRegister, FpuRegister) { + Mips64ManagedRegister reg = Mips64ManagedRegister::FromFpuRegister(F0); + Mips64ManagedRegister vreg = Mips64ManagedRegister::FromVectorRegister(W0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_TRUE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(vreg)); + EXPECT_EQ(F0, reg.AsFpuRegister()); + EXPECT_EQ(W0, reg.AsOverlappingVectorRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + + reg = Mips64ManagedRegister::FromFpuRegister(F1); + vreg = Mips64ManagedRegister::FromVectorRegister(W1); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_TRUE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(vreg)); + EXPECT_EQ(F1, reg.AsFpuRegister()); + EXPECT_EQ(W1, reg.AsOverlappingVectorRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromFpuRegister(F1))); + + reg = Mips64ManagedRegister::FromFpuRegister(F20); + vreg = Mips64ManagedRegister::FromVectorRegister(W20); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_TRUE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(vreg)); + EXPECT_EQ(F20, reg.AsFpuRegister()); + EXPECT_EQ(W20, reg.AsOverlappingVectorRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromFpuRegister(F20))); + + reg = Mips64ManagedRegister::FromFpuRegister(F31); + vreg = Mips64ManagedRegister::FromVectorRegister(W31); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_TRUE(reg.IsFpuRegister()); + EXPECT_FALSE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(vreg)); + EXPECT_EQ(F31, reg.AsFpuRegister()); + EXPECT_EQ(W31, reg.AsOverlappingVectorRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromFpuRegister(F31))); +} + +TEST(Mips64ManagedRegister, VectorRegister) { + Mips64ManagedRegister reg = Mips64ManagedRegister::FromVectorRegister(W0); + Mips64ManagedRegister freg = Mips64ManagedRegister::FromFpuRegister(F0); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_TRUE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(freg)); + EXPECT_EQ(W0, reg.AsVectorRegister()); + EXPECT_EQ(F0, reg.AsOverlappingFpuRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + reg = Mips64ManagedRegister::FromVectorRegister(W2); + freg = Mips64ManagedRegister::FromFpuRegister(F2); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_TRUE(reg.IsVectorRegister()); + 
EXPECT_TRUE(reg.Overlaps(freg)); + EXPECT_EQ(W2, reg.AsVectorRegister()); + EXPECT_EQ(F2, reg.AsOverlappingFpuRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromVectorRegister(W2))); + + reg = Mips64ManagedRegister::FromVectorRegister(W13); + freg = Mips64ManagedRegister::FromFpuRegister(F13); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_TRUE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(freg)); + EXPECT_EQ(W13, reg.AsVectorRegister()); + EXPECT_EQ(F13, reg.AsOverlappingFpuRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromVectorRegister(W13))); + + reg = Mips64ManagedRegister::FromVectorRegister(W29); + freg = Mips64ManagedRegister::FromFpuRegister(F29); + EXPECT_FALSE(reg.IsNoRegister()); + EXPECT_FALSE(reg.IsGpuRegister()); + EXPECT_FALSE(reg.IsFpuRegister()); + EXPECT_TRUE(reg.IsVectorRegister()); + EXPECT_TRUE(reg.Overlaps(freg)); + EXPECT_EQ(W29, reg.AsVectorRegister()); + EXPECT_EQ(F29, reg.AsOverlappingFpuRegister()); + EXPECT_TRUE(reg.Equals(Mips64ManagedRegister::FromVectorRegister(W29))); +} + +TEST(Mips64ManagedRegister, Equals) { + ManagedRegister no_reg = ManagedRegister::NoRegister(); + EXPECT_TRUE(no_reg.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(no_reg.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(no_reg.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(no_reg.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_FALSE(no_reg.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(no_reg.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_ZERO = Mips64ManagedRegister::FromGpuRegister(ZERO); + EXPECT_FALSE(reg_ZERO.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_TRUE(reg_ZERO.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_ZERO.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_ZERO.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_FALSE(reg_ZERO.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_ZERO.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_A1 = Mips64ManagedRegister::FromGpuRegister(A1); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_TRUE(reg_A1.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_A1.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_S2 = Mips64ManagedRegister::FromGpuRegister(S2); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::FromGpuRegister(S1))); + EXPECT_TRUE(reg_S2.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_S2.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_F0 = Mips64ManagedRegister::FromFpuRegister(F0); + 
EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_TRUE(reg_F0.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromFpuRegister(F1))); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg_F0.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_F31 = Mips64ManagedRegister::FromFpuRegister(F31); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromGpuRegister(S2))); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromFpuRegister(F1))); + EXPECT_TRUE(reg_F31.Equals(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg_F31.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + + Mips64ManagedRegister reg_W0 = Mips64ManagedRegister::FromVectorRegister(W0); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromGpuRegister(S1))); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_TRUE(reg_W0.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromVectorRegister(W1))); + EXPECT_FALSE(reg_W0.Equals(Mips64ManagedRegister::FromVectorRegister(W31))); + + Mips64ManagedRegister reg_W31 = Mips64ManagedRegister::FromVectorRegister(W31); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::NoRegister())); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromGpuRegister(A1))); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromGpuRegister(S1))); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg_W31.Equals(Mips64ManagedRegister::FromVectorRegister(W1))); + EXPECT_TRUE(reg_W31.Equals(Mips64ManagedRegister::FromVectorRegister(W31))); +} + +TEST(Mips64ManagedRegister, Overlaps) { + Mips64ManagedRegister reg = Mips64ManagedRegister::FromFpuRegister(F0); + Mips64ManagedRegister reg_o = Mips64ManagedRegister::FromVectorRegister(W0); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(F0, reg_o.AsOverlappingFpuRegister()); + EXPECT_EQ(W0, reg.AsOverlappingVectorRegister()); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + 
EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromFpuRegister(F4); + reg_o = Mips64ManagedRegister::FromVectorRegister(W4); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(F4, reg_o.AsOverlappingFpuRegister()); + EXPECT_EQ(W4, reg.AsOverlappingVectorRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromFpuRegister(F16); + reg_o = Mips64ManagedRegister::FromVectorRegister(W16); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(F16, reg_o.AsOverlappingFpuRegister()); + EXPECT_EQ(W16, reg.AsOverlappingVectorRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromFpuRegister(F31); + reg_o = Mips64ManagedRegister::FromVectorRegister(W31); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(F31, reg_o.AsOverlappingFpuRegister()); + EXPECT_EQ(W31, reg.AsOverlappingVectorRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + 
EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromVectorRegister(W0); + reg_o = Mips64ManagedRegister::FromFpuRegister(F0); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(W0, reg_o.AsOverlappingVectorRegister()); + EXPECT_EQ(F0, reg.AsOverlappingFpuRegister()); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromVectorRegister(W4); + reg_o = Mips64ManagedRegister::FromFpuRegister(F4); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(W4, reg_o.AsOverlappingVectorRegister()); + EXPECT_EQ(F4, reg.AsOverlappingFpuRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromVectorRegister(W16); + reg_o = Mips64ManagedRegister::FromFpuRegister(F16); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(W16, reg_o.AsOverlappingVectorRegister()); + EXPECT_EQ(F16, reg.AsOverlappingFpuRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + 
EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromVectorRegister(W31); + reg_o = Mips64ManagedRegister::FromFpuRegister(F31); + EXPECT_TRUE(reg.Overlaps(reg_o)); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_EQ(W31, reg_o.AsOverlappingVectorRegister()); + EXPECT_EQ(F31, reg.AsOverlappingFpuRegister()); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromGpuRegister(ZERO); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromGpuRegister(A0); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromGpuRegister(S0); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + 
EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); + + reg = Mips64ManagedRegister::FromGpuRegister(RA); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(ZERO))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(A0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(S0))); + EXPECT_TRUE(reg.Overlaps(Mips64ManagedRegister::FromGpuRegister(RA))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromFpuRegister(F31))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W0))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W4))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W16))); + EXPECT_FALSE(reg.Overlaps(Mips64ManagedRegister::FromVectorRegister(W31))); +} + +} // namespace mips64 +} // namespace art diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 5307dc09d9..9c934b7f39 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -1221,6 +1221,24 @@ void X86Assembler::por(XmmRegister dst, XmmRegister src) { } +void X86Assembler::pavgb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xE0); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::pavgw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0xE3); + EmitXmmRegisterOperand(dst, src); +} + + void X86Assembler::pcmpeqb(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x66); diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index f52cf16c8b..b87522a017 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -495,6 +495,9 @@ class X86Assembler FINAL : public Assembler { void orps(XmmRegister dst, XmmRegister src); void por(XmmRegister dst, XmmRegister src); + void pavgb(XmmRegister dst, XmmRegister src); // no addr variant (for now) + void pavgw(XmmRegister dst, XmmRegister src); + void pcmpeqb(XmmRegister dst, XmmRegister src); void pcmpeqw(XmmRegister dst, XmmRegister src); void pcmpeqd(XmmRegister dst, XmmRegister src); diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc index 23049079e0..a01eb6dc23 100644 --- a/compiler/utils/x86/assembler_x86_test.cc +++ b/compiler/utils/x86/assembler_x86_test.cc @@ -605,6 +605,14 @@ TEST_F(AssemblerX86Test, POr) { 
DriverStr(RepeatFF(&x86::X86Assembler::por, "por %{reg2}, %{reg1}"), "por"); } +TEST_F(AssemblerX86Test, PAvgB) { + DriverStr(RepeatFF(&x86::X86Assembler::pavgb, "pavgb %{reg2}, %{reg1}"), "pavgb"); +} + +TEST_F(AssemblerX86Test, PAvgW) { + DriverStr(RepeatFF(&x86::X86Assembler::pavgw, "pavgw %{reg2}, %{reg1}"), "pavgw"); +} + TEST_F(AssemblerX86Test, PCmpeqB) { DriverStr(RepeatFF(&x86::X86Assembler::pcmpeqb, "pcmpeqb %{reg2}, %{reg1}"), "cmpeqb"); } diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index d20a6965c3..488c75de41 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -1427,6 +1427,24 @@ void X86_64Assembler::por(XmmRegister dst, XmmRegister src) { EmitXmmRegisterOperand(dst.LowBits(), src); } +void X86_64Assembler::pavgb(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xE0); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + +void X86_64Assembler::pavgw(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitOptionalRex32(dst, src); + EmitUint8(0x0F); + EmitUint8(0xE3); + EmitXmmRegisterOperand(dst.LowBits(), src); +} + void X86_64Assembler::pcmpeqb(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x66); diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index 08e17e81e5..fc2b117f71 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -523,6 +523,9 @@ class X86_64Assembler FINAL : public Assembler { void orps(XmmRegister dst, XmmRegister src); void por(XmmRegister dst, XmmRegister src); + void pavgb(XmmRegister dst, XmmRegister src); // no addr variant (for now) + void pavgw(XmmRegister dst, XmmRegister src); + void pcmpeqb(XmmRegister dst, XmmRegister src); void pcmpeqw(XmmRegister dst, XmmRegister src); void pcmpeqd(XmmRegister dst, XmmRegister src); diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc index 20062fdb07..4adf210e47 100644 --- a/compiler/utils/x86_64/assembler_x86_64_test.cc +++ b/compiler/utils/x86_64/assembler_x86_64_test.cc @@ -1293,6 +1293,14 @@ TEST_F(AssemblerX86_64Test, Por) { DriverStr(RepeatFF(&x86_64::X86_64Assembler::por, "por %{reg2}, %{reg1}"), "por"); } +TEST_F(AssemblerX86_64Test, Pavgb) { + DriverStr(RepeatFF(&x86_64::X86_64Assembler::pavgb, "pavgb %{reg2}, %{reg1}"), "pavgb"); +} + +TEST_F(AssemblerX86_64Test, Pavgw) { + DriverStr(RepeatFF(&x86_64::X86_64Assembler::pavgw, "pavgw %{reg2}, %{reg1}"), "pavgw"); +} + TEST_F(AssemblerX86_64Test, PCmpeqb) { DriverStr(RepeatFF(&x86_64::X86_64Assembler::pcmpeqb, "pcmpeqb %{reg2}, %{reg1}"), "pcmpeqb"); } diff --git a/compiler/verifier_deps_test.cc b/compiler/verifier_deps_test.cc index 4bfc84990d..fa7e98586c 100644 --- a/compiler/verifier_deps_test.cc +++ b/compiler/verifier_deps_test.cc @@ -18,21 +18,21 @@ #include "verifier/verifier_deps.h" #include "class_linker.h" -#include "compiler/common_compiler_test.h" -#include "compiler/dex/verification_results.h" -#include "compiler/dex/verified_method.h" -#include "compiler/driver/compiler_options.h" -#include "compiler/driver/compiler_driver.h" -#include "compiler/utils/atomic_method_ref_map-inl.h" +#include "common_compiler_test.h" #include "compiler_callbacks.h" +#include 
"dex/verification_results.h" +#include "dex/verified_method.h" #include "dex_file.h" #include "dex_file_types.h" +#include "driver/compiler_options.h" +#include "driver/compiler_driver.h" #include "handle_scope-inl.h" #include "verifier/method_verifier-inl.h" #include "mirror/class_loader.h" #include "runtime.h" #include "thread.h" #include "scoped_thread_state_change-inl.h" +#include "utils/atomic_method_ref_map-inl.h" namespace art { namespace verifier { |