63 files changed, 3384 insertions, 937 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 46f3358af1..f6a4db49fb 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -80,6 +80,7 @@ art_cc_defaults {
         "optimizing/register_allocator_graph_color.cc",
         "optimizing/register_allocator_linear_scan.cc",
         "optimizing/select_generator.cc",
+        "optimizing/scheduler.cc",
         "optimizing/sharpening.cc",
         "optimizing/side_effects_analysis.cc",
         "optimizing/ssa_builder.cc",
@@ -123,6 +124,7 @@ art_cc_defaults {
                 "jni/quick/arm64/calling_convention_arm64.cc",
                 "linker/arm64/relative_patcher_arm64.cc",
                 "optimizing/code_generator_arm64.cc",
+                "optimizing/scheduler_arm64.cc",
                 "optimizing/instruction_simplifier_arm64.cc",
                 "optimizing/intrinsics_arm64.cc",
                 "optimizing/nodes_arm64.cc",
@@ -362,6 +364,7 @@ art_cc_test {
         "jni/jni_cfi_test.cc",
         "optimizing/codegen_test.cc",
         "optimizing/optimizing_cfi_test.cc",
+        "optimizing/scheduler_test.cc",
     ],
 
     codegen: {
diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h
index f056dd3c00..f296851ebf 100644
--- a/compiler/driver/compiler_driver-inl.h
+++ b/compiler/driver/compiler_driver-inl.h
@@ -135,65 +135,6 @@ inline bool CompilerDriver::CanAccessResolvedMember<ArtMethod>(
   return referrer_class->CanAccessResolvedMethod(access_to, method, dex_cache, field_idx);
 }
 
-template <typename ArtMember>
-inline std::pair<bool, bool> CompilerDriver::IsClassOfStaticMemberAvailableToReferrer(
-    mirror::DexCache* dex_cache,
-    mirror::Class* referrer_class,
-    ArtMember* resolved_member,
-    uint16_t member_idx,
-    dex::TypeIndex* storage_index) {
-  DCHECK(resolved_member->IsStatic());
-  if (LIKELY(referrer_class != nullptr)) {
-    ObjPtr<mirror::Class> members_class = resolved_member->GetDeclaringClass();
-    if (members_class == referrer_class) {
-      *storage_index = members_class->GetDexTypeIndex();
-      return std::make_pair(true, true);
-    }
-    if (CanAccessResolvedMember<ArtMember>(
-        referrer_class, members_class.Ptr(), resolved_member, dex_cache, member_idx)) {
-      // We have the resolved member, we must make it into a index for the referrer
-      // in its static storage (which may fail if it doesn't have a slot for it)
-      // TODO: for images we can elide the static storage base null check
-      // if we know there's a non-null entry in the image
-      const DexFile* dex_file = dex_cache->GetDexFile();
-      dex::TypeIndex storage_idx(DexFile::kDexNoIndex16);
-      if (LIKELY(members_class->GetDexCache() == dex_cache)) {
-        // common case where the dex cache of both the referrer and the member are the same,
-        // no need to search the dex file
-        storage_idx = members_class->GetDexTypeIndex();
-      } else {
-        // Search dex file for localized ssb index, may fail if member's class is a parent
-        // of the class mentioned in the dex file and there is no dex cache entry.
-        storage_idx = resolved_member->GetDeclaringClass()->FindTypeIndexInOtherDexFile(*dex_file);
-      }
-      if (storage_idx.IsValid()) {
-        *storage_index = storage_idx;
-        return std::make_pair(true, !resolved_member->IsFinal());
-      }
-    }
-  }
-  // Conservative defaults.
-  *storage_index = dex::TypeIndex(DexFile::kDexNoIndex16);
-  return std::make_pair(false, false);
-}
-
-inline std::pair<bool, bool> CompilerDriver::IsFastStaticField(
-    mirror::DexCache* dex_cache, mirror::Class* referrer_class,
-    ArtField* resolved_field, uint16_t field_idx, dex::TypeIndex* storage_index) {
-  return IsClassOfStaticMemberAvailableToReferrer(
-      dex_cache, referrer_class, resolved_field, field_idx, storage_index);
-}
-
-inline bool CompilerDriver::IsClassOfStaticMethodAvailableToReferrer(
-    mirror::DexCache* dex_cache, mirror::Class* referrer_class,
-    ArtMethod* resolved_method, uint16_t method_idx, dex::TypeIndex* storage_index) {
-  std::pair<bool, bool> result = IsClassOfStaticMemberAvailableToReferrer(
-      dex_cache, referrer_class, resolved_method, method_idx, storage_index);
-  // Only the first member of `result` is meaningful, as there is no
-  // "write access" to a method.
-  return result.first;
-}
-
 inline ArtMethod* CompilerDriver::ResolveMethod(
     ScopedObjectAccess& soa, Handle<mirror::DexCache> dex_cache,
     Handle<mirror::ClassLoader> class_loader, const DexCompilationUnit* mUnit,
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index c03ffcaa31..1d4eaf8c5a 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -529,9 +529,15 @@ static optimizer::DexToDexCompilationLevel GetDexToDexCompilationLevel(
   // We store the verification information in the class status in the oat file, which the linker
   // can validate (checksums) and use to skip load-time verification. It is thus safe to
   // optimize when a class has been fully verified before.
+  optimizer::DexToDexCompilationLevel max_level = optimizer::DexToDexCompilationLevel::kOptimize;
+  if (driver.GetCompilerOptions().GetDebuggable()) {
+    // We are debuggable so definitions of classes might be changed. We don't want to do any
+    // optimizations that could break that.
+    max_level = optimizer::DexToDexCompilationLevel::kRequired;
+  }
   if (klass->IsVerified()) {
     // Class is verified so we can enable DEX-to-DEX compilation for performance.
-    return optimizer::DexToDexCompilationLevel::kOptimize;
+    return max_level;
   } else if (klass->IsCompileTimeVerified()) {
     // Class verification has soft-failed. Anyway, ensure at least correctness.
     DCHECK_EQ(klass->GetStatus(), mirror::Class::kStatusRetryVerificationAtRuntime);
@@ -940,6 +946,31 @@ inline void CompilerDriver::CheckThreadPools() {
   DCHECK(single_thread_pool_ != nullptr);
 }
 
+static void EnsureVerifiedOrVerifyAtRuntime(jobject jclass_loader,
+                                            const std::vector<const DexFile*>& dex_files) {
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<2> hs(soa.Self());
+  Handle<mirror::ClassLoader> class_loader(
+      hs.NewHandle(soa.Decode<mirror::ClassLoader>(jclass_loader)));
+  MutableHandle<mirror::Class> cls(hs.NewHandle<mirror::Class>(nullptr));
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+
+  for (const DexFile* dex_file : dex_files) {
+    for (uint32_t i = 0; i < dex_file->NumClassDefs(); ++i) {
+      const DexFile::ClassDef& class_def = dex_file->GetClassDef(i);
+      const char* descriptor = dex_file->GetClassDescriptor(class_def);
+      cls.Assign(class_linker->FindClass(soa.Self(), descriptor, class_loader));
+      if (cls.Get() == nullptr) {
+        soa.Self()->ClearException();
+      } else if (&cls->GetDexFile() == dex_file) {
+        DCHECK(cls->IsErroneous() || cls->IsVerified() || cls->IsCompileTimeVerified())
+            << cls->PrettyClass()
+            << " " << cls->GetStatus();
+      }
+    }
+  }
+}
+
 void CompilerDriver::PreCompile(jobject class_loader,
                                 const std::vector<const DexFile*>& dex_files,
                                 TimingLogger* timings) {
@@ -984,6 +1015,9 @@ void CompilerDriver::PreCompile(jobject class_loader,
   }
 
   if (compiler_options_->IsAnyMethodCompilationEnabled()) {
+    if (kIsDebugBuild) {
+      EnsureVerifiedOrVerifyAtRuntime(class_loader, dex_files);
+    }
     InitializeClasses(class_loader, dex_files, timings);
     VLOG(compiler) << "InitializeClasses: " << GetMemoryUsageString(false);
   }
@@ -1034,23 +1068,6 @@ bool CompilerDriver::ShouldCompileBasedOnProfile(const MethodReference& method_r
   return result;
 }
 
-bool CompilerDriver::ShouldVerifyClassBasedOnProfile(const DexFile& dex_file,
-                                                     uint16_t class_idx) const {
-  if (!compiler_options_->VerifyOnlyProfile()) {
-    // No profile, verify everything.
-    return true;
-  }
-  DCHECK(profile_compilation_info_ != nullptr);
-  const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_idx);
-  dex::TypeIndex type_idx = class_def.class_idx_;
-  bool result = profile_compilation_info_->ContainsClass(dex_file, type_idx);
-  if (kDebugProfileGuidedCompilation) {
-    LOG(INFO) << "[ProfileGuidedCompilation] " << (result ? "Verified" : "Skipped") << " method:"
-        << dex_file.GetClassDescriptor(class_def);
-  }
-  return result;
-}
-
 class ResolveCatchBlockExceptionsClassVisitor : public ClassVisitor {
  public:
   explicit ResolveCatchBlockExceptionsClassVisitor(
@@ -1949,6 +1966,31 @@ static void PopulateVerifiedMethods(const DexFile& dex_file,
   DCHECK(!it.HasNext());
 }
 
+static void LoadAndUpdateStatus(const DexFile& dex_file,
+                                const DexFile::ClassDef& class_def,
+                                mirror::Class::Status status,
+                                Handle<mirror::ClassLoader> class_loader,
+                                Thread* self)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  StackHandleScope<1> hs(self);
+  const char* descriptor = dex_file.GetClassDescriptor(class_def);
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  Handle<mirror::Class> cls(hs.NewHandle<mirror::Class>(
+      class_linker->FindClass(self, descriptor, class_loader)));
+  if (cls.Get() != nullptr) {
+    // Check that the class is resolved with the current dex file. We might get
+    // a boot image class, or a class in a different dex file for multidex, and
+    // we should not update the status in that case.
+    if (&cls->GetDexFile() == &dex_file) {
+      ObjectLock<mirror::Class> lock(self, cls);
+      mirror::Class::SetStatus(cls, status, self);
+    }
+  } else {
+    DCHECK(self->IsExceptionPending());
+    self->ClearException();
+  }
+}
+
 bool CompilerDriver::FastVerify(jobject jclass_loader,
                                 const std::vector<const DexFile*>& dex_files,
                                 TimingLogger* timings) {
@@ -1963,12 +2005,12 @@ bool CompilerDriver::FastVerify(jobject jclass_loader,
   StackHandleScope<2> hs(soa.Self());
   Handle<mirror::ClassLoader> class_loader(
       hs.NewHandle(soa.Decode<mirror::ClassLoader>(jclass_loader)));
-  MutableHandle<mirror::Class> cls(hs.NewHandle<mirror::Class>(nullptr));
-  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   if (!verifier_deps->ValidateDependencies(class_loader, soa.Self())) {
     return false;
   }
 
+  bool compiler_only_verifies = !GetCompilerOptions().IsAnyMethodCompilationEnabled();
+
   // We successfully validated the dependencies, now update class status
   // of verified classes. Note that the dependencies also record which classes
   // could not be fully verified; we could try again, but that would hurt verification
@@ -1983,28 +2025,16 @@ bool CompilerDriver::FastVerify(jobject jclass_loader,
     for (uint32_t i = 0; i < dex_file->NumClassDefs(); ++i) {
       const DexFile::ClassDef& class_def = dex_file->GetClassDef(i);
       if (set.find(class_def.class_idx_) == set.end()) {
-        if (!GetCompilerOptions().IsAnyMethodCompilationEnabled()) {
+        if (compiler_only_verifies) {
           // Just update the compiled_classes_ map. The compiler doesn't need to resolve
           // the type.
           compiled_classes_.Overwrite(
               ClassReference(dex_file, i), new CompiledClass(mirror::Class::kStatusVerified));
         } else {
-          // Resolve the type, so later compilation stages know they don't need to verify
+          // Update the class status, so later compilation stages know they don't need to verify
           // the class.
-          const char* descriptor = dex_file->GetClassDescriptor(class_def);
-          cls.Assign(class_linker->FindClass(soa.Self(), descriptor, class_loader));
-          if (cls.Get() != nullptr) {
-            // Check that the class is resolved with the current dex file. We might get
-            // a boot image class, or a class in a different dex file for multidex, and
-            // we should not update the status in that case.
-            if (&cls->GetDexFile() == dex_file) {
-              ObjectLock<mirror::Class> lock(soa.Self(), cls);
-              mirror::Class::SetStatus(cls, mirror::Class::kStatusVerified, soa.Self());
-            }
-          } else {
-            DCHECK(soa.Self()->IsExceptionPending());
-            soa.Self()->ClearException();
-          }
+          LoadAndUpdateStatus(
+              *dex_file, class_def, mirror::Class::kStatusVerified, class_loader, soa.Self());
           // Create `VerifiedMethod`s for each methods, the compiler expects one for
           // quickening or compiling.
           // Note that this means:
@@ -2013,6 +2043,14 @@ bool CompilerDriver::FastVerify(jobject jclass_loader,
           // TODO(ngeoffray): Reconsider this once we refactor compiler filters.
           PopulateVerifiedMethods(*dex_file, i, verification_results_);
         }
+      } else if (!compiler_only_verifies) {
+        // Make sure later compilation stages know they should not try to verify
+        // this class again.
+        LoadAndUpdateStatus(*dex_file,
+                            class_def,
+                            mirror::Class::kStatusRetryVerificationAtRuntime,
+                            class_loader,
+                            soa.Self());
       }
     }
   }
@@ -2077,13 +2115,6 @@ class VerifyClassVisitor : public CompilationVisitor {
     ATRACE_CALL();
     ScopedObjectAccess soa(Thread::Current());
     const DexFile& dex_file = *manager_->GetDexFile();
-    if (!manager_->GetCompiler()->ShouldVerifyClassBasedOnProfile(dex_file, class_def_index)) {
-      // Skip verification since the class is not in the profile, and let the VerifierDeps know
-      // that the class will need to be verified at runtime.
-      verifier::VerifierDeps::MaybeRecordVerificationStatus(
-          dex_file, dex::TypeIndex(class_def_index), verifier::MethodVerifier::kSoftFailure);
-      return;
-    }
     const DexFile::ClassDef& class_def = dex_file.GetClassDef(class_def_index);
     const char* descriptor = dex_file.GetClassDescriptor(class_def);
     ClassLinker* class_linker = manager_->GetClassLinker();
@@ -2199,7 +2230,7 @@ class SetVerifiedClassVisitor : public CompilationVisitor {
     if (klass.Get() != nullptr) {
       // Only do this if the class is resolved. If even resolution fails, quickening will go very,
       // very wrong.
-      if (klass->IsResolved()) {
+      if (klass->IsResolved() && !klass->IsErroneousResolved()) {
         if (klass->GetStatus() < mirror::Class::kStatusVerified) {
           ObjectLock<mirror::Class> lock(soa.Self(), klass);
           // Set class status to verified.
@@ -2626,7 +2657,8 @@ CompiledClass* CompilerDriver::GetCompiledClass(ClassReference ref) const {
 void CompilerDriver::RecordClassStatus(ClassReference ref, mirror::Class::Status status) {
   switch (status) {
     case mirror::Class::kStatusNotReady:
-    case mirror::Class::kStatusError:
+    case mirror::Class::kStatusErrorResolved:
+    case mirror::Class::kStatusErrorUnresolved:
     case mirror::Class::kStatusRetryVerificationAtRuntime:
     case mirror::Class::kStatusVerified:
     case mirror::Class::kStatusInitialized:
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 503fe3adfc..5b4c751c4a 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -233,27 +233,6 @@ class CompilerDriver {
       ArtField* resolved_field, uint16_t field_idx)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Can we fast-path an SGET/SPUT access to a static field? If yes, compute the type index
-  // of the declaring class in the referrer's dex file.
-  std::pair<bool, bool> IsFastStaticField(mirror::DexCache* dex_cache,
-                                          mirror::Class* referrer_class,
-                                          ArtField* resolved_field,
-                                          uint16_t field_idx,
-                                          dex::TypeIndex* storage_index)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
-  // Return whether the declaring class of `resolved_method` is
-  // available to `referrer_class`. If this is true, compute the type
-  // index of the declaring class in the referrer's dex file and
-  // return it through the out argument `storage_index`; otherwise
-  // return DexFile::kDexNoIndex through `storage_index`.
-  bool IsClassOfStaticMethodAvailableToReferrer(mirror::DexCache* dex_cache,
-                                                mirror::Class* referrer_class,
-                                                ArtMethod* resolved_method,
-                                                uint16_t method_idx,
-                                                dex::TypeIndex* storage_index)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Resolve a method. Returns null on failure, including incompatible class change.
   ArtMethod* ResolveMethod(
       ScopedObjectAccess& soa, Handle<mirror::DexCache> dex_cache,
@@ -379,21 +358,6 @@ class CompilerDriver {
   }
 
  private:
-  // Return whether the declaring class of `resolved_member` is
-  // available to `referrer_class` for read or write access using two
-  // Boolean values returned as a pair. If is true at least for read
-  // access, compute the type index of the declaring class in the
-  // referrer's dex file and return it through the out argument
-  // `storage_index`; otherwise return DexFile::kDexNoIndex through
-  // `storage_index`.
-  template <typename ArtMember>
-  std::pair<bool, bool> IsClassOfStaticMemberAvailableToReferrer(mirror::DexCache* dex_cache,
-                                                                 mirror::Class* referrer_class,
-                                                                 ArtMember* resolved_member,
-                                                                 uint16_t member_idx,
-                                                                 dex::TypeIndex* storage_index)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Can `referrer_class` access the resolved `member`?
   // Dispatch call to mirror::Class::CanAccessResolvedField or
   // mirror::Class::CanAccessResolvedMember depending on the value of
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 459aca3b38..c72edb18a3 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -756,7 +756,7 @@ bool ImageWriter::PruneAppImageClassInternal(
   bool my_early_exit = false;  // Only for ourselves, ignore caller.
   // Remove classes that failed to verify since we don't want to have java.lang.VerifyError in the
   // app image.
-  if (klass->GetStatus() == mirror::Class::kStatusError) {
+  if (klass->IsErroneous()) {
     result = true;
   } else {
     ObjPtr<mirror::ClassExt> ext(klass->GetExtData());
@@ -777,8 +777,8 @@ bool ImageWriter::PruneAppImageClassInternal(
                                                   visited);
   }
   // Check static fields and their classes.
-  size_t num_static_fields = klass->NumReferenceStaticFields();
-  if (num_static_fields != 0 && klass->IsResolved()) {
+  if (klass->IsResolved() && klass->NumReferenceStaticFields() != 0) {
+    size_t num_static_fields = klass->NumReferenceStaticFields();
     // Presumably GC can happen when we are cross compiling, it should not cause performance
     // problems to do pointer size logic.
     MemberOffset field_offset = klass->GetFirstReferenceStaticFieldOffset(
@@ -1154,7 +1154,7 @@ mirror::Object* ImageWriter::TryAssignBinSlot(WorkStack& work_stack,
       // Visit and assign offsets for fields and field arrays.
       mirror::Class* as_klass = obj->AsClass();
       mirror::DexCache* dex_cache = as_klass->GetDexCache();
-      DCHECK_NE(as_klass->GetStatus(), mirror::Class::kStatusError);
+      DCHECK(!as_klass->IsErroneous()) << as_klass->GetStatus();
       if (compile_app_image_) {
         // Extra sanity, no boot loader classes should be left!
         CHECK(!IsBootClassLoaderClass(as_klass)) << as_klass->PrettyClass();
@@ -2348,6 +2348,16 @@ const uint8_t* ImageWriter::GetQuickCode(ArtMethod* method,
 void ImageWriter::CopyAndFixupMethod(ArtMethod* orig,
                                      ArtMethod* copy,
                                      const ImageInfo& image_info) {
+  if (orig->IsAbstract()) {
+    // Ignore the single-implementation info for abstract method.
+    // Do this on orig instead of copy, otherwise there is a crash due to methods
+    // are copied before classes.
+    // TODO: handle fixup of single-implementation method for abstract method.
+    orig->SetHasSingleImplementation(false);
+    orig->SetSingleImplementation(
+        nullptr, Runtime::Current()->GetClassLinker()->GetImagePointerSize());
+  }
+
   memcpy(copy, orig, ArtMethod::Size(target_ptr_size_));
 
   copy->SetDeclaringClass(GetImageAddress(orig->GetDeclaringClassUnchecked()));
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index 148ce4f9ee..cbd831a60f 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -102,7 +102,7 @@ JitCompiler::JitCompiler() {
       /* no_inline_from */ nullptr,
       /* include_patch_information */ false,
       CompilerOptions::kDefaultTopKProfileThreshold,
-      Runtime::Current()->IsDebuggable(),
+      Runtime::Current()->IsJavaDebuggable(),
       CompilerOptions::kDefaultGenerateDebugInfo,
       /* implicit_null_checks */ true,
       /* implicit_so_checks */ true,
@@ -211,7 +211,7 @@ bool JitCompiler::CompileMethod(Thread* self, ArtMethod* method, bool osr) {
     JitCodeCache* const code_cache = runtime->GetJit()->GetCodeCache();
     success = compiler_driver_->GetCompiler()->JitCompile(self, code_cache, method, osr);
     if (success && (jit_logger_ != nullptr)) {
-      jit_logger_->WriteLog(code_cache, method);
+      jit_logger_->WriteLog(code_cache, method, osr);
     }
   }
 
diff --git a/compiler/jit/jit_logger.cc b/compiler/jit/jit_logger.cc
index 9ce3b0cfe8..aa4f66773a 100644
--- a/compiler/jit/jit_logger.cc
+++ b/compiler/jit/jit_logger.cc
@@ -23,6 +23,7 @@
 #include "driver/compiler_driver.h"
 #include "jit/jit.h"
 #include "jit/jit_code_cache.h"
+#include "oat_file-inl.h"
 
 namespace art {
 namespace jit {
@@ -49,9 +50,10 @@ void JitLogger::OpenPerfMapLog() {
   }
 }
 
-void JitLogger::WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method) {
+void JitLogger::WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method, bool osr) {
   if (perf_file_ != nullptr) {
-    const void* ptr = method->GetEntryPointFromQuickCompiledCode();
+    const void* ptr = osr ? code_cache->LookupOsrMethodHeader(method)->GetCode()
+                          : method->GetEntryPointFromQuickCompiledCode();
     size_t code_size = code_cache->GetMemorySizeOfCodePointer(ptr);
     std::string method_name = method->PrettyMethod();
 
@@ -268,9 +270,10 @@ void JitLogger::OpenJitDumpLog() {
   WriteJitDumpHeader();
 }
 
-void JitLogger::WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method) {
+void JitLogger::WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method, bool osr) {
   if (jit_dump_file_ != nullptr) {
-    const void* code = method->GetEntryPointFromQuickCompiledCode();
+    const void* code = osr ? code_cache->LookupOsrMethodHeader(method)->GetCode()
+                           : method->GetEntryPointFromQuickCompiledCode();
     size_t code_size = code_cache->GetMemorySizeOfCodePointer(code);
     std::string method_name = method->PrettyMethod();
 
diff --git a/compiler/jit/jit_logger.h b/compiler/jit/jit_logger.h
index 0f8cfe4e2f..460864e8a9 100644
--- a/compiler/jit/jit_logger.h
+++ b/compiler/jit/jit_logger.h
@@ -94,10 +94,10 @@ class JitLogger {
       OpenJitDumpLog();
     }
 
-    void WriteLog(JitCodeCache* code_cache, ArtMethod* method)
+    void WriteLog(JitCodeCache* code_cache, ArtMethod* method, bool osr)
         REQUIRES_SHARED(Locks::mutator_lock_) {
-      WritePerfMapLog(code_cache, method);
-      WriteJitDumpLog(code_cache, method);
+      WritePerfMapLog(code_cache, method, osr);
+      WriteJitDumpLog(code_cache, method, osr);
     }
 
     void CloseLog() {
@@ -108,13 +108,13 @@ class JitLogger {
   private:
     // For perf-map profiling
     void OpenPerfMapLog();
-    void WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method)
+    void WritePerfMapLog(JitCodeCache* code_cache, ArtMethod* method, bool osr)
         REQUIRES_SHARED(Locks::mutator_lock_);
     void ClosePerfMapLog();
 
     // For perf-inject profiling
     void OpenJitDumpLog();
-    void WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method)
+    void WriteJitDumpLog(JitCodeCache* code_cache, ArtMethod* method, bool osr)
         REQUIRES_SHARED(Locks::mutator_lock_);
     void CloseJitDumpLog();
 
diff --git a/compiler/linker/mips/relative_patcher_mips.cc b/compiler/linker/mips/relative_patcher_mips.cc
index c09950cd5d..fe5f9a948a 100644
--- a/compiler/linker/mips/relative_patcher_mips.cc
+++ b/compiler/linker/mips/relative_patcher_mips.cc
@@ -49,9 +49,12 @@ void MipsRelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
                                                    uint32_t target_offset) {
   uint32_t anchor_literal_offset = patch.PcInsnOffset();
   uint32_t literal_offset = patch.LiteralOffset();
+  uint32_t literal_low_offset;
   bool dex_cache_array = (patch.GetType() == LinkerPatch::Type::kDexCacheArray);
 
-  // Basic sanity checks.
+  // Perform basic sanity checks and initialize `literal_low_offset` to point
+  // to the instruction containing the 16 least significant bits of the
+  // relative address.
   if (is_r6) {
     DCHECK_GE(code->size(), 8u);
     DCHECK_LE(literal_offset, code->size() - 8u);
@@ -61,10 +64,10 @@ void MipsRelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
     DCHECK_EQ((*code)[literal_offset + 1], 0x12);
     DCHECK_EQ(((*code)[literal_offset + 2] & 0x1F), 0x1E);
     DCHECK_EQ(((*code)[literal_offset + 3] & 0xFC), 0xEC);
-    // ADDIU reg, reg, offset_low
+    // instr reg(s), offset_low
     DCHECK_EQ((*code)[literal_offset + 4], 0x78);
     DCHECK_EQ((*code)[literal_offset + 5], 0x56);
-    DCHECK_EQ(((*code)[literal_offset + 7] & 0xFC), 0x24);
+    literal_low_offset = literal_offset + 4;
   } else {
     DCHECK_GE(code->size(), 16u);
     DCHECK_LE(literal_offset, code->size() - 12u);
@@ -84,36 +87,34 @@ void MipsRelativePatcher::PatchPcRelativeReference(std::vector<uint8_t>* code,
     DCHECK_EQ((*code)[literal_offset + 1], 0x12);
     DCHECK_EQ(((*code)[literal_offset + 2] & 0xE0), 0x00);
     DCHECK_EQ((*code)[literal_offset + 3], 0x3C);
-    // ORI reg, reg, offset_low
-    DCHECK_EQ((*code)[literal_offset + 4], 0x78);
-    DCHECK_EQ((*code)[literal_offset + 5], 0x56);
-    DCHECK_EQ(((*code)[literal_offset + 7] & 0xFC), 0x34);
     // ADDU reg, reg, reg2
-    DCHECK_EQ((*code)[literal_offset + 8], 0x21);
-    DCHECK_EQ(((*code)[literal_offset + 9] & 0x07), 0x00);
+    DCHECK_EQ((*code)[literal_offset + 4], 0x21);
+    DCHECK_EQ(((*code)[literal_offset + 5] & 0x07), 0x00);
     if (dex_cache_array) {
       // reg2 is either RA or from HMipsComputeBaseMethodAddress.
-      DCHECK_EQ(((*code)[literal_offset + 10] & 0x1F), 0x1F);
+      DCHECK_EQ(((*code)[literal_offset + 6] & 0x1F), 0x1F);
     }
-    DCHECK_EQ(((*code)[literal_offset + 11] & 0xFC), 0x00);
+    DCHECK_EQ(((*code)[literal_offset + 7] & 0xFC), 0x00);
+    // instr reg(s), offset_low
+    DCHECK_EQ((*code)[literal_offset + 8], 0x78);
+    DCHECK_EQ((*code)[literal_offset + 9], 0x56);
+    literal_low_offset = literal_offset + 8;
   }
 
   // Apply patch.
   uint32_t anchor_offset = patch_offset - literal_offset + anchor_literal_offset;
   uint32_t diff = target_offset - anchor_offset;
-  if (dex_cache_array) {
+  if (dex_cache_array && !is_r6) {
     diff += kDexCacheArrayLwOffset;
   }
-  if (is_r6) {
-    diff += (diff & 0x8000) << 1;  // Account for sign extension in ADDIU.
-  }
+  diff += (diff & 0x8000) << 1;  // Account for sign extension in "instr reg(s), offset_low".
 
   // LUI reg, offset_high / AUIPC reg, offset_high
   (*code)[literal_offset + 0] = static_cast<uint8_t>(diff >> 16);
   (*code)[literal_offset + 1] = static_cast<uint8_t>(diff >> 24);
-  // ORI reg, reg, offset_low / ADDIU reg, reg, offset_low
-  (*code)[literal_offset + 4] = static_cast<uint8_t>(diff >> 0);
-  (*code)[literal_offset + 5] = static_cast<uint8_t>(diff >> 8);
+  // instr reg(s), offset_low
+  (*code)[literal_low_offset + 0] = static_cast<uint8_t>(diff >> 0);
+  (*code)[literal_low_offset + 1] = static_cast<uint8_t>(diff >> 8);
 }
 
 }  // namespace linker
diff --git a/compiler/linker/mips/relative_patcher_mips32r6_test.cc b/compiler/linker/mips/relative_patcher_mips32r6_test.cc
index 4f9a3a050a..474eb73e08 100644
--- a/compiler/linker/mips/relative_patcher_mips32r6_test.cc
+++ b/compiler/linker/mips/relative_patcher_mips32r6_test.cc
@@ -20,10 +20,6 @@
 namespace art {
 namespace linker {
 
-// We'll maximize the range of a single load instruction for dex cache array accesses
-// by aligning offset -32768 with the offset of the first used element.
-static constexpr uint32_t kDexCacheArrayLwOffset = 0x8000;
-
 class Mips32r6RelativePatcherTest : public RelativePatcherTest {
  public:
   Mips32r6RelativePatcherTest() : RelativePatcherTest(kMips, "mips32r6") {}
@@ -64,9 +60,6 @@ void Mips32r6RelativePatcherTest::CheckPcRelativePatch(const ArrayRef<const Link
   ASSERT_TRUE(result.first);
 
   uint32_t diff = target_offset - (result.second + kAnchorOffset);
-  if (patches[0].GetType() == LinkerPatch::Type::kDexCacheArray) {
-    diff += kDexCacheArrayLwOffset;
-  }
   diff += (diff & 0x8000) << 1;  // Account for sign extension in addiu.
 
   const uint8_t expected_code[] = {
diff --git a/compiler/linker/mips/relative_patcher_mips_test.cc b/compiler/linker/mips/relative_patcher_mips_test.cc
index faeb92a315..b0d1294cf4 100644
--- a/compiler/linker/mips/relative_patcher_mips_test.cc
+++ b/compiler/linker/mips/relative_patcher_mips_test.cc
@@ -47,12 +47,12 @@ class MipsRelativePatcherTest : public RelativePatcherTest {
 
 const uint8_t MipsRelativePatcherTest::kUnpatchedPcRelativeRawCode[] = {
     0x00, 0x00, 0x10, 0x04,  // nal
-    0x34, 0x12, 0x12, 0x3C,  // lui  s2, high(diff); placeholder = 0x1234
-    0x78, 0x56, 0x52, 0x36,  // ori  s2, s2, low(diff); placeholder = 0x5678
-    0x21, 0x90, 0x5F, 0x02,  // addu s2, s2, ra
+    0x34, 0x12, 0x12, 0x3C,  // lui   s2, high(diff); placeholder = 0x1234
+    0x21, 0x90, 0x5F, 0x02,  // addu  s2, s2, ra
+    0x78, 0x56, 0x52, 0x26,  // addiu s2, s2, low(diff); placeholder = 0x5678
 };
 const uint32_t MipsRelativePatcherTest::kLiteralOffset = 4;  // At lui (where patching starts).
-const uint32_t MipsRelativePatcherTest::kAnchorOffset = 8;  // At ori (where PC+0 points).
+const uint32_t MipsRelativePatcherTest::kAnchorOffset = 8;  // At addu (where PC+0 points).
 const ArrayRef<const uint8_t> MipsRelativePatcherTest::kUnpatchedPcRelativeCode(
     kUnpatchedPcRelativeRawCode);
 
@@ -68,12 +68,13 @@ void MipsRelativePatcherTest::CheckPcRelativePatch(const ArrayRef<const LinkerPa
   if (patches[0].GetType() == LinkerPatch::Type::kDexCacheArray) {
     diff += kDexCacheArrayLwOffset;
   }
+  diff += (diff & 0x8000) << 1;  // Account for sign extension in addiu.
 
   const uint8_t expected_code[] = {
       0x00, 0x00, 0x10, 0x04,
       static_cast<uint8_t>(diff >> 16), static_cast<uint8_t>(diff >> 24), 0x12, 0x3C,
-      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8), 0x52, 0x36,
       0x21, 0x90, 0x5F, 0x02,
+      static_cast<uint8_t>(diff), static_cast<uint8_t>(diff >> 8), 0x52, 0x26,
   };
   EXPECT_TRUE(CheckLinkedMethod(MethodRef(1u), ArrayRef<const uint8_t>(expected_code)));
 }
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 34b33a13a3..d5842a8c9d 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -487,7 +487,7 @@ TEST_F(OatTest, OatHeaderSizeCheck) {
   EXPECT_EQ(72U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(20U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(157 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(161 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index de5af97b9a..a16a34b299 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -716,7 +716,10 @@ class OatWriter::InitOatClassesMethodVisitor : public DexMethodVisitor {
     if (compiled_class != nullptr) {
       status = compiled_class->GetStatus();
     } else if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) {
-      status = mirror::Class::kStatusError;
+      // The oat class status is used only for verification of resolved classes,
+      // so use kStatusErrorResolved whether the class was resolved or unresolved
+      // during compile-time verification.
+      status = mirror::Class::kStatusErrorResolved;
     } else {
       status = mirror::Class::kStatusNotReady;
     }
@@ -2263,6 +2266,10 @@ bool OatWriter::LayoutAndWriteDexFile(OutputStream* out, OatDexFile* oat_dex_fil
     File* raw_file = oat_dex_file->source_.GetRawFile();
     dex_file = DexFile::OpenDex(raw_file->Fd(), location, /* verify_checksum */ true, &error_msg);
   }
+  if (dex_file == nullptr) {
+    LOG(ERROR) << "Failed to open dex file for layout:" << error_msg;
+    return false;
+  }
   Options options;
   options.output_to_memmap_ = true;
   DexLayout dex_layout(options, profile_compilation_info_, nullptr);
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 8cf4089eba..e4ad4222fb 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -32,6 +32,8 @@
 
 namespace art {
 
+class CodeGenerator;
+
 class HGraphBuilder : public ValueObject {
  public:
   HGraphBuilder(HGraph* graph,
@@ -40,6 +42,7 @@ class HGraphBuilder : public ValueObject {
                 const DexFile* dex_file,
                 const DexFile::CodeItem& code_item,
                 CompilerDriver* driver,
+                CodeGenerator* code_generator,
                 OptimizingCompilerStats* compiler_stats,
                 const uint8_t* interpreter_metadata,
                 Handle<mirror::DexCache> dex_cache,
@@ -61,6 +64,7 @@ class HGraphBuilder : public ValueObject {
                              dex_compilation_unit,
                              outer_compilation_unit,
                              driver,
+                             code_generator,
                              interpreter_metadata,
                              compiler_stats,
                              dex_cache,
@@ -89,6 +93,7 @@ class HGraphBuilder : public ValueObject {
                              /* dex_compilation_unit */ nullptr,
                              /* outer_compilation_unit */ nullptr,
                              /* compiler_driver */ nullptr,
+                             /* code_generator */ nullptr,
                              /* interpreter_metadata */ nullptr,
                              /* compiler_stats */ nullptr,
                              null_dex_cache_,
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 99427f05da..d68aa51b1b 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1417,4 +1417,22 @@ void CodeGenerator::EmitJitRoots(uint8_t* code,
   EmitJitRootPatches(code, roots_data);
 }
 
+QuickEntrypointEnum CodeGenerator::GetArrayAllocationEntrypoint(Handle<mirror::Class> array_klass) {
+  ScopedObjectAccess soa(Thread::Current());
+  if (array_klass.Get() == nullptr) {
+    // This can only happen for non-primitive arrays, as primitive arrays can always
+    // be resolved.
+    return kQuickAllocArrayResolved32;
+  }
+
+  switch (array_klass->GetComponentSize()) {
+    case 1: return kQuickAllocArrayResolved8;
+    case 2: return kQuickAllocArrayResolved16;
+    case 4: return kQuickAllocArrayResolved32;
+    case 8: return kQuickAllocArrayResolved64;
+  }
+  LOG(FATAL) << "Unreachable";
+  return kQuickAllocArrayResolved;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 2d129aff22..b912672792 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -573,6 +573,8 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> {
   uint32_t GetReferenceSlowFlagOffset() const;
   uint32_t GetReferenceDisableFlagOffset() const;
 
+  static QuickEntrypointEnum GetArrayAllocationEntrypoint(Handle<mirror::Class> array_klass);
+
  protected:
   // Patch info used for recording locations of required linker patches and their targets,
   // i.e. target method, string, type or code identified by their dex file and index.
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f5b6ebef9c..20cdae3619 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -3993,8 +3993,11 @@ void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorARM::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
+  DCHECK(!codegen_->IsLeafMethod());
 }
 
 void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) {
@@ -5719,6 +5722,9 @@ void ParallelMoveResolverARM::RestoreScratch(int reg) {
 HLoadClass::LoadKind CodeGeneratorARM::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
@@ -5849,6 +5855,7 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 9762ee81b1..598be4715b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1452,6 +1452,19 @@ static bool CoherentConstantAndType(Location constant, Primitive::Type type) {
          (cst->IsDoubleConstant() && type == Primitive::kPrimDouble);
 }
 
+// Allocate a scratch register from the VIXL pool, querying first into
+// the floating-point register pool, and then the the core register
+// pool.  This is essentially a reimplementation of
+// vixl::aarch64::UseScratchRegisterScope::AcquireCPURegisterOfSize
+// using a different allocation strategy.
+static CPURegister AcquireFPOrCoreCPURegisterOfSize(vixl::aarch64::MacroAssembler* masm,
+                                                    vixl::aarch64::UseScratchRegisterScope* temps,
+                                                    int size_in_bits) {
+  return masm->GetScratchFPRegisterList()->IsEmpty()
+      ? CPURegister(temps->AcquireRegisterOfSize(size_in_bits))
+      : CPURegister(temps->AcquireVRegisterOfSize(size_in_bits));
+}
+
 void CodeGeneratorARM64::MoveLocation(Location destination,
                                       Location source,
                                       Primitive::Type dst_type) {
@@ -1533,7 +1546,9 @@ void CodeGeneratorARM64::MoveLocation(Location destination,
       HConstant* src_cst = source.GetConstant();
       CPURegister temp;
       if (src_cst->IsZeroBitPattern()) {
-        temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant()) ? xzr : wzr;
+        temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant())
+            ? Register(xzr)
+            : Register(wzr);
       } else {
         if (src_cst->IsIntConstant()) {
           temp = temps.AcquireW();
@@ -1561,8 +1576,16 @@ void CodeGeneratorARM64::MoveLocation(Location destination,
       // a move is blocked by a another move requiring a scratch FP
       // register, which would reserve D31). To prevent this issue, we
       // ask for a scratch register of any type (core or FP).
-      CPURegister temp =
-          temps.AcquireCPURegisterOfSize(destination.IsDoubleStackSlot() ? kXRegSize : kWRegSize);
+      //
+      // Also, we start by asking for a FP scratch register first, as the
+      // demand of scratch core registers is higher.  This is why we
+      // use AcquireFPOrCoreCPURegisterOfSize instead of
+      // UseScratchRegisterScope::AcquireCPURegisterOfSize, which
+      // allocates core scratch registers first.
+      CPURegister temp = AcquireFPOrCoreCPURegisterOfSize(
+          GetVIXLAssembler(),
+          &temps,
+          (destination.IsDoubleStackSlot() ? kXRegSize : kWRegSize));
       __ Ldr(temp, StackOperandFrom(source));
       __ Str(temp, StackOperandFrom(destination));
     }
@@ -1903,6 +1926,9 @@ void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) {
                                                        LocationSummary::kNoCall);
   if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+    // We need a temporary register for the read barrier marking slow
+    // path in CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier.
+    locations->AddTemp(Location::RequiresRegister());
   }
   locations->SetInAt(0, Location::RequiresRegister());
   if (Primitive::IsFloatingPointType(instruction->GetType())) {
@@ -1930,11 +1956,9 @@ void InstructionCodeGeneratorARM64::HandleFieldGet(HInstruction* instruction,
 
   if (field_type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // Object FieldGet with Baker's read barrier case.
-    MacroAssembler* masm = GetVIXLAssembler();
-    UseScratchRegisterScope temps(masm);
     // /* HeapReference<Object> */ out = *(base + offset)
     Register base = RegisterFrom(base_loc, Primitive::kPrimNot);
-    Register temp = temps.AcquireW();
+    Register temp = WRegisterFrom(locations->GetTemp(0));
     // Note that potential implicit null checks are handled in this
     // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier call.
     codegen_->GenerateFieldLoadWithBakerReadBarrier(
@@ -4336,6 +4360,9 @@ void InstructionCodeGeneratorARM64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
 HLoadClass::LoadKind CodeGeneratorARM64::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
@@ -4474,6 +4501,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) NO_THREAD_SA
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
@@ -4762,7 +4790,9 @@ void LocationsBuilderARM64::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorARM64::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 7d3c655b27..f6cb90a63a 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -210,12 +210,11 @@ class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention {
   Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return helpers::LocationFrom(vixl::aarch64::x0);
   }
-  Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
-    return Primitive::Is64BitType(type)
+  Location GetSetValueLocation(Primitive::Type type ATTRIBUTE_UNUSED,
+                               bool is_instance) const OVERRIDE {
+    return is_instance
         ? helpers::LocationFrom(vixl::aarch64::x2)
-        : (is_instance
-            ? helpers::LocationFrom(vixl::aarch64::x2)
-            : helpers::LocationFrom(vixl::aarch64::x1));
+        : helpers::LocationFrom(vixl::aarch64::x1);
   }
   Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return helpers::LocationFrom(vixl::aarch64::d0);
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index ffaf18fb4d..e18960872e 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -3998,15 +3998,18 @@ void LocationsBuilderARMVIXL::VisitNewArray(HNewArray* instruction) {
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly);
   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   locations->SetOut(LocationFrom(r0));
-  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(1)));
-  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(2)));
+  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
+  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
+  DCHECK(!codegen_->IsLeafMethod());
 }
 
 void LocationsBuilderARMVIXL::VisitParameterValue(HParameterValue* instruction) {
@@ -5796,6 +5799,9 @@ void ParallelMoveResolverARMVIXL::RestoreScratch(int reg ATTRIBUTE_UNUSED) {
 HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
@@ -5916,6 +5922,7 @@ void InstructionCodeGeneratorARMVIXL::VisitLoadClass(HLoadClass* cls) NO_THREAD_
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
@@ -7253,8 +7260,7 @@ vixl32::Register CodeGeneratorARMVIXL::GetInvokeStaticOrDirectExtraParameter(
   // save one load. However, since this is just an intrinsic slow path we prefer this
   // simple and more robust approach rather that trying to determine if that's the case.
   SlowPathCode* slow_path = GetCurrentSlowPath();
-  DCHECK(slow_path != nullptr);  // For intrinsified invokes the call is emitted on the slow path.
-  if (slow_path->IsCoreRegisterSaved(RegisterFrom(location).GetCode())) {
+  if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(RegisterFrom(location).GetCode())) {
     int stack_offset = slow_path->GetStackOffsetOfCoreRegister(RegisterFrom(location).GetCode());
     GetAssembler()->LoadFromOffset(kLoadWord, temp, sp, stack_offset);
     return temp;
@@ -7679,15 +7685,21 @@ void InstructionCodeGeneratorARMVIXL::VisitPackedSwitch(HPackedSwitch* switch_in
     vixl32::Register jump_offset = temps.Acquire();
 
     // Load jump offset from the table.
-    __ Adr(table_base, jump_table->GetTableStartLabel());
-    __ Ldr(jump_offset, MemOperand(table_base, key_reg, vixl32::LSL, 2));
+    {
+      const size_t jump_size = switch_instr->GetNumEntries() * sizeof(int32_t);
+      ExactAssemblyScope aas(GetVIXLAssembler(),
+                             (vixl32::kMaxInstructionSizeInBytes * 4) + jump_size,
+                             CodeBufferCheckScope::kMaximumSize);
+      __ adr(table_base, jump_table->GetTableStartLabel());
+      __ ldr(jump_offset, MemOperand(table_base, key_reg, vixl32::LSL, 2));
 
-    // Jump to target block by branching to table_base(pc related) + offset.
-    vixl32::Register target_address = table_base;
-    __ Add(target_address, table_base, jump_offset);
-    __ Bx(target_address);
+      // Jump to target block by branching to table_base(pc related) + offset.
+      vixl32::Register target_address = table_base;
+      __ add(target_address, table_base, jump_offset);
+      __ bx(target_address);
 
-    jump_table->EmitTable(codegen_);
+      jump_table->EmitTable(codegen_);
+    }
   }
 }
 void LocationsBuilderARMVIXL::VisitArmDexCacheArraysBase(HArmDexCacheArraysBase* base) {
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 76be74e921..0677dad078 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -258,8 +258,10 @@ class LoadClassSlowPathMIPS : public SlowPathCodeMIPS {
       DCHECK_NE(out.AsRegister<Register>(), AT);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           mips_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
-      mips_codegen->EmitPcRelativeAddressPlaceholder(info, TMP, base);
-      __ StoreToOffset(kStoreWord, out.AsRegister<Register>(), TMP, 0);
+      bool reordering = __ SetReorder(false);
+      mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base);
+      __ StoreToOffset(kStoreWord, out.AsRegister<Register>(), TMP, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
     }
     __ B(GetExitLabel());
   }
@@ -313,8 +315,10 @@ class LoadStringSlowPathMIPS : public SlowPathCodeMIPS {
     DCHECK_NE(out, AT);
     CodeGeneratorMIPS::PcRelativePatchInfo* info =
         mips_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
-    mips_codegen->EmitPcRelativeAddressPlaceholder(info, TMP, base);
-    __ StoreToOffset(kStoreWord, out, TMP, 0);
+    bool reordering = __ SetReorder(false);
+    mips_codegen->EmitPcRelativeAddressPlaceholderHigh(info, TMP, base);
+    __ StoreToOffset(kStoreWord, out, TMP, /* placeholder */ 0x5678);
+    __ SetReorder(reordering);
 
     __ B(GetExitLabel());
   }
@@ -480,6 +484,8 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph,
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       boot_image_address_patches_(std::less<uint32_t>(),
                                   graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       clobbered_ra_(false) {
   // Save RA (containing the return address) to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(RA));
@@ -700,9 +706,6 @@ bool CodeGeneratorMIPS::HasAllocatedCalleeSaveRegisters() const {
   // (this can happen in leaf methods), force CodeGenerator::InitializeCodeGeneration()
   // into the path that creates a stack frame so that RA can be explicitly saved and restored.
   // RA can't otherwise be saved/restored when it's the only spilled register.
-  // TODO: Can this be improved? It causes creation of a stack frame (while RA might be
-  // saved in an unused temporary register) and saving of RA and the current method pointer
-  // in the frame.
   return CodeGenerator::HasAllocatedCalleeSaveRegisters() || clobbered_ra_;
 }
 
@@ -1127,16 +1130,15 @@ Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address)
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
 }
 
-void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholder(
-    PcRelativePatchInfo* info, Register out, Register base) {
-  bool reordering = __ SetReorder(false);
+void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info,
+                                                             Register out,
+                                                             Register base) {
   if (GetInstructionSetFeatures().IsR6()) {
     DCHECK_EQ(base, ZERO);
     __ Bind(&info->high_label);
     __ Bind(&info->pc_rel_label);
-    // Add a 32-bit offset to PC.
+    // Add the high half of a 32-bit offset to PC.
     __ Auipc(out, /* placeholder */ 0x1234);
-    __ Addiu(out, out, /* placeholder */ 0x5678);
   } else {
     // If base is ZERO, emit NAL to obtain the actual base.
     if (base == ZERO) {
@@ -1150,11 +1152,72 @@ void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholder(
     if (base == ZERO) {
       __ Bind(&info->pc_rel_label);
     }
-    __ Ori(out, out, /* placeholder */ 0x5678);
-    // Add a 32-bit offset to PC.
+    // Add the high half of a 32-bit offset to PC.
     __ Addu(out, out, (base == ZERO) ? RA : base);
   }
-  __ SetReorder(reordering);
+  // The immediately following instruction will add the sign-extended low half of the 32-bit
+  // offset to `out` (e.g. lw, jialc, addiu).
+}
+
+CodeGeneratorMIPS::JitPatchInfo* CodeGeneratorMIPS::NewJitRootStringPatch(
+    const DexFile& dex_file,
+    dex::StringIndex dex_index,
+    Handle<mirror::String> handle) {
+  jit_string_roots_.Overwrite(StringReference(&dex_file, dex_index),
+                              reinterpret_cast64<uint64_t>(handle.GetReference()));
+  jit_string_patches_.emplace_back(dex_file, dex_index.index_);
+  return &jit_string_patches_.back();
+}
+
+CodeGeneratorMIPS::JitPatchInfo* CodeGeneratorMIPS::NewJitRootClassPatch(
+    const DexFile& dex_file,
+    dex::TypeIndex dex_index,
+    Handle<mirror::Class> handle) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, dex_index),
+                             reinterpret_cast64<uint64_t>(handle.GetReference()));
+  jit_class_patches_.emplace_back(dex_file, dex_index.index_);
+  return &jit_class_patches_.back();
+}
+
+void CodeGeneratorMIPS::PatchJitRootUse(uint8_t* code,
+                                        const uint8_t* roots_data,
+                                        const CodeGeneratorMIPS::JitPatchInfo& info,
+                                        uint64_t index_in_table) const {
+  uint32_t literal_offset = GetAssembler().GetLabelLocation(&info.high_label);
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  uint32_t addr32 = dchecked_integral_cast<uint32_t>(address);
+  // lui reg, addr32_high
+  DCHECK_EQ(code[literal_offset + 0], 0x34);
+  DCHECK_EQ(code[literal_offset + 1], 0x12);
+  DCHECK_EQ((code[literal_offset + 2] & 0xE0), 0x00);
+  DCHECK_EQ(code[literal_offset + 3], 0x3C);
+  // lw reg, reg, addr32_low
+  DCHECK_EQ(code[literal_offset + 4], 0x78);
+  DCHECK_EQ(code[literal_offset + 5], 0x56);
+  DCHECK_EQ((code[literal_offset + 7] & 0xFC), 0x8C);
+  addr32 += (addr32 & 0x8000) << 1;  // Account for sign extension in "lw reg, reg, addr32_low".
+  // lui reg, addr32_high
+  code[literal_offset + 0] = static_cast<uint8_t>(addr32 >> 16);
+  code[literal_offset + 1] = static_cast<uint8_t>(addr32 >> 24);
+  // lw reg, reg, addr32_low
+  code[literal_offset + 4] = static_cast<uint8_t>(addr32 >> 0);
+  code[literal_offset + 5] = static_cast<uint8_t>(addr32 >> 8);
+}
+
+void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
+  for (const JitPatchInfo& info : jit_string_patches_) {
+    const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file,
+                                                            dex::StringIndex(info.index)));
+    DCHECK(it != jit_string_roots_.end());
+    PatchJitRootUse(code, roots_data, info, it->second);
+  }
+  for (const JitPatchInfo& info : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file,
+                                                         dex::TypeIndex(info.index)));
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, info, it->second);
+  }
 }
 
 void CodeGeneratorMIPS::MarkGCCard(Register object,
@@ -5159,7 +5222,8 @@ void LocationsBuilderMIPS::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invo
   // art::PrepareForRegisterAllocation.
   DCHECK(!invoke->IsStaticWithExplicitClinitCheck());
 
-  bool has_extra_input = invoke->HasPcRelativeDexCache();
+  bool is_r6 = codegen_->GetInstructionSetFeatures().IsR6();
+  bool has_extra_input = invoke->HasPcRelativeDexCache() && !is_r6;
 
   IntrinsicLocationsBuilderMIPS intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
@@ -5200,12 +5264,13 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind(
   if (kEmitCompilerReadBarrier) {
     UNIMPLEMENTED(FATAL) << "for read barrier";
   }
-  // We disable PC-relative load when there is an irreducible loop, as the optimization
+  // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization
   // is incompatible with it.
   // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods
   // with irreducible loops.
   bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops();
-  bool fallback_load = has_irreducible_loops;
+  bool is_r6 = GetInstructionSetFeatures().IsR6();
+  bool fallback_load = has_irreducible_loops && !is_r6;
   switch (desired_string_load_kind) {
     case HLoadString::LoadKind::kBootImageLinkTimeAddress:
       DCHECK(!GetCompilerOptions().GetCompilePic());
@@ -5220,8 +5285,7 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind(
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
-      // TODO: implement.
-      fallback_load = true;
+      fallback_load = false;
       break;
     case HLoadString::LoadKind::kDexCacheViaMethod:
       fallback_load = false;
@@ -5238,11 +5302,15 @@ HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind(
   if (kEmitCompilerReadBarrier) {
     UNIMPLEMENTED(FATAL) << "for read barrier";
   }
-  // We disable pc-relative load when there is an irreducible loop, as the optimization
+  // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization
   // is incompatible with it.
   bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops();
-  bool fallback_load = has_irreducible_loops;
+  bool is_r6 = GetInstructionSetFeatures().IsR6();
+  bool fallback_load = has_irreducible_loops && !is_r6;
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       fallback_load = false;
       break;
@@ -5259,7 +5327,7 @@ HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind(
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
-      fallback_load = true;
+      fallback_load = false;
       break;
     case HLoadClass::LoadKind::kDexCacheViaMethod:
       fallback_load = false;
@@ -5273,6 +5341,7 @@ HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind(
 
 Register CodeGeneratorMIPS::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
                                                                   Register temp) {
+  CHECK(!GetInstructionSetFeatures().IsR6());
   CHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
   if (!invoke->GetLocations()->Intrinsified()) {
@@ -5301,13 +5370,13 @@ HInvokeStaticOrDirect::DispatchInfo CodeGeneratorMIPS::GetSupportedInvokeStaticO
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       HInvokeStaticOrDirect* invoke ATTRIBUTE_UNUSED) {
   HInvokeStaticOrDirect::DispatchInfo dispatch_info = desired_dispatch_info;
-  // We disable PC-relative load when there is an irreducible loop, as the optimization
+  // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization
   // is incompatible with it.
   bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops();
-  bool fallback_load = true;
+  bool is_r6 = GetInstructionSetFeatures().IsR6();
+  bool fallback_load = has_irreducible_loops && !is_r6;
   switch (dispatch_info.method_load_kind) {
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
-      fallback_load = has_irreducible_loops;
       break;
     default:
       fallback_load = false;
@@ -5325,7 +5394,8 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
   HInvokeStaticOrDirect::MethodLoadKind method_load_kind = invoke->GetMethodLoadKind();
   HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = invoke->GetCodePtrLocation();
-  Register base_reg = invoke->HasPcRelativeDexCache()
+  bool is_r6 = GetInstructionSetFeatures().IsR6();
+  Register base_reg = (invoke->HasPcRelativeDexCache() && !is_r6)
       ? GetInvokeStaticOrDirectExtraParameter(invoke, temp.AsRegister<Register>())
       : ZERO;
 
@@ -5346,14 +5416,23 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress());
       break;
-    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: {
-      HMipsDexCacheArraysBase* base =
-          invoke->InputAt(invoke->GetSpecialInputIndex())->AsMipsDexCacheArraysBase();
-      int32_t offset =
-          invoke->GetDexCacheArrayOffset() - base->GetElementOffset() - kDexCacheArrayLwOffset;
-      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), base_reg, offset);
+    case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative:
+      if (is_r6) {
+        uint32_t offset = invoke->GetDexCacheArrayOffset();
+        CodeGeneratorMIPS::PcRelativePatchInfo* info =
+            NewPcRelativeDexCacheArrayPatch(invoke->GetDexFileForPcRelativeDexCache(), offset);
+        bool reordering = __ SetReorder(false);
+        EmitPcRelativeAddressPlaceholderHigh(info, TMP, ZERO);
+        __ Lw(temp.AsRegister<Register>(), TMP, /* placeholder */ 0x5678);
+        __ SetReorder(reordering);
+      } else {
+        HMipsDexCacheArraysBase* base =
+            invoke->InputAt(invoke->GetSpecialInputIndex())->AsMipsDexCacheArraysBase();
+        int32_t offset =
+            invoke->GetDexCacheArrayOffset() - base->GetElementOffset() - kDexCacheArrayLwOffset;
+        __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), base_reg, offset);
+      }
       break;
-    }
     case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: {
       Location current_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
       Register reg = temp.AsRegister<Register>();
@@ -5546,7 +5625,10 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex());
-      codegen_->EmitPcRelativeAddressPlaceholder(info, out, base_or_current_method_reg);
+      bool reordering = __ SetReorder(false);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+      __ Addiu(out, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
@@ -5562,16 +5644,26 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF
     case HLoadClass::LoadKind::kBssEntry: {
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex());
-      codegen_->EmitPcRelativeAddressPlaceholder(info, out, base_or_current_method_reg);
-      __ LoadFromOffset(kLoadWord, out, out, 0);
+      bool reordering = __ SetReorder(false);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+      __ LoadFromOffset(kLoadWord, out, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
       generate_null_check = true;
       break;
     }
     case HLoadClass::LoadKind::kJitTableAddress: {
-      LOG(FATAL) << "Unimplemented";
+      CodeGeneratorMIPS::JitPatchInfo* info = codegen_->NewJitRootClassPatch(cls->GetDexFile(),
+                                                                             cls->GetTypeIndex(),
+                                                                             cls->GetClass());
+      bool reordering = __ SetReorder(false);
+      __ Bind(&info->high_label);
+      __ Lui(out, /* placeholder */ 0x1234);
+      GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
@@ -5678,7 +5770,10 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_
       DCHECK(codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
-      codegen_->EmitPcRelativeAddressPlaceholder(info, out, base_or_current_method_reg);
+      bool reordering = __ SetReorder(false);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+      __ Addiu(out, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
       return;  // No dex cache slow path.
     }
     case HLoadString::LoadKind::kBootImageAddress: {
@@ -5694,14 +5789,28 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
-      codegen_->EmitPcRelativeAddressPlaceholder(info, out, base_or_current_method_reg);
-      __ LoadFromOffset(kLoadWord, out, out, 0);
+      bool reordering = __ SetReorder(false);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg);
+      __ LoadFromOffset(kLoadWord, out, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
       SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load);
       codegen_->AddSlowPath(slow_path);
       __ Beqz(out, slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
       return;
     }
+    case HLoadString::LoadKind::kJitTableAddress: {
+      CodeGeneratorMIPS::JitPatchInfo* info =
+          codegen_->NewJitRootStringPatch(load->GetDexFile(),
+                                          load->GetStringIndex(),
+                                          load->GetString());
+      bool reordering = __ SetReorder(false);
+      __ Bind(&info->high_label);
+      __ Lui(out, /* placeholder */ 0x1234);
+      GenerateGcRootFieldLoad(load, out_loc, out, /* placeholder */ 0x5678);
+      __ SetReorder(reordering);
+      return;
+    }
     default:
       break;
   }
@@ -6894,8 +7003,12 @@ void InstructionCodeGeneratorMIPS::VisitMipsDexCacheArraysBase(HMipsDexCacheArra
   Register reg = base->GetLocations()->Out().AsRegister<Register>();
   CodeGeneratorMIPS::PcRelativePatchInfo* info =
       codegen_->NewPcRelativeDexCacheArrayPatch(base->GetDexFile(), base->GetElementOffset());
+  CHECK(!codegen_->GetInstructionSetFeatures().IsR6());
+  bool reordering = __ SetReorder(false);
   // TODO: Reuse MipsComputeBaseMethodAddress on R2 instead of passing ZERO to force emitting NAL.
-  codegen_->EmitPcRelativeAddressPlaceholder(info, reg, ZERO);
+  codegen_->EmitPcRelativeAddressPlaceholderHigh(info, reg, ZERO);
+  __ Addiu(reg, reg, /* placeholder */ 0x5678);
+  __ SetReorder(reordering);
 }
 
 void LocationsBuilderMIPS::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index c8fd325999..47eba50248 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -352,6 +352,7 @@ class CodeGeneratorMIPS : public CodeGenerator {
 
   // Emit linker patches.
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   void MarkGCCard(Register object, Register value, bool value_can_be_null);
 
@@ -463,7 +464,32 @@ class CodeGeneratorMIPS : public CodeGenerator {
   Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
   Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
 
-  void EmitPcRelativeAddressPlaceholder(PcRelativePatchInfo* info, Register out, Register base);
+  void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, Register out, Register base);
+
+  // The JitPatchInfo is used for JIT string and class loads.
+  struct JitPatchInfo {
+    JitPatchInfo(const DexFile& dex_file, uint64_t idx)
+        : target_dex_file(dex_file), index(idx) { }
+    JitPatchInfo(JitPatchInfo&& other) = default;
+
+    const DexFile& target_dex_file;
+    // String/type index.
+    uint64_t index;
+    // Label for the instruction loading the most significant half of the address.
+    // The least significant half is loaded with the instruction that follows immediately.
+    MipsLabel high_label;
+  };
+
+  void PatchJitRootUse(uint8_t* code,
+                       const uint8_t* roots_data,
+                       const JitPatchInfo& info,
+                       uint64_t index_in_table) const;
+  JitPatchInfo* NewJitRootStringPatch(const DexFile& dex_file,
+                                      dex::StringIndex dex_index,
+                                      Handle<mirror::String> handle);
+  JitPatchInfo* NewJitRootClassPatch(const DexFile& dex_file,
+                                     dex::TypeIndex dex_index,
+                                     Handle<mirror::Class> handle);
 
  private:
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
@@ -512,6 +538,10 @@ class CodeGeneratorMIPS : public CodeGenerator {
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
   // Deduplication map for patchable boot image addresses.
   Uint32ToLiteralMap boot_image_address_patches_;
+  // Patches for string root accesses in JIT compiled code.
+  ArenaDeque<JitPatchInfo> jit_string_patches_;
+  // Patches for class root accesses in JIT compiled code.
+  ArenaDeque<JitPatchInfo> jit_class_patches_;
 
   // PC-relative loads on R2 clobber RA, which may need to be preserved explicitly in leaf methods.
   // This is a flag set by pc_relative_fixups_mips and dex_cache_array_fixups_mips optimizations.
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 192b4a5050..4c8dabfede 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -91,9 +91,6 @@ Location InvokeDexCallingConventionVisitorMIPS64::GetNextLocation(Primitive::Typ
   // Space on the stack is reserved for all arguments.
   stack_index_ += Primitive::Is64BitType(type) ? 2 : 1;
 
-  // TODO: shouldn't we use a whole machine word per argument on the stack?
-  // Implicit 4-byte method pointer (and such) will cause misalignment.
-
   return next_location;
 }
 
@@ -434,7 +431,11 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph,
       pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       boot_image_address_patches_(std::less<uint32_t>(),
-                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+                                  graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_string_patches_(StringReferenceValueComparator(),
+                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_class_patches_(TypeReferenceValueComparator(),
+                         graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Save RA (containing the return address) to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(RA));
 }
@@ -1055,6 +1056,49 @@ void CodeGeneratorMIPS64::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchIn
   // offset to `out` (e.g. ld, jialc, daddiu).
 }
 
+Literal* CodeGeneratorMIPS64::DeduplicateJitStringLiteral(const DexFile& dex_file,
+                                                          dex::StringIndex string_index,
+                                                          Handle<mirror::String> handle) {
+  jit_string_roots_.Overwrite(StringReference(&dex_file, string_index),
+                              reinterpret_cast64<uint64_t>(handle.GetReference()));
+  return jit_string_patches_.GetOrCreate(
+      StringReference(&dex_file, string_index),
+      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
+}
+
+Literal* CodeGeneratorMIPS64::DeduplicateJitClassLiteral(const DexFile& dex_file,
+                                                         dex::TypeIndex type_index,
+                                                         Handle<mirror::Class> handle) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, type_index),
+                             reinterpret_cast64<uint64_t>(handle.GetReference()));
+  return jit_class_patches_.GetOrCreate(
+      TypeReference(&dex_file, type_index),
+      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
+}
+
+void CodeGeneratorMIPS64::PatchJitRootUse(uint8_t* code,
+                                          const uint8_t* roots_data,
+                                          const Literal* literal,
+                                          uint64_t index_in_table) const {
+  uint32_t literal_offset = GetAssembler().GetLabelLocation(literal->GetLabel());
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  reinterpret_cast<uint32_t*>(code + literal_offset)[0] = dchecked_integral_cast<uint32_t>(address);
+}
+
+void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
+  for (const auto& entry : jit_string_patches_) {
+    const auto& it = jit_string_roots_.find(entry.first);
+    DCHECK(it != jit_string_roots_.end());
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
+  }
+  for (const auto& entry : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(entry.first);
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
+  }
+}
+
 void CodeGeneratorMIPS64::SetupBlockedRegisters() const {
   // ZERO, K0, K1, GP, SP, RA are always reserved and can't be allocated.
   blocked_core_registers_[ZERO] = true;
@@ -3117,14 +3161,6 @@ void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad(
     Location root,
     GpuRegister obj,
     uint32_t offset) {
-  // When handling PC-relative loads, the caller calls
-  // EmitPcRelativeAddressPlaceholderHigh() and then GenerateGcRootFieldLoad().
-  // The relative patcher expects the two methods to emit the following patchable
-  // sequence of instructions in this case:
-  //   auipc reg1, 0x1234  // 0x1234 is a placeholder for offset_high.
-  //   lwu   reg2, 0x5678(reg1)  // 0x5678 is a placeholder for offset_low.
-  // TODO: Adjust GenerateGcRootFieldLoad() and its caller when this method is
-  // extended (e.g. for read barriers) so as not to break the relative patcher.
   GpuRegister root_reg = root.AsRegister<GpuRegister>();
   if (kEmitCompilerReadBarrier) {
     UNIMPLEMENTED(FATAL) << "for read barrier";
@@ -3317,8 +3353,6 @@ HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind(
       break;
     case HLoadString::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
-      // TODO: implement.
-      fallback_load = true;
       break;
   }
   if (fallback_load) {
@@ -3334,6 +3368,9 @@ HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind(
   }
   bool fallback_load = false;
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
@@ -3349,8 +3386,6 @@ HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind(
       break;
     case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
-      // TODO: implement.
-      fallback_load = true;
       break;
     case HLoadClass::LoadKind::kDexCacheViaMethod:
       break;
@@ -3588,11 +3623,16 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S
       generate_null_check = true;
       break;
     }
-    case HLoadClass::LoadKind::kJitTableAddress: {
-      LOG(FATAL) << "Unimplemented";
+    case HLoadClass::LoadKind::kJitTableAddress:
+      __ LoadLiteral(out,
+                     kLoadUnsignedWord,
+                     codegen_->DeduplicateJitClassLiteral(cls->GetDexFile(),
+                                                          cls->GetTypeIndex(),
+                                                          cls->GetClass()));
+      GenerateGcRootFieldLoad(cls, out_loc, out, 0);
       break;
-    }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
@@ -3693,6 +3733,14 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA
       __ Bind(slow_path->GetExitLabel());
       return;
     }
+    case HLoadString::LoadKind::kJitTableAddress:
+      __ LoadLiteral(out,
+                     kLoadUnsignedWord,
+                     codegen_->DeduplicateJitStringLiteral(load->GetDexFile(),
+                                                           load->GetStringIndex(),
+                                                           load->GetString()));
+      GenerateGcRootFieldLoad(load, out_loc, out, 0);
+      return;
     default:
       break;
   }
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 52b780c106..26cc7dc788 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -52,7 +52,7 @@ static constexpr size_t kRuntimeParameterFpuRegistersLength =
 
 
 static constexpr GpuRegister kCoreCalleeSaves[] =
-    { S0, S1, S2, S3, S4, S5, S6, S7, GP, S8, RA };  // TODO: review
+    { S0, S1, S2, S3, S4, S5, S6, S7, GP, S8, RA };
 static constexpr FpuRegister kFpuCalleeSaves[] =
     { F24, F25, F26, F27, F28, F29, F30, F31 };
 
@@ -115,12 +115,11 @@ class FieldAccessCallingConventionMIPS64 : public FieldAccessCallingConvention {
   Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::RegisterLocation(V0);
   }
-  Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
-    return Primitive::Is64BitType(type)
+  Location GetSetValueLocation(Primitive::Type type ATTRIBUTE_UNUSED,
+                               bool is_instance) const OVERRIDE {
+    return is_instance
         ? Location::RegisterLocation(A2)
-        : (is_instance
-            ? Location::RegisterLocation(A2)
-            : Location::RegisterLocation(A1));
+        : Location::RegisterLocation(A1);
   }
   Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::FpuRegisterLocation(F0);
@@ -313,6 +312,7 @@ class CodeGeneratorMIPS64 : public CodeGenerator {
 
   // Emit linker patches.
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   void MarkGCCard(GpuRegister object, GpuRegister value, bool value_can_be_null);
 
@@ -426,10 +426,27 @@ class CodeGeneratorMIPS64 : public CodeGenerator {
 
   void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, GpuRegister out);
 
+  void PatchJitRootUse(uint8_t* code,
+                       const uint8_t* roots_data,
+                       const Literal* literal,
+                       uint64_t index_in_table) const;
+  Literal* DeduplicateJitStringLiteral(const DexFile& dex_file,
+                                       dex::StringIndex string_index,
+                                       Handle<mirror::String> handle);
+  Literal* DeduplicateJitClassLiteral(const DexFile& dex_file,
+                                      dex::TypeIndex type_index,
+                                      Handle<mirror::Class> handle);
+
  private:
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, Literal*>;
   using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>;
+  using StringToLiteralMap = ArenaSafeMap<StringReference,
+                                          Literal*,
+                                          StringReferenceValueComparator>;
+  using TypeToLiteralMap = ArenaSafeMap<TypeReference,
+                                        Literal*,
+                                        TypeReferenceValueComparator>;
   using BootStringToLiteralMap = ArenaSafeMap<StringReference,
                                               Literal*,
                                               StringReferenceValueComparator>;
@@ -477,6 +494,10 @@ class CodeGeneratorMIPS64 : public CodeGenerator {
   ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_;
   // Deduplication map for patchable boot image addresses.
   Uint32ToLiteralMap boot_image_address_patches_;
+  // Patches for string root accesses in JIT compiled code.
+  StringToLiteralMap jit_string_patches_;
+  // Patches for class root accesses in JIT compiled code.
+  TypeToLiteralMap jit_class_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorMIPS64);
 };
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 1b7431612d..137b55423b 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -4214,7 +4214,9 @@ void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -6022,6 +6024,9 @@ void ParallelMoveResolverX86::RestoreScratch(int reg) {
 HLoadClass::LoadKind CodeGeneratorX86::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
@@ -6157,6 +6162,7 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
+    case HLoadClass::LoadKind::kInvalid:
       LOG(FATAL) << "UNREACHABLE";
       UNREACHABLE();
   }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 7350fcc48a..5360dc9209 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -110,7 +110,9 @@ class FieldAccessCallingConventionX86 : public FieldAccessCallingConvention {
   }
   Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
     return Primitive::Is64BitType(type)
-        ? Location::RegisterPairLocation(EDX, EBX)
+        ? (is_instance
+            ? Location::RegisterPairLocation(EDX, EBX)
+            : Location::RegisterPairLocation(ECX, EDX))
         : (is_instance
             ? Location::RegisterLocation(EDX)
             : Location::RegisterLocation(ECX));
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c4caf4bf9d..c5367ce86e 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -4096,7 +4096,9 @@ void LocationsBuilderX86_64::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes cares
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
   DCHECK(!codegen_->IsLeafMethod());
 }
@@ -5425,6 +5427,9 @@ void InstructionCodeGeneratorX86_64::GenerateClassInitializationCheck(
 HLoadClass::LoadKind CodeGeneratorX86_64::GetSupportedLoadClassKind(
     HLoadClass::LoadKind desired_class_load_kind) {
   switch (desired_class_load_kind) {
+    case HLoadClass::LoadKind::kInvalid:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
     case HLoadClass::LoadKind::kReferrersClass:
       break;
     case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3438b8159f..3a83731b3f 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -92,12 +92,11 @@ class FieldAccessCallingConventionX86_64 : public FieldAccessCallingConvention {
   Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::RegisterLocation(RAX);
   }
-  Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE {
-    return Primitive::Is64BitType(type)
+  Location GetSetValueLocation(Primitive::Type type ATTRIBUTE_UNUSED, bool is_instance)
+      const OVERRIDE {
+    return is_instance
         ? Location::RegisterLocation(RDX)
-        : (is_instance
-            ? Location::RegisterLocation(RDX)
-            : Location::RegisterLocation(RSI));
+        : Location::RegisterLocation(RSI);
   }
   Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
     return Location::FpuRegisterLocation(XMM0);
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index e3f3df0ff5..f8bbf68c1c 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -17,30 +17,15 @@
 #include <functional>
 #include <memory>
 
-#include "arch/instruction_set.h"
-#include "arch/arm/instruction_set_features_arm.h"
-#include "arch/arm/registers_arm.h"
-#include "arch/arm64/instruction_set_features_arm64.h"
-#include "arch/mips/instruction_set_features_mips.h"
-#include "arch/mips/registers_mips.h"
-#include "arch/mips64/instruction_set_features_mips64.h"
-#include "arch/mips64/registers_mips64.h"
-#include "arch/x86/instruction_set_features_x86.h"
-#include "arch/x86/registers_x86.h"
-#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
-#include "code_simulator_container.h"
-#include "common_compiler_test.h"
+#include "codegen_test_utils.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
 #include "driver/compiler_options.h"
-#include "graph_checker.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
-#include "prepare_for_register_allocation.h"
 #include "register_allocator_linear_scan.h"
-#include "ssa_liveness_analysis.h"
 #include "utils.h"
 #include "utils/arm/assembler_arm_vixl.h"
 #include "utils/arm/managed_register_arm.h"
@@ -48,324 +33,10 @@
 #include "utils/mips64/managed_register_mips64.h"
 #include "utils/x86/managed_register_x86.h"
 
-#ifdef ART_ENABLE_CODEGEN_arm
-#include "code_generator_arm.h"
-#include "code_generator_arm_vixl.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_arm64
-#include "code_generator_arm64.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-#include "code_generator_x86.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86_64
-#include "code_generator_x86_64.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips
-#include "code_generator_mips.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips64
-#include "code_generator_mips64.h"
-#endif
-
 #include "gtest/gtest.h"
 
 namespace art {
 
-typedef CodeGenerator* (*CreateCodegenFn)(HGraph*, const CompilerOptions&);
-
-class CodegenTargetConfig {
- public:
-  CodegenTargetConfig(InstructionSet isa, CreateCodegenFn create_codegen)
-      : isa_(isa), create_codegen_(create_codegen) {
-  }
-  InstructionSet GetInstructionSet() const { return isa_; }
-  CodeGenerator* CreateCodeGenerator(HGraph* graph, const CompilerOptions& compiler_options) {
-    return create_codegen_(graph, compiler_options);
-  }
-
- private:
-  CodegenTargetConfig() {}
-  InstructionSet isa_;
-  CreateCodegenFn create_codegen_;
-};
-
-#ifdef ART_ENABLE_CODEGEN_arm
-// Provide our own codegen, that ensures the C calling conventions
-// are preserved. Currently, ART and C do not match as R4 is caller-save
-// in ART, and callee-save in C. Alternatively, we could use or write
-// the stub that saves and restores all registers, but it is easier
-// to just overwrite the code generator.
-class TestCodeGeneratorARM : public arm::CodeGeneratorARM {
- public:
-  TestCodeGeneratorARM(HGraph* graph,
-                       const ArmInstructionSetFeatures& isa_features,
-                       const CompilerOptions& compiler_options)
-      : arm::CodeGeneratorARM(graph, isa_features, compiler_options) {
-    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
-    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    arm::CodeGeneratorARM::SetupBlockedRegisters();
-    blocked_core_registers_[arm::R4] = true;
-    blocked_core_registers_[arm::R6] = false;
-    blocked_core_registers_[arm::R7] = false;
-  }
-};
-
-// A way to test the VIXL32-based code generator on ARM. This will replace
-// TestCodeGeneratorARM when the VIXL32-based backend replaces the existing one.
-class TestCodeGeneratorARMVIXL : public arm::CodeGeneratorARMVIXL {
- public:
-  TestCodeGeneratorARMVIXL(HGraph* graph,
-                           const ArmInstructionSetFeatures& isa_features,
-                           const CompilerOptions& compiler_options)
-      : arm::CodeGeneratorARMVIXL(graph, isa_features, compiler_options) {
-    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
-    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    arm::CodeGeneratorARMVIXL::SetupBlockedRegisters();
-    blocked_core_registers_[arm::R4] = true;
-    blocked_core_registers_[arm::R6] = false;
-    blocked_core_registers_[arm::R7] = false;
-  }
-};
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
- public:
-  TestCodeGeneratorX86(HGraph* graph,
-                       const X86InstructionSetFeatures& isa_features,
-                       const CompilerOptions& compiler_options)
-      : x86::CodeGeneratorX86(graph, isa_features, compiler_options) {
-    // Save edi, we need it for getting enough registers for long multiplication.
-    AddAllocatedRegister(Location::RegisterLocation(x86::EDI));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    x86::CodeGeneratorX86::SetupBlockedRegisters();
-    // ebx is a callee-save register in C, but caller-save for ART.
-    blocked_core_registers_[x86::EBX] = true;
-
-    // Make edi available.
-    blocked_core_registers_[x86::EDI] = false;
-  }
-};
-#endif
-
-class InternalCodeAllocator : public CodeAllocator {
- public:
-  InternalCodeAllocator() : size_(0) { }
-
-  virtual uint8_t* Allocate(size_t size) {
-    size_ = size;
-    memory_.reset(new uint8_t[size]);
-    return memory_.get();
-  }
-
-  size_t GetSize() const { return size_; }
-  uint8_t* GetMemory() const { return memory_.get(); }
-
- private:
-  size_t size_;
-  std::unique_ptr<uint8_t[]> memory_;
-
-  DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
-};
-
-static bool CanExecuteOnHardware(InstructionSet target_isa) {
-  return (target_isa == kRuntimeISA)
-      // Handle the special case of ARM, with two instructions sets (ARM32 and Thumb-2).
-      || (kRuntimeISA == kArm && target_isa == kThumb2);
-}
-
-static bool CanExecute(InstructionSet target_isa) {
-  CodeSimulatorContainer simulator(target_isa);
-  return CanExecuteOnHardware(target_isa) || simulator.CanSimulate();
-}
-
-template <typename Expected>
-static Expected SimulatorExecute(CodeSimulator* simulator, Expected (*f)());
-
-template <>
-bool SimulatorExecute<bool>(CodeSimulator* simulator, bool (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnBool();
-}
-
-template <>
-int32_t SimulatorExecute<int32_t>(CodeSimulator* simulator, int32_t (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnInt32();
-}
-
-template <>
-int64_t SimulatorExecute<int64_t>(CodeSimulator* simulator, int64_t (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnInt64();
-}
-
-template <typename Expected>
-static void VerifyGeneratedCode(InstructionSet target_isa,
-                                Expected (*f)(),
-                                bool has_result,
-                                Expected expected) {
-  ASSERT_TRUE(CanExecute(target_isa)) << "Target isa is not executable.";
-
-  // Verify on simulator.
-  CodeSimulatorContainer simulator(target_isa);
-  if (simulator.CanSimulate()) {
-    Expected result = SimulatorExecute<Expected>(simulator.Get(), f);
-    if (has_result) {
-      ASSERT_EQ(expected, result);
-    }
-  }
-
-  // Verify on hardware.
-  if (CanExecuteOnHardware(target_isa)) {
-    Expected result = f();
-    if (has_result) {
-      ASSERT_EQ(expected, result);
-    }
-  }
-}
-
-template <typename Expected>
-static void Run(const InternalCodeAllocator& allocator,
-                const CodeGenerator& codegen,
-                bool has_result,
-                Expected expected) {
-  InstructionSet target_isa = codegen.GetInstructionSet();
-
-  typedef Expected (*fptr)();
-  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
-  fptr f = reinterpret_cast<fptr>(allocator.GetMemory());
-  if (target_isa == kThumb2) {
-    // For thumb we need the bottom bit set.
-    f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1);
-  }
-  VerifyGeneratedCode(target_isa, f, has_result, expected);
-}
-
-static void ValidateGraph(HGraph* graph) {
-  GraphChecker graph_checker(graph);
-  graph_checker.Run();
-  if (!graph_checker.IsValid()) {
-    for (const auto& error : graph_checker.GetErrors()) {
-      std::cout << error << std::endl;
-    }
-  }
-  ASSERT_TRUE(graph_checker.IsValid());
-}
-
-template <typename Expected>
-static void RunCodeNoCheck(CodeGenerator* codegen,
-                           HGraph* graph,
-                           const std::function<void(HGraph*)>& hook_before_codegen,
-                           bool has_result,
-                           Expected expected) {
-  SsaLivenessAnalysis liveness(graph, codegen);
-  PrepareForRegisterAllocation(graph).Run();
-  liveness.Analyze();
-  RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
-  hook_before_codegen(graph);
-  InternalCodeAllocator allocator;
-  codegen->Compile(&allocator);
-  Run(allocator, *codegen, has_result, expected);
-}
-
-template <typename Expected>
-static void RunCode(CodeGenerator* codegen,
-                    HGraph* graph,
-                    std::function<void(HGraph*)> hook_before_codegen,
-                    bool has_result,
-                    Expected expected) {
-  ValidateGraph(graph);
-  RunCodeNoCheck(codegen, graph, hook_before_codegen, has_result, expected);
-}
-
-template <typename Expected>
-static void RunCode(CodegenTargetConfig target_config,
-                    HGraph* graph,
-                    std::function<void(HGraph*)> hook_before_codegen,
-                    bool has_result,
-                    Expected expected) {
-  CompilerOptions compiler_options;
-  std::unique_ptr<CodeGenerator> codegen(target_config.CreateCodeGenerator(graph, compiler_options));
-  RunCode(codegen.get(), graph, hook_before_codegen, has_result, expected);
-}
-
-#ifdef ART_ENABLE_CODEGEN_arm
-CodeGenerator* create_codegen_arm(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
-      ArmInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) TestCodeGeneratorARM(graph,
-                                                      *features_arm.get(),
-                                                      compiler_options);
-}
-
-CodeGenerator* create_codegen_arm_vixl32(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
-      ArmInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      TestCodeGeneratorARMVIXL(graph, *features_arm.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_arm64
-CodeGenerator* create_codegen_arm64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
-      Arm64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) arm64::CodeGeneratorARM64(graph,
-                                                           *features_arm64.get(),
-                                                           compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-CodeGenerator* create_codegen_x86(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
-      X86InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) TestCodeGeneratorX86(graph, *features_x86.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86_64
-CodeGenerator* create_codegen_x86_64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
-     X86_64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      x86_64::CodeGeneratorX86_64(graph, *features_x86_64.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips
-CodeGenerator* create_codegen_mips(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
-      MipsInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      mips::CodeGeneratorMIPS(graph, *features_mips.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips64
-CodeGenerator* create_codegen_mips64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64(
-      Mips64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      mips64::CodeGeneratorMIPS64(graph, *features_mips64.get(), compiler_options);
-}
-#endif
-
 // Return all combinations of ISA and code generator that are executable on
 // hardware, or on simulator, and that we'd like to test.
 static ::std::vector<CodegenTargetConfig> GetTargetConfigs() {
@@ -1067,6 +738,39 @@ TEST_F(CodegenTest, ARMVIXLParallelMoveResolver) {
 }
 #endif
 
+#ifdef ART_ENABLE_CODEGEN_arm64
+// Regression test for b/34760542.
+TEST_F(CodegenTest, ARM64ParallelMoveResolverB34760542) {
+  std::unique_ptr<const Arm64InstructionSetFeatures> features(
+      Arm64InstructionSetFeatures::FromCppDefines());
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = CreateGraph(&allocator);
+  arm64::CodeGeneratorARM64 codegen(graph, *features.get(), CompilerOptions());
+
+  codegen.Initialize();
+
+  // The following ParallelMove used to fail this assertion:
+  //
+  //   Assertion failed (!available->IsEmpty())
+  //
+  // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable.
+  HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena());
+  move->AddMove(Location::DoubleStackSlot(0),
+                Location::DoubleStackSlot(257),
+                Primitive::kPrimDouble,
+                nullptr);
+  move->AddMove(Location::DoubleStackSlot(257),
+                Location::DoubleStackSlot(0),
+                Primitive::kPrimDouble,
+                nullptr);
+  codegen.GetMoveResolver()->EmitNativeCode(move);
+
+  InternalCodeAllocator code_allocator;
+  codegen.Finalize(&code_allocator);
+}
+#endif
+
 #ifdef ART_ENABLE_CODEGEN_mips
 TEST_F(CodegenTest, MipsClobberRA) {
   std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h
new file mode 100644
index 0000000000..cd954043f5
--- /dev/null
+++ b/compiler/optimizing/codegen_test_utils.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
+#define ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
+
+#include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm/registers_arm.h"
+#include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/instruction_set.h"
+#include "arch/mips/instruction_set_features_mips.h"
+#include "arch/mips/registers_mips.h"
+#include "arch/mips64/instruction_set_features_mips64.h"
+#include "arch/mips64/registers_mips64.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "arch/x86/registers_x86.h"
+#include "arch/x86_64/instruction_set_features_x86_64.h"
+#include "code_simulator_container.h"
+#include "common_compiler_test.h"
+#include "graph_checker.h"
+#include "prepare_for_register_allocation.h"
+#include "ssa_liveness_analysis.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm
+#include "code_generator_arm.h"
+#include "code_generator_arm_vixl.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "code_generator_arm64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+#include "code_generator_x86.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86_64
+#include "code_generator_x86_64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips
+#include "code_generator_mips.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips64
+#include "code_generator_mips64.h"
+#endif
+
+namespace art {
+
+typedef CodeGenerator* (*CreateCodegenFn)(HGraph*, const CompilerOptions&);
+
+class CodegenTargetConfig {
+ public:
+  CodegenTargetConfig(InstructionSet isa, CreateCodegenFn create_codegen)
+      : isa_(isa), create_codegen_(create_codegen) {
+  }
+  InstructionSet GetInstructionSet() const { return isa_; }
+  CodeGenerator* CreateCodeGenerator(HGraph* graph, const CompilerOptions& compiler_options) {
+    return create_codegen_(graph, compiler_options);
+  }
+
+ private:
+  CodegenTargetConfig() {}
+  InstructionSet isa_;
+  CreateCodegenFn create_codegen_;
+};
+
+#ifdef ART_ENABLE_CODEGEN_arm
+// Provide our own codegen, that ensures the C calling conventions
+// are preserved. Currently, ART and C do not match as R4 is caller-save
+// in ART, and callee-save in C. Alternatively, we could use or write
+// the stub that saves and restores all registers, but it is easier
+// to just overwrite the code generator.
+class TestCodeGeneratorARM : public arm::CodeGeneratorARM {
+ public:
+  TestCodeGeneratorARM(HGraph* graph,
+                       const ArmInstructionSetFeatures& isa_features,
+                       const CompilerOptions& compiler_options)
+      : arm::CodeGeneratorARM(graph, isa_features, compiler_options) {
+    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    arm::CodeGeneratorARM::SetupBlockedRegisters();
+    blocked_core_registers_[arm::R4] = true;
+    blocked_core_registers_[arm::R6] = false;
+    blocked_core_registers_[arm::R7] = false;
+  }
+};
+
+// A way to test the VIXL32-based code generator on ARM. This will replace
+// TestCodeGeneratorARM when the VIXL32-based backend replaces the existing one.
+class TestCodeGeneratorARMVIXL : public arm::CodeGeneratorARMVIXL {
+ public:
+  TestCodeGeneratorARMVIXL(HGraph* graph,
+                           const ArmInstructionSetFeatures& isa_features,
+                           const CompilerOptions& compiler_options)
+      : arm::CodeGeneratorARMVIXL(graph, isa_features, compiler_options) {
+    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    arm::CodeGeneratorARMVIXL::SetupBlockedRegisters();
+    blocked_core_registers_[arm::R4] = true;
+    blocked_core_registers_[arm::R6] = false;
+    blocked_core_registers_[arm::R7] = false;
+  }
+};
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
+ public:
+  TestCodeGeneratorX86(HGraph* graph,
+                       const X86InstructionSetFeatures& isa_features,
+                       const CompilerOptions& compiler_options)
+      : x86::CodeGeneratorX86(graph, isa_features, compiler_options) {
+    // Save edi, we need it for getting enough registers for long multiplication.
+    AddAllocatedRegister(Location::RegisterLocation(x86::EDI));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    x86::CodeGeneratorX86::SetupBlockedRegisters();
+    // ebx is a callee-save register in C, but caller-save for ART.
+    blocked_core_registers_[x86::EBX] = true;
+
+    // Make edi available.
+    blocked_core_registers_[x86::EDI] = false;
+  }
+};
+#endif
+
+class InternalCodeAllocator : public CodeAllocator {
+ public:
+  InternalCodeAllocator() : size_(0) { }
+
+  virtual uint8_t* Allocate(size_t size) {
+    size_ = size;
+    memory_.reset(new uint8_t[size]);
+    return memory_.get();
+  }
+
+  size_t GetSize() const { return size_; }
+  uint8_t* GetMemory() const { return memory_.get(); }
+
+ private:
+  size_t size_;
+  std::unique_ptr<uint8_t[]> memory_;
+
+  DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
+};
+
+static bool CanExecuteOnHardware(InstructionSet target_isa) {
+  return (target_isa == kRuntimeISA)
+      // Handle the special case of ARM, with two instructions sets (ARM32 and Thumb-2).
+      || (kRuntimeISA == kArm && target_isa == kThumb2);
+}
+
+static bool CanExecute(InstructionSet target_isa) {
+  CodeSimulatorContainer simulator(target_isa);
+  return CanExecuteOnHardware(target_isa) || simulator.CanSimulate();
+}
+
+template <typename Expected>
+inline static Expected SimulatorExecute(CodeSimulator* simulator, Expected (*f)());
+
+template <>
+inline bool SimulatorExecute<bool>(CodeSimulator* simulator, bool (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnBool();
+}
+
+template <>
+inline int32_t SimulatorExecute<int32_t>(CodeSimulator* simulator, int32_t (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnInt32();
+}
+
+template <>
+inline int64_t SimulatorExecute<int64_t>(CodeSimulator* simulator, int64_t (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnInt64();
+}
+
+template <typename Expected>
+static void VerifyGeneratedCode(InstructionSet target_isa,
+                                Expected (*f)(),
+                                bool has_result,
+                                Expected expected) {
+  ASSERT_TRUE(CanExecute(target_isa)) << "Target isa is not executable.";
+
+  // Verify on simulator.
+  CodeSimulatorContainer simulator(target_isa);
+  if (simulator.CanSimulate()) {
+    Expected result = SimulatorExecute<Expected>(simulator.Get(), f);
+    if (has_result) {
+      ASSERT_EQ(expected, result);
+    }
+  }
+
+  // Verify on hardware.
+  if (CanExecuteOnHardware(target_isa)) {
+    Expected result = f();
+    if (has_result) {
+      ASSERT_EQ(expected, result);
+    }
+  }
+}
+
+template <typename Expected>
+static void Run(const InternalCodeAllocator& allocator,
+                const CodeGenerator& codegen,
+                bool has_result,
+                Expected expected) {
+  InstructionSet target_isa = codegen.GetInstructionSet();
+
+  typedef Expected (*fptr)();
+  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
+  fptr f = reinterpret_cast<fptr>(allocator.GetMemory());
+  if (target_isa == kThumb2) {
+    // For thumb we need the bottom bit set.
+    f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1);
+  }
+  VerifyGeneratedCode(target_isa, f, has_result, expected);
+}
+
+static void ValidateGraph(HGraph* graph) {
+  GraphChecker graph_checker(graph);
+  graph_checker.Run();
+  if (!graph_checker.IsValid()) {
+    for (const auto& error : graph_checker.GetErrors()) {
+      std::cout << error << std::endl;
+    }
+  }
+  ASSERT_TRUE(graph_checker.IsValid());
+}
+
+template <typename Expected>
+static void RunCodeNoCheck(CodeGenerator* codegen,
+                           HGraph* graph,
+                           const std::function<void(HGraph*)>& hook_before_codegen,
+                           bool has_result,
+                           Expected expected) {
+  SsaLivenessAnalysis liveness(graph, codegen);
+  PrepareForRegisterAllocation(graph).Run();
+  liveness.Analyze();
+  RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
+  hook_before_codegen(graph);
+  InternalCodeAllocator allocator;
+  codegen->Compile(&allocator);
+  Run(allocator, *codegen, has_result, expected);
+}
+
+template <typename Expected>
+static void RunCode(CodeGenerator* codegen,
+                    HGraph* graph,
+                    std::function<void(HGraph*)> hook_before_codegen,
+                    bool has_result,
+                    Expected expected) {
+  ValidateGraph(graph);
+  RunCodeNoCheck(codegen, graph, hook_before_codegen, has_result, expected);
+}
+
+template <typename Expected>
+static void RunCode(CodegenTargetConfig target_config,
+                    HGraph* graph,
+                    std::function<void(HGraph*)> hook_before_codegen,
+                    bool has_result,
+                    Expected expected) {
+  CompilerOptions compiler_options;
+  std::unique_ptr<CodeGenerator> codegen(target_config.CreateCodeGenerator(graph, compiler_options));
+  RunCode(codegen.get(), graph, hook_before_codegen, has_result, expected);
+}
+
+#ifdef ART_ENABLE_CODEGEN_arm
+CodeGenerator* create_codegen_arm(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
+      ArmInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) TestCodeGeneratorARM(graph,
+                                                      *features_arm.get(),
+                                                      compiler_options);
+}
+
+CodeGenerator* create_codegen_arm_vixl32(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
+      ArmInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      TestCodeGeneratorARMVIXL(graph, *features_arm.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+CodeGenerator* create_codegen_arm64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
+      Arm64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) arm64::CodeGeneratorARM64(graph,
+                                                           *features_arm64.get(),
+                                                           compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+CodeGenerator* create_codegen_x86(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) TestCodeGeneratorX86(graph, *features_x86.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86_64
+CodeGenerator* create_codegen_x86_64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+     X86_64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      x86_64::CodeGeneratorX86_64(graph, *features_x86_64.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips
+CodeGenerator* create_codegen_mips(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
+      MipsInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      mips::CodeGeneratorMIPS(graph, *features_mips.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips64
+CodeGenerator* create_codegen_mips64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64(
+      Mips64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      mips64::CodeGeneratorMIPS64(graph, *features_mips64.get(), compiler_options);
+}
+#endif
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
diff --git a/compiler/optimizing/common_arm.h b/compiler/optimizing/common_arm.h
index 21c3ae628a..ecb86875d6 100644
--- a/compiler/optimizing/common_arm.h
+++ b/compiler/optimizing/common_arm.h
@@ -146,6 +146,12 @@ inline vixl::aarch32::Register InputRegister(HInstruction* instr) {
   return InputRegisterAt(instr, 0);
 }
 
+inline vixl::aarch32::DRegister DRegisterFromS(vixl::aarch32::SRegister s) {
+  vixl::aarch32::DRegister d = vixl::aarch32::DRegister(s.GetCode() / 2);
+  DCHECK(s.Is(d.GetLane(0)) || s.Is(d.GetLane(1)));
+  return d;
+}
+
 inline int32_t Int32ConstantFrom(HInstruction* instr) {
   if (instr->IsIntConstant()) {
     return instr->AsIntConstant()->GetValue();
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 776a483d43..93ea090583 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -130,8 +130,8 @@ inline vixl::aarch64::CPURegister InputCPURegisterOrZeroRegAt(HInstruction* inst
   Primitive::Type input_type = input->GetType();
   if (input->IsConstant() && input->AsConstant()->IsZeroBitPattern()) {
     return (Primitive::ComponentSize(input_type) >= vixl::aarch64::kXRegSizeInBytes)
-        ? vixl::aarch64::xzr
-        : vixl::aarch64::wzr;
+        ? vixl::aarch64::Register(vixl::aarch64::xzr)
+        : vixl::aarch64::Register(vixl::aarch64::wzr);
   }
   return InputCPURegisterAt(instr, index);
 }
diff --git a/compiler/optimizing/dex_cache_array_fixups_mips.cc b/compiler/optimizing/dex_cache_array_fixups_mips.cc
index 04a4294c48..7734f9197d 100644
--- a/compiler/optimizing/dex_cache_array_fixups_mips.cc
+++ b/compiler/optimizing/dex_cache_array_fixups_mips.cc
@@ -47,7 +47,7 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor {
     // Computing the dex cache base for PC-relative accesses will clobber RA with
     // the NAL instruction on R2. Take a note of this before generating the method
     // entry.
-    if (!dex_cache_array_bases_.empty() && !codegen_->GetInstructionSetFeatures().IsR6()) {
+    if (!dex_cache_array_bases_.empty()) {
       codegen_->ClobberRA();
     }
   }
@@ -92,6 +92,11 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor {
 };
 
 void DexCacheArrayFixups::Run() {
+  CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen_);
+  if (mips_codegen->GetInstructionSetFeatures().IsR6()) {
+    // Do nothing for R6 because it has PC-relative addressing.
+    return;
+  }
   if (graph_->HasIrreducibleLoops()) {
     // Do not run this optimization, as irreducible loops do not work with an instruction
     // that can be live-in at the irreducible loop header.
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 3973985338..5539413aad 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -57,14 +57,18 @@ static bool IsIntAndGet(HInstruction* instruction, int64_t* value) {
   return false;
 }
 
-/** Returns b^e for b,e >= 1. */
-static int64_t IntPow(int64_t b, int64_t e) {
+/** Returns b^e for b,e >= 1. Sets overflow if arithmetic wrap-around occurred. */
+static int64_t IntPow(int64_t b, int64_t e, /*out*/ bool* overflow) {
   DCHECK_GE(b, 1);
   DCHECK_GE(e, 1);
   int64_t pow = 1;
   while (e) {
     if (e & 1) {
+      int64_t oldpow = pow;
       pow *= b;
+      if (pow < oldpow) {
+        *overflow = true;
+      }
     }
     e >>= 1;
     b *= b;
@@ -1020,20 +1024,27 @@ bool InductionVarRange::GenerateLastValueGeometric(HInductionVarAnalysis::Induct
     HInstruction* opb = nullptr;
     if (GenerateCode(info->op_a, nullptr, graph, block, &opa, false, false) &&
         GenerateCode(info->op_b, nullptr, graph, block, &opb, false, false)) {
-      // Compute f ^ m for known maximum index value m.
-      int64_t fpow = IntPow(f, m);
       if (graph != nullptr) {
-        DCHECK(info->operation == HInductionVarAnalysis::kMul ||
-               info->operation == HInductionVarAnalysis::kDiv);
         Primitive::Type type = info->type;
+        // Compute f ^ m for known maximum index value m.
+        bool overflow = false;
+        int64_t fpow = IntPow(f, m, &overflow);
+        if (info->operation == HInductionVarAnalysis::kDiv) {
+          // For division, any overflow truncates to zero.
+          if (overflow || (type != Primitive::kPrimLong && !CanLongValueFitIntoInt(fpow))) {
+            fpow = 0;
+          }
+        } else if (type != Primitive::kPrimLong) {
+          // For multiplication, okay to truncate to required precision.
+          DCHECK(info->operation == HInductionVarAnalysis::kMul);
+          fpow = static_cast<int32_t>(fpow);
+        }
+        // Generate code.
         if (fpow == 0) {
           // Special case: repeated mul/div always yields zero.
           *result = graph->GetConstant(type, 0);
         } else {
           // Last value: a * f ^ m + b or a * f ^ -m + b.
-          if (type != Primitive::kPrimLong) {
-            fpow = static_cast<int32_t>(fpow);  // okay to truncate
-          }
           HInstruction* e = nullptr;
           if (info->operation == HInductionVarAnalysis::kMul) {
             e = new (graph->GetArena()) HMul(type, opa, graph->GetConstant(type, fpow));
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 5d40f75618..f0afccb782 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -304,7 +304,8 @@ ArtMethod* HInliner::TryCHADevirtualization(ArtMethod* resolved_method) {
     // We do not support HDeoptimize in OSR methods.
     return nullptr;
   }
-  return resolved_method->GetSingleImplementation();
+  PointerSize pointer_size = caller_compilation_unit_.GetClassLinker()->GetImagePointerSize();
+  return resolved_method->GetSingleImplementation(pointer_size);
 }
 
 bool HInliner::TryInline(HInvoke* invoke_instruction) {
@@ -557,9 +558,13 @@ HInstruction* HInliner::AddTypeGuard(HInstruction* receiver,
                                                                is_referrer,
                                                                invoke_instruction->GetDexPc(),
                                                                /* needs_access_check */ false);
+  HLoadClass::LoadKind kind = HSharpening::SharpenClass(
+      load_class, codegen_, compiler_driver_, caller_compilation_unit_);
+  DCHECK(kind != HLoadClass::LoadKind::kInvalid)
+      << "We should always be able to reference a class for inline caches";
+  // Insert before setting the kind, as setting the kind affects the inputs.
   bb_cursor->InsertInstructionAfter(load_class, receiver_class);
-  // Sharpen after adding the instruction, as the sharpening may remove inputs.
-  HSharpening::SharpenClass(load_class, codegen_, compiler_driver_);
+  load_class->SetLoadKind(kind);
 
   // TODO: Extend reference type propagation to understand the guard.
   HNotEqual* compare = new (graph_->GetArena()) HNotEqual(load_class, receiver_class);
@@ -1285,6 +1290,7 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction,
                         resolved_method->GetDexFile(),
                         *code_item,
                         compiler_driver_,
+                        codegen_,
                         inline_stats.get(),
                         resolved_method->GetQuickenedInfo(class_linker->GetImagePointerSize()),
                         dex_cache,
@@ -1415,10 +1421,13 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction,
         return false;
       }
 
-      if (!same_dex_file && current->NeedsEnvironment()) {
+      if (current->NeedsEnvironment() &&
+          !CanEncodeInlinedMethodInStackMap(*caller_compilation_unit_.GetDexFile(),
+                                            resolved_method)) {
         VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index)
                        << " could not be inlined because " << current->DebugName()
-                       << " needs an environment and is in a different dex file";
+                       << " needs an environment, is in a different dex file"
+                       << ", and cannot be encoded in the stack maps.";
         return false;
       }
 
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index cac385ce3c..a1c391f455 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -22,6 +22,7 @@
 #include "dex_instruction-inl.h"
 #include "driver/compiler_options.h"
 #include "imtable-inl.h"
+#include "sharpening.h"
 #include "scoped_thread_state_change-inl.h"
 
 namespace art {
@@ -847,7 +848,7 @@ bool HInstructionBuilder::BuildInvoke(const Instruction& instruction,
     ScopedObjectAccess soa(Thread::Current());
     if (invoke_type == kStatic) {
       clinit_check = ProcessClinitCheckForInvoke(
-          dex_pc, resolved_method, method_idx, &clinit_check_requirement);
+          dex_pc, resolved_method, &clinit_check_requirement);
     } else if (invoke_type == kSuper) {
       if (IsSameDexFile(*resolved_method->GetDexFile(), *dex_compilation_unit_->GetDexFile())) {
         // Update the method index to the one resolved. Note that this may be a no-op if
@@ -933,15 +934,8 @@ bool HInstructionBuilder::BuildInvokePolymorphic(const Instruction& instruction
 
 bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
-  Handle<mirror::DexCache> dex_cache = dex_compilation_unit_->GetDexCache();
-  Handle<mirror::DexCache> outer_dex_cache = outer_compilation_unit_->GetDexCache();
-
-  if (outer_dex_cache.Get() != dex_cache.Get()) {
-    // We currently do not support inlining allocations across dex files.
-    return false;
-  }
 
-  HLoadClass* load_class = BuildLoadClass(type_index, dex_pc, /* check_access */ true);
+  HLoadClass* load_class = BuildLoadClass(type_index, dex_pc);
 
   HInstruction* cls = load_class;
   Handle<mirror::Class> klass = load_class->GetClass();
@@ -1005,39 +999,23 @@ bool HInstructionBuilder::IsInitialized(Handle<mirror::Class> cls) const {
 HClinitCheck* HInstructionBuilder::ProcessClinitCheckForInvoke(
       uint32_t dex_pc,
       ArtMethod* resolved_method,
-      uint32_t method_idx,
       HInvokeStaticOrDirect::ClinitCheckRequirement* clinit_check_requirement) {
-  Thread* self = Thread::Current();
-  StackHandleScope<2> hs(self);
-  Handle<mirror::DexCache> dex_cache = dex_compilation_unit_->GetDexCache();
-  Handle<mirror::DexCache> outer_dex_cache = outer_compilation_unit_->GetDexCache();
-  Handle<mirror::Class> outer_class(hs.NewHandle(GetOutermostCompilingClass()));
-  Handle<mirror::Class> resolved_method_class(hs.NewHandle(resolved_method->GetDeclaringClass()));
-
-  // The index at which the method's class is stored in the DexCache's type array.
-  dex::TypeIndex storage_index;
-  bool is_outer_class = (resolved_method->GetDeclaringClass() == outer_class.Get());
-  if (is_outer_class) {
-    storage_index = outer_class->GetDexTypeIndex();
-  } else if (outer_dex_cache.Get() == dex_cache.Get()) {
-    // Get `storage_index` from IsClassOfStaticMethodAvailableToReferrer.
-    compiler_driver_->IsClassOfStaticMethodAvailableToReferrer(outer_dex_cache.Get(),
-                                                               GetCompilingClass(),
-                                                               resolved_method,
-                                                               method_idx,
-                                                               &storage_index);
-  }
+  Handle<mirror::Class> klass = handles_->NewHandle(resolved_method->GetDeclaringClass());
 
   HClinitCheck* clinit_check = nullptr;
-
-  if (IsInitialized(resolved_method_class)) {
+  if (IsInitialized(klass)) {
     *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kNone;
-  } else if (storage_index.IsValid()) {
-    *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
-    HLoadClass* cls = BuildLoadClass(
-        storage_index, dex_pc, /* check_access */ false, /* outer */ true);
-    clinit_check = new (arena_) HClinitCheck(cls, dex_pc);
-    AppendInstruction(clinit_check);
+  } else {
+    HLoadClass* cls = BuildLoadClass(klass->GetDexTypeIndex(),
+                                     klass->GetDexFile(),
+                                     klass,
+                                     dex_pc,
+                                     /* needs_access_check */ false);
+    if (cls != nullptr) {
+      *clinit_check_requirement = HInvokeStaticOrDirect::ClinitCheckRequirement::kExplicit;
+      clinit_check = new (arena_) HClinitCheck(cls, dex_pc);
+      AppendInstruction(clinit_check);
+    }
   }
   return clinit_check;
 }
@@ -1216,9 +1194,7 @@ bool HInstructionBuilder::BuildInstanceFieldAccess(const Instruction& instructio
   }
 
   ScopedObjectAccess soa(Thread::Current());
-  ArtField* resolved_field =
-      compiler_driver_->ComputeInstanceFieldInfo(field_index, dex_compilation_unit_, is_put, soa);
-
+  ArtField* resolved_field = ResolveField(field_index, /* is_static */ false, is_put);
 
   // Generate an explicit null check on the reference, unless the field access
   // is unresolved. In that case, we rely on the runtime to perform various
@@ -1336,6 +1312,56 @@ void HInstructionBuilder::BuildUnresolvedStaticFieldAccess(const Instruction& in
   }
 }
 
+ArtField* HInstructionBuilder::ResolveField(uint16_t field_idx, bool is_static, bool is_put) {
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<2> hs(soa.Self());
+
+  ClassLinker* class_linker = dex_compilation_unit_->GetClassLinker();
+  Handle<mirror::ClassLoader> class_loader(hs.NewHandle(
+      soa.Decode<mirror::ClassLoader>(dex_compilation_unit_->GetClassLoader())));
+  Handle<mirror::Class> compiling_class(hs.NewHandle(GetCompilingClass()));
+
+  ArtField* resolved_field = class_linker->ResolveField(*dex_compilation_unit_->GetDexFile(),
+                                                        field_idx,
+                                                        dex_compilation_unit_->GetDexCache(),
+                                                        class_loader,
+                                                        is_static);
+
+  if (UNLIKELY(resolved_field == nullptr)) {
+    // Clean up any exception left by type resolution.
+    soa.Self()->ClearException();
+    return nullptr;
+  }
+
+  // Check static/instance. The class linker has a fast path for looking into the dex cache
+  // and does not check static/instance if it hits it.
+  if (UNLIKELY(resolved_field->IsStatic() != is_static)) {
+    return nullptr;
+  }
+
+  // Check access.
+  if (compiling_class.Get() == nullptr) {
+    if (!resolved_field->IsPublic()) {
+      return nullptr;
+    }
+  } else if (!compiling_class->CanAccessResolvedField(resolved_field->GetDeclaringClass(),
+                                                      resolved_field,
+                                                      dex_compilation_unit_->GetDexCache().Get(),
+                                                      field_idx)) {
+    return nullptr;
+  }
+
+  if (is_put &&
+      resolved_field->IsFinal() &&
+      (compiling_class.Get() != resolved_field->GetDeclaringClass())) {
+    // Final fields can only be updated within their own class.
+    // TODO: Only allow it in constructors. b/34966607.
+    return nullptr;
+  }
+
+  return resolved_field;
+}
+
 bool HInstructionBuilder::BuildStaticFieldAccess(const Instruction& instruction,
                                                  uint32_t dex_pc,
                                                  bool is_put) {
@@ -1343,12 +1369,7 @@ bool HInstructionBuilder::BuildStaticFieldAccess(const Instruction& instruction,
   uint16_t field_index = instruction.VRegB_21c();
 
   ScopedObjectAccess soa(Thread::Current());
-  StackHandleScope<3> hs(soa.Self());
-  Handle<mirror::DexCache> dex_cache = dex_compilation_unit_->GetDexCache();
-  Handle<mirror::ClassLoader> class_loader(hs.NewHandle(
-      soa.Decode<mirror::ClassLoader>(dex_compilation_unit_->GetClassLoader())));
-  ArtField* resolved_field = compiler_driver_->ResolveField(
-      soa, dex_cache, class_loader, dex_compilation_unit_, field_index, true);
+  ArtField* resolved_field = ResolveField(field_index, /* is_static */ true, is_put);
 
   if (resolved_field == nullptr) {
     MaybeRecordStat(MethodCompilationStat::kUnresolvedField);
@@ -1358,38 +1379,23 @@ bool HInstructionBuilder::BuildStaticFieldAccess(const Instruction& instruction,
   }
 
   Primitive::Type field_type = resolved_field->GetTypeAsPrimitiveType();
-  Handle<mirror::DexCache> outer_dex_cache = outer_compilation_unit_->GetDexCache();
-  Handle<mirror::Class> outer_class(hs.NewHandle(GetOutermostCompilingClass()));
 
-  // The index at which the field's class is stored in the DexCache's type array.
-  dex::TypeIndex storage_index;
-  bool is_outer_class = (outer_class.Get() == resolved_field->GetDeclaringClass());
-  if (is_outer_class) {
-    storage_index = outer_class->GetDexTypeIndex();
-  } else if (outer_dex_cache.Get() != dex_cache.Get()) {
-    // The compiler driver cannot currently understand multiple dex caches involved. Just bailout.
-    return false;
-  } else {
-    // TODO: This is rather expensive. Perf it and cache the results if needed.
-    std::pair<bool, bool> pair = compiler_driver_->IsFastStaticField(
-        outer_dex_cache.Get(),
-        GetCompilingClass(),
-        resolved_field,
-        field_index,
-        &storage_index);
-    bool can_easily_access = is_put ? pair.second : pair.first;
-    if (!can_easily_access) {
-      MaybeRecordStat(MethodCompilationStat::kUnresolvedFieldNotAFastAccess);
-      BuildUnresolvedStaticFieldAccess(instruction, dex_pc, is_put, field_type);
-      return true;
-    }
+  Handle<mirror::Class> klass = handles_->NewHandle(resolved_field->GetDeclaringClass());
+  HLoadClass* constant = BuildLoadClass(klass->GetDexTypeIndex(),
+                                        klass->GetDexFile(),
+                                        klass,
+                                        dex_pc,
+                                        /* needs_access_check */ false);
+
+  if (constant == nullptr) {
+    // The class cannot be referenced from this compiled code. Generate
+    // an unresolved access.
+    MaybeRecordStat(MethodCompilationStat::kUnresolvedFieldNotAFastAccess);
+    BuildUnresolvedStaticFieldAccess(instruction, dex_pc, is_put, field_type);
+    return true;
   }
 
-  HLoadClass* constant = BuildLoadClass(
-      storage_index, dex_pc, /* check_access */ false, /* outer */ true);
-
   HInstruction* cls = constant;
-  Handle<mirror::Class> klass(hs.NewHandle(resolved_field->GetDeclaringClass()));
   if (!IsInitialized(klass)) {
     cls = new (arena_) HClinitCheck(constant, dex_pc);
     AppendInstruction(cls);
@@ -1497,7 +1503,7 @@ void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
                                               uint32_t* args,
                                               uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
-  HLoadClass* cls = BuildLoadClass(type_index, dex_pc, /* check_access */ true);
+  HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
   HInstruction* object = new (arena_) HNewArray(cls, length, dex_pc);
   AppendInstruction(object);
 
@@ -1627,44 +1633,68 @@ static TypeCheckKind ComputeTypeCheckKind(Handle<mirror::Class> cls)
   }
 }
 
-HLoadClass* HInstructionBuilder::BuildLoadClass(dex::TypeIndex type_index,
-                                                uint32_t dex_pc,
-                                                bool check_access,
-                                                bool outer) {
+HLoadClass* HInstructionBuilder::BuildLoadClass(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
-  const DexCompilationUnit* compilation_unit =
-      outer ? outer_compilation_unit_ : dex_compilation_unit_;
-  const DexFile& dex_file = *compilation_unit->GetDexFile();
-  StackHandleScope<1> hs(soa.Self());
+  StackHandleScope<2> hs(soa.Self());
+  const DexFile& dex_file = *dex_compilation_unit_->GetDexFile();
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(
       soa.Decode<mirror::ClassLoader>(dex_compilation_unit_->GetClassLoader())));
   Handle<mirror::Class> klass = handles_->NewHandle(compiler_driver_->ResolveClass(
-      soa, compilation_unit->GetDexCache(), class_loader, type_index, compilation_unit));
+      soa, dex_compilation_unit_->GetDexCache(), class_loader, type_index, dex_compilation_unit_));
 
-  bool is_accessible = false;
-  if (!check_access) {
-    is_accessible = true;
-  } else if (klass.Get() != nullptr) {
+  bool needs_access_check = true;
+  if (klass.Get() != nullptr) {
     if (klass->IsPublic()) {
-      is_accessible = true;
+      needs_access_check = false;
     } else {
       mirror::Class* compiling_class = GetCompilingClass();
       if (compiling_class != nullptr && compiling_class->CanAccess(klass.Get())) {
-        is_accessible = true;
+        needs_access_check = false;
       }
     }
   }
 
+  return BuildLoadClass(type_index, dex_file, klass, dex_pc, needs_access_check);
+}
+
+HLoadClass* HInstructionBuilder::BuildLoadClass(dex::TypeIndex type_index,
+                                                const DexFile& dex_file,
+                                                Handle<mirror::Class> klass,
+                                                uint32_t dex_pc,
+                                                bool needs_access_check) {
+  // Try to find a reference in the compiling dex file.
+  const DexFile* actual_dex_file = &dex_file;
+  if (!IsSameDexFile(dex_file, *dex_compilation_unit_->GetDexFile())) {
+    dex::TypeIndex local_type_index =
+        klass->FindTypeIndexInOtherDexFile(*dex_compilation_unit_->GetDexFile());
+    if (local_type_index.IsValid()) {
+      type_index = local_type_index;
+      actual_dex_file = dex_compilation_unit_->GetDexFile();
+    }
+  }
+
+  // Note: `klass` must be from `handles_`.
   HLoadClass* load_class = new (arena_) HLoadClass(
       graph_->GetCurrentMethod(),
       type_index,
-      dex_file,
+      *actual_dex_file,
       klass,
       klass.Get() != nullptr && (klass.Get() == GetOutermostCompilingClass()),
       dex_pc,
-      !is_accessible);
+      needs_access_check);
 
+  HLoadClass::LoadKind load_kind = HSharpening::SharpenClass(load_class,
+                                                             code_generator_,
+                                                             compiler_driver_,
+                                                             *dex_compilation_unit_);
+
+  if (load_kind == HLoadClass::LoadKind::kInvalid) {
+    // We actually cannot reference this class, we're forced to bail.
+    return nullptr;
+  }
+  // Append the instruction first, as setting the load kind affects the inputs.
   AppendInstruction(load_class);
+  load_class->SetLoadKind(load_kind);
   return load_class;
 }
 
@@ -1674,7 +1704,7 @@ void HInstructionBuilder::BuildTypeCheck(const Instruction& instruction,
                                          dex::TypeIndex type_index,
                                          uint32_t dex_pc) {
   HInstruction* object = LoadLocal(reference, Primitive::kPrimNot);
-  HLoadClass* cls = BuildLoadClass(type_index, dex_pc, /* check_access */ true);
+  HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
 
   ScopedObjectAccess soa(Thread::Current());
   TypeCheckKind check_kind = ComputeTypeCheckKind(cls->GetClass());
@@ -2498,7 +2528,7 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction,
     case Instruction::NEW_ARRAY: {
       dex::TypeIndex type_index(instruction.VRegC_22c());
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt);
-      HLoadClass* cls = BuildLoadClass(type_index, dex_pc, /* check_access */ true);
+      HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
       AppendInstruction(new (arena_) HNewArray(cls, length, dex_pc));
       UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction());
       break;
@@ -2673,7 +2703,7 @@ bool HInstructionBuilder::ProcessDexInstruction(const Instruction& instruction,
 
     case Instruction::CONST_CLASS: {
       dex::TypeIndex type_index(instruction.VRegB_21c());
-      BuildLoadClass(type_index, dex_pc, /* check_access */ true);
+      BuildLoadClass(type_index, dex_pc);
       UpdateLocal(instruction.VRegA_21c(), current_block_->GetLastInstruction());
       break;
     }
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index 5efe95094c..3bb680ce44 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -31,6 +31,7 @@
 
 namespace art {
 
+class CodeGenerator;
 class Instruction;
 
 class HInstructionBuilder : public ValueObject {
@@ -44,6 +45,7 @@ class HInstructionBuilder : public ValueObject {
                       DexCompilationUnit* dex_compilation_unit,
                       const DexCompilationUnit* const outer_compilation_unit,
                       CompilerDriver* driver,
+                      CodeGenerator* code_generator,
                       const uint8_t* interpreter_metadata,
                       OptimizingCompilerStats* compiler_stats,
                       Handle<mirror::DexCache> dex_cache,
@@ -61,6 +63,7 @@ class HInstructionBuilder : public ValueObject {
         current_locals_(nullptr),
         latest_result_(nullptr),
         compiler_driver_(driver),
+        code_generator_(code_generator),
         dex_compilation_unit_(dex_compilation_unit),
         outer_compilation_unit_(outer_compilation_unit),
         interpreter_metadata_(interpreter_metadata),
@@ -228,10 +231,14 @@ class HInstructionBuilder : public ValueObject {
   // Builds a `HLoadClass` loading the given `type_index`. If `outer` is true,
   // this method will use the outer class's dex file to lookup the type at
   // `type_index`.
+  HLoadClass* BuildLoadClass(dex::TypeIndex type_index, uint32_t dex_pc);
+
   HLoadClass* BuildLoadClass(dex::TypeIndex type_index,
+                             const DexFile& dex_file,
+                             Handle<mirror::Class> klass,
                              uint32_t dex_pc,
-                             bool check_access,
-                             bool outer = false);
+                             bool needs_access_check)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Returns the outer-most compiling method's class.
   mirror::Class* GetOutermostCompilingClass() const;
@@ -275,7 +282,6 @@ class HInstructionBuilder : public ValueObject {
   HClinitCheck* ProcessClinitCheckForInvoke(
       uint32_t dex_pc,
       ArtMethod* method,
-      uint32_t method_idx,
       HInvokeStaticOrDirect::ClinitCheckRequirement* clinit_check_requirement)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -290,6 +296,10 @@ class HInstructionBuilder : public ValueObject {
   // not be resolved.
   ArtMethod* ResolveMethod(uint16_t method_idx, InvokeType invoke_type);
 
+  // Try to resolve a field using the class linker. Return null if it could not
+  // be found.
+  ArtField* ResolveField(uint16_t field_idx, bool is_static, bool is_put);
+
   ArenaAllocator* const arena_;
   HGraph* const graph_;
   VariableSizedHandleScope* handles_;
@@ -311,6 +321,8 @@ class HInstructionBuilder : public ValueObject {
 
   CompilerDriver* const compiler_driver_;
 
+  CodeGenerator* const code_generator_;
+
   // The compilation unit of the current method being compiled. Note that
   // it can be an inlined method.
   DexCompilationUnit* const dex_compilation_unit_;
diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h
index 1e73cf67df..6425e1313f 100644
--- a/compiler/optimizing/intrinsics.h
+++ b/compiler/optimizing/intrinsics.h
@@ -31,6 +31,9 @@ class DexFile;
 static constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U;
 static constexpr uint64_t kPositiveInfinityDouble = UINT64_C(0x7ff0000000000000);
 
+static constexpr uint32_t kNanFloat = 0x7fc00000U;
+static constexpr uint64_t kNanDouble = 0x7ff8000000000000;
+
 // Recognize intrinsics from HInvoke nodes.
 class IntrinsicsRecognizer : public HOptimization {
  public:
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 68c2d2e36e..70a3d38c13 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -40,10 +40,12 @@ using helpers::LocationFrom;
 using helpers::LowRegisterFrom;
 using helpers::LowSRegisterFrom;
 using helpers::OutputDRegister;
+using helpers::OutputSRegister;
 using helpers::OutputRegister;
 using helpers::OutputVRegister;
 using helpers::RegisterFrom;
 using helpers::SRegisterFrom;
+using helpers::DRegisterFromS;
 
 using namespace vixl::aarch32;  // NOLINT(build/namespaces)
 
@@ -462,6 +464,214 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsLong(HInvoke* invoke) {
   GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
 }
 
+static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+  Location op1_loc = invoke->GetLocations()->InAt(0);
+  Location op2_loc = invoke->GetLocations()->InAt(1);
+  Location out_loc = invoke->GetLocations()->Out();
+
+  // Optimization: don't generate any code if inputs are the same.
+  if (op1_loc.Equals(op2_loc)) {
+    DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in location builder.
+    return;
+  }
+
+  vixl32::SRegister op1 = SRegisterFrom(op1_loc);
+  vixl32::SRegister op2 = SRegisterFrom(op2_loc);
+  vixl32::SRegister out = OutputSRegister(invoke);
+  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+  const vixl32::Register temp1 = temps.Acquire();
+  vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0));
+  vixl32::Label nan, done;
+
+  DCHECK(op1.Is(out));
+
+  __ Vcmp(op1, op2);
+  __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
+  __ B(vs, &nan, /* far_target */ false);  // if un-ordered, go to NaN handling.
+
+  // op1 <> op2
+  vixl32::ConditionType cond = is_min ? gt : lt;
+  {
+    ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
+                                2 * kMaxInstructionSizeInBytes,
+                                CodeBufferCheckScope::kMaximumSize);
+    __ it(cond);
+    __ vmov(cond, F32, out, op2);
+  }
+  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+
+  // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0).
+  __ Vmov(temp1, op1);
+  __ Vmov(temp2, op2);
+  if (is_min) {
+    __ Orr(temp1, temp1, temp2);
+  } else {
+    __ And(temp1, temp1, temp2);
+  }
+  __ Vmov(out, temp1);
+  __ B(&done);
+
+  // handle NaN input.
+  __ Bind(&nan);
+  __ Movt(temp1, High16Bits(kNanFloat));  // 0x7FC0xxxx is a NaN.
+  __ Vmov(out, temp1);
+
+  __ Bind(&done);
+}
+
+static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
+  GenMinMaxFloat(invoke, /* is_min */ true, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
+  GenMinMaxFloat(invoke, /* is_min */ false, GetAssembler());
+}
+
+static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+  Location op1_loc = invoke->GetLocations()->InAt(0);
+  Location op2_loc = invoke->GetLocations()->InAt(1);
+  Location out_loc = invoke->GetLocations()->Out();
+
+  // Optimization: don't generate any code if inputs are the same.
+  if (op1_loc.Equals(op2_loc)) {
+    DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in.
+    return;
+  }
+
+  vixl32::DRegister op1 = DRegisterFrom(op1_loc);
+  vixl32::DRegister op2 = DRegisterFrom(op2_loc);
+  vixl32::DRegister out = OutputDRegister(invoke);
+  vixl32::Label handle_nan_eq, done;
+
+  DCHECK(op1.Is(out));
+
+  __ Vcmp(op1, op2);
+  __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
+  __ B(vs, &handle_nan_eq, /* far_target */ false);  // if un-ordered, go to NaN handling.
+
+  // op1 <> op2
+  vixl32::ConditionType cond = is_min ? gt : lt;
+  {
+    ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
+                                2 * kMaxInstructionSizeInBytes,
+                                CodeBufferCheckScope::kMaximumSize);
+    __ it(cond);
+    __ vmov(cond, F64, out, op2);
+  }
+  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+
+  // handle op1 == op2, max(+0.0,-0.0).
+  if (!is_min) {
+    __ Vand(F64, out, op1, op2);
+    __ B(&done);
+  }
+
+  // handle op1 == op2, min(+0.0,-0.0), NaN input.
+  __ Bind(&handle_nan_eq);
+  __ Vorr(F64, out, op1, op2);  // assemble op1/-0.0/NaN.
+
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
+  GenMinMaxDouble(invoke, /* is_min */ true , GetAssembler());
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
+  CreateFPFPToFPLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
+  GenMinMaxDouble(invoke, /* is_min */ false, GetAssembler());
+}
+
+static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+  Location op1_loc = invoke->GetLocations()->InAt(0);
+  Location op2_loc = invoke->GetLocations()->InAt(1);
+  Location out_loc = invoke->GetLocations()->Out();
+
+  // Optimization: don't generate any code if inputs are the same.
+  if (op1_loc.Equals(op2_loc)) {
+    DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in location builder.
+    return;
+  }
+
+  vixl32::Register op1_lo = LowRegisterFrom(op1_loc);
+  vixl32::Register op1_hi = HighRegisterFrom(op1_loc);
+  vixl32::Register op2_lo = LowRegisterFrom(op2_loc);
+  vixl32::Register op2_hi = HighRegisterFrom(op2_loc);
+  vixl32::Register out_lo = LowRegisterFrom(out_loc);
+  vixl32::Register out_hi = HighRegisterFrom(out_loc);
+  UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+  const vixl32::Register temp = temps.Acquire();
+
+  DCHECK(op1_lo.Is(out_lo));
+  DCHECK(op1_hi.Is(out_hi));
+
+  // Compare op1 >= op2, or op1 < op2.
+  __ Cmp(out_lo, op2_lo);
+  __ Sbcs(temp, out_hi, op2_hi);
+
+  // Now GE/LT condition code is correct for the long comparison.
+  {
+    vixl32::ConditionType cond = is_min ? ge : lt;
+    ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
+                                3 * kMaxInstructionSizeInBytes,
+                                CodeBufferCheckScope::kMaximumSize);
+    __ itt(cond);
+    __ mov(cond, out_lo, op2_lo);
+    __ mov(cond, out_hi, op2_hi);
+  }
+}
+
+static void CreateLongLongToLongLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMinLongLong(HInvoke* invoke) {
+  CreateLongLongToLongLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMinLongLong(HInvoke* invoke) {
+  GenMinMaxLong(invoke, /* is_min */ true, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) {
+  CreateLongLongToLongLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) {
+  GenMinMaxLong(invoke, /* is_min */ false, GetAssembler());
+}
+
 static void GenMinMax(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
   vixl32::Register op1 = InputRegisterAt(invoke, 0);
   vixl32::Register op2 = InputRegisterAt(invoke, 1);
@@ -514,6 +724,18 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathSqrt(HInvoke* invoke) {
   __ Vsqrt(OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
 }
 
+void IntrinsicLocationsBuilderARMVIXL::VisitMathRint(HInvoke* invoke) {
+  if (features_.HasARMv8AInstructions()) {
+    CreateFPToFPLocations(arena_, invoke);
+  }
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathRint(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
+  ArmVIXLAssembler* assembler = GetAssembler();
+  __ Vrintn(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
+}
+
 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekByte(HInvoke* invoke) {
   CreateIntToIntLocations(arena_, invoke);
 }
@@ -2742,15 +2964,30 @@ void IntrinsicCodeGeneratorARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) {
   __ Bind(slow_path->GetExitLabel());
 }
 
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMinDoubleDouble)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMinFloatFloat)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMaxDoubleDouble)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMaxFloatFloat)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMinLongLong)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathMaxLongLong)
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathCeil)          // Could be done by changing rounding mode, maybe?
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFloor)         // Could be done by changing rounding mode, maybe?
-UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRint)
+void IntrinsicLocationsBuilderARMVIXL::VisitMathCeil(HInvoke* invoke) {
+  if (features_.HasARMv8AInstructions()) {
+    CreateFPToFPLocations(arena_, invoke);
+  }
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathCeil(HInvoke* invoke) {
+  ArmVIXLAssembler* assembler = GetAssembler();
+  DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
+  __ Vrintp(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
+}
+
+void IntrinsicLocationsBuilderARMVIXL::VisitMathFloor(HInvoke* invoke) {
+  if (features_.HasARMv8AInstructions()) {
+    CreateFPToFPLocations(arena_, invoke);
+  }
+}
+
+void IntrinsicCodeGeneratorARMVIXL::VisitMathFloor(HInvoke* invoke) {
+  ArmVIXLAssembler* assembler = GetAssembler();
+  DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
+  __ Vrintm(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
+}
+
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble)   // Could be done by changing rounding mode, maybe?
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundFloat)    // Could be done by changing rounding mode, maybe?
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong)     // High register pressure.
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index d15145e673..abbb91a1a9 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1354,13 +1354,15 @@ std::ostream& operator<<(std::ostream& os, const HInstruction::InstructionKind&
   return os;
 }
 
-void HInstruction::MoveBefore(HInstruction* cursor) {
-  DCHECK(!IsPhi());
-  DCHECK(!IsControlFlow());
-  DCHECK(CanBeMoved() ||
-         // HShouldDeoptimizeFlag can only be moved by CHAGuardOptimization.
-         IsShouldDeoptimizeFlag());
-  DCHECK(!cursor->IsPhi());
+void HInstruction::MoveBefore(HInstruction* cursor, bool do_checks) {
+  if (do_checks) {
+    DCHECK(!IsPhi());
+    DCHECK(!IsControlFlow());
+    DCHECK(CanBeMoved() ||
+           // HShouldDeoptimizeFlag can only be moved by CHAGuardOptimization.
+           IsShouldDeoptimizeFlag());
+    DCHECK(!cursor->IsPhi());
+  }
 
   next_->previous_ = previous_;
   if (previous_ != nullptr) {
@@ -2462,16 +2464,15 @@ bool HLoadClass::InstructionDataEquals(const HInstruction* other) const {
   }
 }
 
-void HLoadClass::SetLoadKindInternal(LoadKind load_kind) {
-  // Once sharpened, the load kind should not be changed again.
-  // Also, kReferrersClass should never be overwritten.
-  DCHECK_EQ(GetLoadKind(), LoadKind::kDexCacheViaMethod);
+void HLoadClass::SetLoadKind(LoadKind load_kind) {
   SetPackedField<LoadKindField>(load_kind);
 
-  if (load_kind != LoadKind::kDexCacheViaMethod) {
+  if (load_kind != LoadKind::kDexCacheViaMethod &&
+      load_kind != LoadKind::kReferrersClass) {
     RemoveAsUserOfInput(0u);
     SetRawInputAt(0u, nullptr);
   }
+
   if (!NeedsEnvironment()) {
     RemoveEnvironment();
     SetSideEffects(SideEffects::None());
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index f0ea9e20e6..96f9abafbf 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2065,8 +2065,8 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> {
     other->ReplaceInput(this, use_index);
   }
 
-  // Move `this` instruction before `cursor`.
-  void MoveBefore(HInstruction* cursor);
+  // Move `this` instruction before `cursor`
+  void MoveBefore(HInstruction* cursor, bool do_checks = true);
 
   // Move `this` before its first user and out of any loops. If there is no
   // out-of-loop user that dominates all other users, move the instruction
@@ -4322,6 +4322,11 @@ class HInvokeInterface FINAL : public HInvoke {
     return (obj == InputAt(0)) && !GetLocations()->Intrinsified();
   }
 
+  bool NeedsDexCacheOfDeclaringClass() const OVERRIDE {
+    // The assembly stub currently needs it.
+    return true;
+  }
+
   uint32_t GetImtIndex() const { return imt_index_; }
   uint32_t GetDexMethodIndex() const { return dex_method_index_; }
 
@@ -5508,6 +5513,9 @@ class HLoadClass FINAL : public HInstruction {
  public:
   // Determines how to load the Class.
   enum class LoadKind {
+    // We cannot load this class. See HSharpening::SharpenLoadClass.
+    kInvalid = -1,
+
     // Use the Class* from the method's own ArtMethod*.
     kReferrersClass,
 
@@ -5564,18 +5572,7 @@ class HLoadClass FINAL : public HInstruction {
     SetPackedFlag<kFlagGenerateClInitCheck>(false);
   }
 
-  void SetLoadKind(LoadKind load_kind) {
-    SetLoadKindInternal(load_kind);
-  }
-
-  void SetLoadKindWithTypeReference(LoadKind load_kind,
-                                    const DexFile& dex_file,
-                                    dex::TypeIndex type_index) {
-    DCHECK(HasTypeReference(load_kind));
-    DCHECK(IsSameDexFile(dex_file_, dex_file));
-    DCHECK_EQ(type_index_, type_index);
-    SetLoadKindInternal(load_kind);
-  }
+  void SetLoadKind(LoadKind load_kind);
 
   LoadKind GetLoadKind() const {
     return GetPackedField<LoadKindField>();
@@ -5694,6 +5691,11 @@ class HLoadClass FINAL : public HInstruction {
   // for PC-relative loads, i.e. kBssEntry or kBootImageLinkTimePcRelative.
   HUserRecord<HInstruction*> special_input_;
 
+  // A type index and dex file where the class can be accessed. The dex file can be:
+  // - The compiling method's dex file if the class is defined there too.
+  // - The compiling method's dex file if the class is referenced there.
+  // - The dex file where the class is defined. When the load kind can only be
+  //   kBssEntry or kDexCacheViaMethod, we cannot emit code for this `HLoadClass`.
   const dex::TypeIndex type_index_;
   const DexFile& dex_file_;
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 297500b12f..727ca7d893 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -90,6 +90,7 @@
 #include "reference_type_propagation.h"
 #include "register_allocator_linear_scan.h"
 #include "select_generator.h"
+#include "scheduler.h"
 #include "sharpening.h"
 #include "side_effects_analysis.h"
 #include "ssa_builder.h"
@@ -658,10 +659,13 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set,
           new (arena) arm64::InstructionSimplifierArm64(graph, stats);
       SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
       GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch");
+      HInstructionScheduling* scheduling =
+          new (arena) HInstructionScheduling(graph, instruction_set);
       HOptimization* arm64_optimizations[] = {
         simplifier,
         side_effects,
-        gvn
+        gvn,
+        scheduling,
       };
       RunOptimizations(arm64_optimizations, arraysize(arm64_optimizations), pass_observer);
       break;
@@ -995,6 +999,7 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena,
                           &dex_file,
                           *code_item,
                           compiler_driver,
+                          codegen.get(),
                           compilation_stats_.get(),
                           interpreter_metadata,
                           dex_cache,
@@ -1129,6 +1134,25 @@ bool IsCompilingWithCoreImage() {
   return false;
 }
 
+bool EncodeArtMethodInInlineInfo(ArtMethod* method ATTRIBUTE_UNUSED) {
+  // Note: the runtime is null only for unit testing.
+  return Runtime::Current() == nullptr || !Runtime::Current()->IsAotCompiler();
+}
+
+bool CanEncodeInlinedMethodInStackMap(const DexFile& caller_dex_file, ArtMethod* callee) {
+  if (!Runtime::Current()->IsAotCompiler()) {
+    // JIT can always encode methods in stack maps.
+    return true;
+  }
+  if (IsSameDexFile(caller_dex_file, *callee->GetDexFile())) {
+    return true;
+  }
+  // TODO(ngeoffray): Support more AOT cases for inlining:
+  // - methods in multidex
+  // - methods in boot image for on-device non-PIC compilation.
+  return false;
+}
+
 bool OptimizingCompiler::JitCompile(Thread* self,
                                     jit::JitCodeCache* code_cache,
                                     ArtMethod* method,
diff --git a/compiler/optimizing/optimizing_compiler.h b/compiler/optimizing/optimizing_compiler.h
index 0c89da12e8..d8cea30a6b 100644
--- a/compiler/optimizing/optimizing_compiler.h
+++ b/compiler/optimizing/optimizing_compiler.h
@@ -17,10 +17,15 @@
 #ifndef ART_COMPILER_OPTIMIZING_OPTIMIZING_COMPILER_H_
 #define ART_COMPILER_OPTIMIZING_OPTIMIZING_COMPILER_H_
 
+#include "base/mutex.h"
+#include "globals.h"
+
 namespace art {
 
+class ArtMethod;
 class Compiler;
 class CompilerDriver;
+class DexFile;
 
 Compiler* CreateOptimizingCompiler(CompilerDriver* driver);
 
@@ -29,6 +34,10 @@ Compiler* CreateOptimizingCompiler(CompilerDriver* driver);
 // information for checking invariants.
 bool IsCompilingWithCoreImage();
 
+bool EncodeArtMethodInInlineInfo(ArtMethod* method);
+bool CanEncodeInlinedMethodInStackMap(const DexFile& caller_dex_file, ArtMethod* callee)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_OPTIMIZING_COMPILER_H_
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 58d90176cd..bf963b8996 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -64,6 +64,9 @@ LiveInterval* BuildInterval(const size_t ranges[][2],
 void RemoveSuspendChecks(HGraph* graph) {
   for (HBasicBlock* block : graph->GetBlocks()) {
     if (block != nullptr) {
+      if (block->GetLoopInformation() != nullptr) {
+        block->GetLoopInformation()->SetSuspendCheck(nullptr);
+      }
       for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
         HInstruction* current = it.Current();
         if (current->IsSuspendCheck()) {
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
new file mode 100644
index 0000000000..d65d20cf43
--- /dev/null
+++ b/compiler/optimizing/scheduler.cc
@@ -0,0 +1,610 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+
+#include "prepare_for_register_allocation.h"
+#include "scheduler.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "scheduler_arm64.h"
+#endif
+
+namespace art {
+
+void SchedulingGraph::AddDependency(SchedulingNode* node,
+                                    SchedulingNode* dependency,
+                                    bool is_data_dependency) {
+  if (node == nullptr || dependency == nullptr) {
+    // A `nullptr` node indicates an instruction out of scheduling range (eg. in
+    // an other block), so we do not need to add a dependency edge to the graph.
+    return;
+  }
+
+  if (is_data_dependency) {
+    if (!HasImmediateDataDependency(node, dependency)) {
+      node->AddDataPredecessor(dependency);
+    }
+  } else if (!HasImmediateOtherDependency(node, dependency)) {
+    node->AddOtherPredecessor(dependency);
+  }
+}
+
+static bool MayHaveReorderingDependency(SideEffects node, SideEffects other) {
+  // Read after write.
+  if (node.MayDependOn(other)) {
+    return true;
+  }
+
+  // Write after read.
+  if (other.MayDependOn(node)) {
+    return true;
+  }
+
+  // Memory write after write.
+  if (node.DoesAnyWrite() && other.DoesAnyWrite()) {
+    return true;
+  }
+
+  return false;
+}
+
+
+// Check whether `node` depends on `other`, taking into account `SideEffect`
+// information and `CanThrow` information.
+static bool HasSideEffectDependency(const HInstruction* node, const HInstruction* other) {
+  if (MayHaveReorderingDependency(node->GetSideEffects(), other->GetSideEffects())) {
+    return true;
+  }
+
+  if (other->CanThrow() && node->GetSideEffects().DoesAnyWrite()) {
+    return true;
+  }
+
+  if (other->GetSideEffects().DoesAnyWrite() && node->CanThrow()) {
+    return true;
+  }
+
+  if (other->CanThrow() && node->CanThrow()) {
+    return true;
+  }
+
+  // Check side-effect dependency between ArrayGet and BoundsCheck.
+  if (node->IsArrayGet() && other->IsBoundsCheck() && node->InputAt(1) == other) {
+    return true;
+  }
+
+  return false;
+}
+
+void SchedulingGraph::AddDependencies(HInstruction* instruction, bool is_scheduling_barrier) {
+  SchedulingNode* instruction_node = GetNode(instruction);
+
+  // Define-use dependencies.
+  for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
+    AddDataDependency(GetNode(use.GetUser()), instruction_node);
+  }
+
+  // Scheduling barrier dependencies.
+  DCHECK(!is_scheduling_barrier || contains_scheduling_barrier_);
+  if (contains_scheduling_barrier_) {
+    // A barrier depends on instructions after it. And instructions before the
+    // barrier depend on it.
+    for (HInstruction* other = instruction->GetNext(); other != nullptr; other = other->GetNext()) {
+      SchedulingNode* other_node = GetNode(other);
+      bool other_is_barrier = other_node->IsSchedulingBarrier();
+      if (is_scheduling_barrier || other_is_barrier) {
+        AddOtherDependency(other_node, instruction_node);
+      }
+      if (other_is_barrier) {
+        // This other scheduling barrier guarantees ordering of instructions after
+        // it, so avoid creating additional useless dependencies in the graph.
+        // For example if we have
+        //     instr_1
+        //     barrier_2
+        //     instr_3
+        //     barrier_4
+        //     instr_5
+        // we only create the following non-data dependencies
+        //     1 -> 2
+        //     2 -> 3
+        //     2 -> 4
+        //     3 -> 4
+        //     4 -> 5
+        // and do not create
+        //     1 -> 4
+        //     2 -> 5
+        // Note that in this example we could also avoid creating the dependency
+        // `2 -> 4`.  But if we remove `instr_3` that dependency is required to
+        // order the barriers. So we generate it to avoid a special case.
+        break;
+      }
+    }
+  }
+
+  // Side effect dependencies.
+  if (!instruction->GetSideEffects().DoesNothing() || instruction->CanThrow()) {
+    for (HInstruction* other = instruction->GetNext(); other != nullptr; other = other->GetNext()) {
+      SchedulingNode* other_node = GetNode(other);
+      if (other_node->IsSchedulingBarrier()) {
+        // We have reached a scheduling barrier so we can stop further
+        // processing.
+        DCHECK(HasImmediateOtherDependency(other_node, instruction_node));
+        break;
+      }
+      if (HasSideEffectDependency(other, instruction)) {
+        AddOtherDependency(other_node, instruction_node);
+      }
+    }
+  }
+
+  // Environment dependencies.
+  // We do not need to process those if the instruction is a scheduling barrier,
+  // since the barrier already has non-data dependencies on all following
+  // instructions.
+  if (!is_scheduling_barrier) {
+    for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) {
+      // Note that here we could stop processing if the environment holder is
+      // across a scheduling barrier. But checking this would likely require
+      // more work than simply iterating through environment uses.
+      AddOtherDependency(GetNode(use.GetUser()->GetHolder()), instruction_node);
+    }
+  }
+}
+
+bool SchedulingGraph::HasImmediateDataDependency(const SchedulingNode* node,
+                                                 const SchedulingNode* other) const {
+  return ContainsElement(node->GetDataPredecessors(), other);
+}
+
+bool SchedulingGraph::HasImmediateDataDependency(const HInstruction* instruction,
+                                                 const HInstruction* other_instruction) const {
+  const SchedulingNode* node = GetNode(instruction);
+  const SchedulingNode* other = GetNode(other_instruction);
+  if (node == nullptr || other == nullptr) {
+    // Both instructions must be in current basic block, i.e. the SchedulingGraph can see their
+    // corresponding SchedulingNode in the graph, and tell whether there is a dependency.
+    // Otherwise there is no dependency from SchedulingGraph's perspective, for example,
+    // instruction and other_instruction are in different basic blocks.
+    return false;
+  }
+  return HasImmediateDataDependency(node, other);
+}
+
+bool SchedulingGraph::HasImmediateOtherDependency(const SchedulingNode* node,
+                                                  const SchedulingNode* other) const {
+  return ContainsElement(node->GetOtherPredecessors(), other);
+}
+
+bool SchedulingGraph::HasImmediateOtherDependency(const HInstruction* instruction,
+                                                  const HInstruction* other_instruction) const {
+  const SchedulingNode* node = GetNode(instruction);
+  const SchedulingNode* other = GetNode(other_instruction);
+  if (node == nullptr || other == nullptr) {
+    // Both instructions must be in current basic block, i.e. the SchedulingGraph can see their
+    // corresponding SchedulingNode in the graph, and tell whether there is a dependency.
+    // Otherwise there is no dependency from SchedulingGraph's perspective, for example,
+    // instruction and other_instruction are in different basic blocks.
+    return false;
+  }
+  return HasImmediateOtherDependency(node, other);
+}
+
+static const std::string InstructionTypeId(const HInstruction* instruction) {
+  std::string id;
+  Primitive::Type type = instruction->GetType();
+  if (type == Primitive::kPrimNot) {
+    id.append("l");
+  } else {
+    id.append(Primitive::Descriptor(instruction->GetType()));
+  }
+  // Use lower-case to be closer to the `HGraphVisualizer` output.
+  id[0] = std::tolower(id[0]);
+  id.append(std::to_string(instruction->GetId()));
+  return id;
+}
+
+// Ideally we would reuse the graph visualizer code, but it is not available
+// from here and it is not worth moving all that code only for our use.
+static void DumpAsDotNode(std::ostream& output, const SchedulingNode* node) {
+  const HInstruction* instruction = node->GetInstruction();
+  // Use the instruction typed id as the node identifier.
+  std::string instruction_id = InstructionTypeId(instruction);
+  output << instruction_id << "[shape=record, label=\""
+      << instruction_id << ' ' << instruction->DebugName() << " [";
+  // List the instruction's inputs in its description. When visualizing the
+  // graph this helps differentiating data inputs from other dependencies.
+  const char* seperator = "";
+  for (const HInstruction* input : instruction->GetInputs()) {
+    output << seperator << InstructionTypeId(input);
+    seperator = ",";
+  }
+  output << "]";
+  // Other properties of the node.
+  output << "\\ninternal_latency: " << node->GetInternalLatency();
+  output << "\\ncritical_path: " << node->GetCriticalPath();
+  if (node->IsSchedulingBarrier()) {
+    output << "\\n(barrier)";
+  }
+  output << "\"];\n";
+  // We want program order to go from top to bottom in the graph output, so we
+  // reverse the edges and specify `dir=back`.
+  for (const SchedulingNode* predecessor : node->GetDataPredecessors()) {
+    const HInstruction* predecessor_instruction = predecessor->GetInstruction();
+    output << InstructionTypeId(predecessor_instruction) << ":s -> " << instruction_id << ":n "
+        << "[label=\"" << predecessor->GetLatency() << "\",dir=back]\n";
+  }
+  for (const SchedulingNode* predecessor : node->GetOtherPredecessors()) {
+    const HInstruction* predecessor_instruction = predecessor->GetInstruction();
+    output << InstructionTypeId(predecessor_instruction) << ":s -> " << instruction_id << ":n "
+        << "[dir=back,color=blue]\n";
+  }
+}
+
+void SchedulingGraph::DumpAsDotGraph(const std::string& description,
+                                     const ArenaVector<SchedulingNode*>& initial_candidates) {
+  // TODO(xueliang): ideally we should move scheduling information into HInstruction, after that
+  // we should move this dotty graph dump feature to visualizer, and have a compiler option for it.
+  std::ofstream output("scheduling_graphs.dot", std::ofstream::out | std::ofstream::app);
+  // Description of this graph, as a comment.
+  output << "// " << description << "\n";
+  // Start the dot graph. Use an increasing index for easier differentiation.
+  output << "digraph G {\n";
+  for (const auto& entry : nodes_map_) {
+    DumpAsDotNode(output, entry.second);
+  }
+  // Create a fake 'end_of_scheduling' node to help visualization of critical_paths.
+  for (auto node : initial_candidates) {
+    const HInstruction* instruction = node->GetInstruction();
+    output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n "
+      << "[label=\"" << node->GetLatency() << "\",dir=back]\n";
+  }
+  // End of the dot graph.
+  output << "}\n";
+  output.close();
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::SelectMaterializedCondition(
+    ArenaVector<SchedulingNode*>* nodes, const SchedulingGraph& graph) const {
+  // Schedule condition inputs that can be materialized immediately before their use.
+  // In following example, after we've scheduled HSelect, we want LessThan to be scheduled
+  // immediately, because it is a materialized condition, and will be emitted right before HSelect
+  // in codegen phase.
+  //
+  // i20 HLessThan [...]                  HLessThan    HAdd      HAdd
+  // i21 HAdd [...]                ===>      |          |         |
+  // i22 HAdd [...]                          +----------+---------+
+  // i23 HSelect [i21, i22, i20]                     HSelect
+
+  if (prev_select_ == nullptr) {
+    return nullptr;
+  }
+
+  const HInstruction* instruction = prev_select_->GetInstruction();
+  const HCondition* condition = nullptr;
+  DCHECK(instruction != nullptr);
+
+  if (instruction->IsIf()) {
+    condition = instruction->AsIf()->InputAt(0)->AsCondition();
+  } else if (instruction->IsSelect()) {
+    condition = instruction->AsSelect()->GetCondition()->AsCondition();
+  }
+
+  SchedulingNode* condition_node = (condition != nullptr) ? graph.GetNode(condition) : nullptr;
+
+  if ((condition_node != nullptr) &&
+      condition->HasOnlyOneNonEnvironmentUse() &&
+      ContainsElement(*nodes, condition_node)) {
+    DCHECK(!condition_node->HasUnscheduledSuccessors());
+    // Remove the condition from the list of candidates and schedule it.
+    RemoveElement(*nodes, condition_node);
+    return condition_node;
+  }
+
+  return nullptr;
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::PopHighestPriorityNode(
+    ArenaVector<SchedulingNode*>* nodes, const SchedulingGraph& graph) {
+  DCHECK(!nodes->empty());
+  SchedulingNode* select_node = nullptr;
+
+  // Optimize for materialized condition and its emit before use scenario.
+  select_node = SelectMaterializedCondition(nodes, graph);
+
+  if (select_node == nullptr) {
+    // Get highest priority node based on critical path information.
+    select_node = (*nodes)[0];
+    size_t select = 0;
+    for (size_t i = 1, e = nodes->size(); i < e; i++) {
+      SchedulingNode* check = (*nodes)[i];
+      SchedulingNode* candidate = (*nodes)[select];
+      select_node = GetHigherPrioritySchedulingNode(candidate, check);
+      if (select_node == check) {
+        select = i;
+      }
+    }
+    DeleteNodeAtIndex(nodes, select);
+  }
+
+  prev_select_ = select_node;
+  return select_node;
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::GetHigherPrioritySchedulingNode(
+    SchedulingNode* candidate, SchedulingNode* check) const {
+  uint32_t candidate_path = candidate->GetCriticalPath();
+  uint32_t check_path = check->GetCriticalPath();
+  // First look at the critical_path.
+  if (check_path != candidate_path) {
+    return check_path < candidate_path ? check : candidate;
+  }
+  // If both critical paths are equal, schedule instructions with a higher latency
+  // first in program order.
+  return check->GetLatency() < candidate->GetLatency() ? check : candidate;
+}
+
+void HScheduler::Schedule(HGraph* graph) {
+  for (HBasicBlock* block : graph->GetReversePostOrder()) {
+    if (IsSchedulable(block)) {
+      Schedule(block);
+    }
+  }
+}
+
+void HScheduler::Schedule(HBasicBlock* block) {
+  ArenaVector<SchedulingNode*> scheduling_nodes(arena_->Adapter(kArenaAllocScheduler));
+
+  // Build the scheduling graph.
+  scheduling_graph_.Clear();
+  for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    HInstruction* instruction = it.Current();
+    SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+    CalculateLatency(node);
+    scheduling_nodes.push_back(node);
+  }
+
+  if (scheduling_graph_.Size() <= 1) {
+    scheduling_graph_.Clear();
+    return;
+  }
+
+  cursor_ = block->GetLastInstruction();
+
+  // Find the initial candidates for scheduling.
+  candidates_.clear();
+  for (SchedulingNode* node : scheduling_nodes) {
+    if (!node->HasUnscheduledSuccessors()) {
+      node->MaybeUpdateCriticalPath(node->GetLatency());
+      candidates_.push_back(node);
+    }
+  }
+
+  ArenaVector<SchedulingNode*> initial_candidates(arena_->Adapter(kArenaAllocScheduler));
+  if (kDumpDotSchedulingGraphs) {
+    // Remember the list of initial candidates for debug output purposes.
+    initial_candidates.assign(candidates_.begin(), candidates_.end());
+  }
+
+  // Schedule all nodes.
+  while (!candidates_.empty()) {
+    Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+  }
+
+  if (kDumpDotSchedulingGraphs) {
+    // Dump the graph in `dot` format.
+    HGraph* graph = block->GetGraph();
+    std::stringstream description;
+    description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
+        << " B" << block->GetBlockId();
+    scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+  }
+}
+
+void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+  // Check whether any of the node's predecessors will be valid candidates after
+  // this node is scheduled.
+  uint32_t path_to_node = scheduling_node->GetCriticalPath();
+  for (SchedulingNode* predecessor : scheduling_node->GetDataPredecessors()) {
+    predecessor->MaybeUpdateCriticalPath(
+        path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
+    predecessor->DecrementNumberOfUnscheduledSuccessors();
+    if (!predecessor->HasUnscheduledSuccessors()) {
+      candidates_.push_back(predecessor);
+    }
+  }
+  for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
+    // Do not update the critical path.
+    // The 'other' (so 'non-data') dependencies (usually) do not represent a
+    // 'material' dependency of nodes on others. They exist for program
+    // correctness. So we do not use them to compute the critical path.
+    predecessor->DecrementNumberOfUnscheduledSuccessors();
+    if (!predecessor->HasUnscheduledSuccessors()) {
+      candidates_.push_back(predecessor);
+    }
+  }
+
+  Schedule(scheduling_node->GetInstruction());
+}
+
+// Move an instruction after cursor instruction inside one basic block.
+static void MoveAfterInBlock(HInstruction* instruction, HInstruction* cursor) {
+  DCHECK_EQ(instruction->GetBlock(), cursor->GetBlock());
+  DCHECK_NE(cursor, cursor->GetBlock()->GetLastInstruction());
+  DCHECK(!instruction->IsControlFlow());
+  DCHECK(!cursor->IsControlFlow());
+  instruction->MoveBefore(cursor->GetNext(), /* do_checks */ false);
+}
+
+void HScheduler::Schedule(HInstruction* instruction) {
+  if (instruction == cursor_) {
+    cursor_ = cursor_->GetPrevious();
+  } else {
+    MoveAfterInBlock(instruction, cursor_);
+  }
+}
+
+bool HScheduler::IsSchedulable(const HInstruction* instruction) const {
+  // We want to avoid exhaustively listing all instructions, so we first check
+  // for instruction categories that we know are safe.
+  if (instruction->IsControlFlow() ||
+      instruction->IsConstant()) {
+    return true;
+  }
+  // Currently all unary and binary operations are safe to schedule, so avoid
+  // checking for each of them individually.
+  // Since nothing prevents a new scheduling-unsafe HInstruction to subclass
+  // HUnaryOperation (or HBinaryOperation), check in debug mode that we have
+  // the exhaustive lists here.
+  if (instruction->IsUnaryOperation()) {
+    DCHECK(instruction->IsBooleanNot() ||
+           instruction->IsNot() ||
+           instruction->IsNeg()) << "unexpected instruction " << instruction->DebugName();
+    return true;
+  }
+  if (instruction->IsBinaryOperation()) {
+    DCHECK(instruction->IsAdd() ||
+           instruction->IsAnd() ||
+           instruction->IsCompare() ||
+           instruction->IsCondition() ||
+           instruction->IsDiv() ||
+           instruction->IsMul() ||
+           instruction->IsOr() ||
+           instruction->IsRem() ||
+           instruction->IsRor() ||
+           instruction->IsShl() ||
+           instruction->IsShr() ||
+           instruction->IsSub() ||
+           instruction->IsUShr() ||
+           instruction->IsXor()) << "unexpected instruction " << instruction->DebugName();
+    return true;
+  }
+  // The scheduler should not see any of these.
+  DCHECK(!instruction->IsParallelMove()) << "unexpected instruction " << instruction->DebugName();
+  // List of instructions explicitly excluded:
+  //    HClearException
+  //    HClinitCheck
+  //    HDeoptimize
+  //    HLoadClass
+  //    HLoadException
+  //    HMemoryBarrier
+  //    HMonitorOperation
+  //    HNativeDebugInfo
+  //    HThrow
+  //    HTryBoundary
+  // TODO: Some of the instructions above may be safe to schedule (maybe as
+  // scheduling barriers).
+  return instruction->IsArrayGet() ||
+      instruction->IsArraySet() ||
+      instruction->IsArrayLength() ||
+      instruction->IsBoundType() ||
+      instruction->IsBoundsCheck() ||
+      instruction->IsCheckCast() ||
+      instruction->IsClassTableGet() ||
+      instruction->IsCurrentMethod() ||
+      instruction->IsDivZeroCheck() ||
+      instruction->IsInstanceFieldGet() ||
+      instruction->IsInstanceFieldSet() ||
+      instruction->IsInstanceOf() ||
+      instruction->IsInvokeInterface() ||
+      instruction->IsInvokeStaticOrDirect() ||
+      instruction->IsInvokeUnresolved() ||
+      instruction->IsInvokeVirtual() ||
+      instruction->IsLoadString() ||
+      instruction->IsNewArray() ||
+      instruction->IsNewInstance() ||
+      instruction->IsNullCheck() ||
+      instruction->IsPackedSwitch() ||
+      instruction->IsParameterValue() ||
+      instruction->IsPhi() ||
+      instruction->IsReturn() ||
+      instruction->IsReturnVoid() ||
+      instruction->IsSelect() ||
+      instruction->IsStaticFieldGet() ||
+      instruction->IsStaticFieldSet() ||
+      instruction->IsSuspendCheck() ||
+      instruction->IsTypeConversion() ||
+      instruction->IsUnresolvedInstanceFieldGet() ||
+      instruction->IsUnresolvedInstanceFieldSet() ||
+      instruction->IsUnresolvedStaticFieldGet() ||
+      instruction->IsUnresolvedStaticFieldSet();
+}
+
+bool HScheduler::IsSchedulable(const HBasicBlock* block) const {
+  // We may be only interested in loop blocks.
+  if (only_optimize_loop_blocks_ && !block->IsInLoop()) {
+    return false;
+  }
+  if (block->GetTryCatchInformation() != nullptr) {
+    // Do not schedule blocks that are part of try-catch.
+    // Because scheduler cannot see if catch block has assumptions on the instruction order in
+    // the try block. In following example, if we enable scheduler for the try block,
+    // MulitiplyAccumulate may be scheduled before DivZeroCheck,
+    // which can result in an incorrect value in the catch block.
+    //   try {
+    //     a = a/b;    // DivZeroCheck
+    //                 // Div
+    //     c = c*d+e;  // MulitiplyAccumulate
+    //   } catch {System.out.print(c); }
+    return false;
+  }
+  // Check whether all instructions in this block are schedulable.
+  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    if (!IsSchedulable(it.Current())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const {
+  return instr->IsControlFlow() ||
+      // Don't break calling convention.
+      instr->IsParameterValue() ||
+      // Code generation of goto relies on SuspendCheck's position.
+      instr->IsSuspendCheck();
+}
+
+void HInstructionScheduling::Run(bool only_optimize_loop_blocks,
+                                 bool schedule_randomly) {
+  // Avoid compilation error when compiling for unsupported instruction set.
+  UNUSED(only_optimize_loop_blocks);
+  UNUSED(schedule_randomly);
+  switch (instruction_set_) {
+#ifdef ART_ENABLE_CODEGEN_arm64
+    case kArm64: {
+      // Phase-local allocator that allocates scheduler internal data structures like
+      // scheduling nodes, internel nodes map, dependencies, etc.
+      ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool());
+
+      CriticalPathSchedulingNodeSelector critical_path_selector;
+      RandomSchedulingNodeSelector random_selector;
+      SchedulingNodeSelector* selector = schedule_randomly
+          ? static_cast<SchedulingNodeSelector*>(&random_selector)
+          : static_cast<SchedulingNodeSelector*>(&critical_path_selector);
+
+      arm64::HSchedulerARM64 scheduler(&arena_allocator, selector);
+      scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
+      scheduler.Schedule(graph_);
+      break;
+    }
+#endif
+    default:
+      break;
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
new file mode 100644
index 0000000000..ab0dad4300
--- /dev/null
+++ b/compiler/optimizing/scheduler.h
@@ -0,0 +1,487 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
+
+#include <fstream>
+
+#include "base/time_utils.h"
+#include "driver/compiler_driver.h"
+#include "nodes.h"
+#include "optimization.h"
+
+namespace art {
+
+// General description of instruction scheduling.
+//
+// This pass tries to improve the quality of the generated code by reordering
+// instructions in the graph to avoid execution delays caused by execution
+// dependencies.
+// Currently, scheduling is performed at the block level, so no `HInstruction`
+// ever leaves its block in this pass.
+//
+// The scheduling process iterates through blocks in the graph. For blocks that
+// we can and want to schedule:
+// 1) Build a dependency graph for instructions.
+//    It includes data dependencies (inputs/uses), but also environment
+//    dependencies and side-effect dependencies.
+// 2) Schedule the dependency graph.
+//    This is a topological sort of the dependency graph, using heuristics to
+//    decide what node to scheduler first when there are multiple candidates.
+//
+// A few factors impacting the quality of the scheduling are:
+// - The heuristics used to decide what node to schedule in the topological sort
+//   when there are multiple valid candidates. There is a wide range of
+//   complexity possible here, going from a simple model only considering
+//   latencies, to a super detailed CPU pipeline model.
+// - Fewer dependencies in the dependency graph give more freedom for the
+//   scheduling heuristics. For example de-aliasing can allow possibilities for
+//   reordering of memory accesses.
+// - The level of abstraction of the IR. It is easier to evaluate scheduling for
+//   IRs that translate to a single assembly instruction than for IRs
+//   that generate multiple assembly instructions or generate different code
+//   depending on properties of the IR.
+// - Scheduling is performed before register allocation, it is not aware of the
+//   impact of moving instructions on register allocation.
+//
+//
+// The scheduling code uses the terms predecessors, successors, and dependencies.
+// This can be confusing at times, so here are clarifications.
+// These terms are used from the point of view of the program dependency graph. So
+// the inputs of an instruction are part of its dependencies, and hence part its
+// predecessors. So the uses of an instruction are (part of) its successors.
+// (Side-effect dependencies can yield predecessors or successors that are not
+// inputs or uses.)
+//
+// Here is a trivial example. For the Java code:
+//
+//    int a = 1 + 2;
+//
+// we would have the instructions
+//
+//    i1 HIntConstant 1
+//    i2 HIntConstant 2
+//    i3 HAdd [i1,i2]
+//
+// `i1` and `i2` are predecessors of `i3`.
+// `i3` is a successor of `i1` and a successor of `i2`.
+// In a scheduling graph for this code we would have three nodes `n1`, `n2`,
+// and `n3` (respectively for instructions `i1`, `i1`, and `i3`).
+// Conceptually the program dependency graph for this would contain two edges
+//
+//    n1 -> n3
+//    n2 -> n3
+//
+// Since we schedule backwards (starting from the last instruction in each basic
+// block), the implementation of nodes keeps a list of pointers their
+// predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
+//
+// Node dependencies are also referred to from the program dependency graph
+// point of view: we say that node `B` immediately depends on `A` if there is an
+// edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
+// `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
+// `n2`.
+// Since nodes in the scheduling graph keep a list of their predecessors, node
+// `B` will have a pointer to its predecessor `A`.
+// As we schedule backwards, `B` will be selected for scheduling before `A` is.
+//
+// So the scheduling for the example above could happen as follow
+//
+//    |---------------------------+------------------------|
+//    | candidates for scheduling | instructions scheduled |
+//    | --------------------------+------------------------|
+//
+// The only node without successors is `n3`, so it is the only initial
+// candidate.
+//
+//    | n3                        | (none)                 |
+//
+// We schedule `n3` as the last (and only) instruction. All its predecessors
+// that do not have any unscheduled successors become candidate. That is, `n1`
+// and `n2` become candidates.
+//
+//    | n1, n2                    | n3                     |
+//
+// One of the candidates is selected. In practice this is where scheduling
+// heuristics kick in, to decide which of the candidates should be selected.
+// In this example, let it be `n1`. It is scheduled before previously scheduled
+// nodes (in program order). There are no other nodes to add to the list of
+// candidates.
+//
+//    | n2                        | n1                     |
+//    |                           | n3                     |
+//
+// The only candidate available for scheduling is `n2`. Schedule it before
+// (in program order) the previously scheduled nodes.
+//
+//    | (none)                    | n2                     |
+//    |                           | n1                     |
+//    |                           | n3                     |
+//    |---------------------------+------------------------|
+//
+// So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
+// In this trivial example, it does not matter which of `i1` and `i2` is
+// scheduled first since they are constants. However the same process would
+// apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
+
+// Set to true to have instruction scheduling dump scheduling graphs to the file
+// `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
+static constexpr bool kDumpDotSchedulingGraphs = false;
+
+// Typically used as a default instruction latency.
+static constexpr uint32_t kGenericInstructionLatency = 1;
+
+class HScheduler;
+
+/**
+ * A node representing an `HInstruction` in the `SchedulingGraph`.
+ */
+class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
+ public:
+  SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
+      : latency_(0),
+        internal_latency_(0),
+        critical_path_(0),
+        instruction_(instr),
+        is_scheduling_barrier_(is_scheduling_barrier),
+        data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
+        other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
+        num_unscheduled_successors_(0) {
+    data_predecessors_.reserve(kPreallocatedPredecessors);
+  }
+
+  void AddDataPredecessor(SchedulingNode* predecessor) {
+    data_predecessors_.push_back(predecessor);
+    predecessor->num_unscheduled_successors_++;
+  }
+
+  void AddOtherPredecessor(SchedulingNode* predecessor) {
+    other_predecessors_.push_back(predecessor);
+    predecessor->num_unscheduled_successors_++;
+  }
+
+  void DecrementNumberOfUnscheduledSuccessors() {
+    num_unscheduled_successors_--;
+  }
+
+  void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
+    critical_path_ = std::max(critical_path_, other_critical_path);
+  }
+
+  bool HasUnscheduledSuccessors() const {
+    return num_unscheduled_successors_ != 0;
+  }
+
+  HInstruction* GetInstruction() const { return instruction_; }
+  uint32_t GetLatency() const { return latency_; }
+  void SetLatency(uint32_t latency) { latency_ = latency; }
+  uint32_t GetInternalLatency() const { return internal_latency_; }
+  void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
+  uint32_t GetCriticalPath() const { return critical_path_; }
+  bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
+  const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
+  const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }
+
+ private:
+  // The latency of this node. It represents the latency between the moment the
+  // last instruction for this node has executed to the moment the result
+  // produced by this node is available to users.
+  uint32_t latency_;
+  // This represents the time spent *within* the generated code for this node.
+  // It should be zero for nodes that only generate a single instruction.
+  uint32_t internal_latency_;
+
+  // The critical path from this instruction to the end of scheduling. It is
+  // used by the scheduling heuristics to measure the priority of this instruction.
+  // It is defined as
+  //     critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
+  // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
+  // `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
+  uint32_t critical_path_;
+
+  // The instruction that this node represents.
+  HInstruction* const instruction_;
+
+  // If a node is scheduling barrier, other nodes cannot be scheduled before it.
+  const bool is_scheduling_barrier_;
+
+  // The lists of predecessors. They cannot be scheduled before this node. Once
+  // this node is scheduled, we check whether any of its predecessors has become a
+  // valid candidate for scheduling.
+  // Predecessors in `data_predecessors_` are data dependencies. Those in
+  // `other_predecessors_` contain side-effect dependencies, environment
+  // dependencies, and scheduling barrier dependencies.
+  ArenaVector<SchedulingNode*> data_predecessors_;
+  ArenaVector<SchedulingNode*> other_predecessors_;
+
+  // The number of unscheduled successors for this node. This number is
+  // decremented as successors are scheduled. When it reaches zero this node
+  // becomes a valid candidate to schedule.
+  uint32_t num_unscheduled_successors_;
+
+  static constexpr size_t kPreallocatedPredecessors = 4;
+};
+
+/*
+ * Directed acyclic graph for scheduling.
+ */
+class SchedulingGraph : public ValueObject {
+ public:
+  SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
+      : scheduler_(scheduler),
+        arena_(arena),
+        contains_scheduling_barrier_(false),
+        nodes_map_(arena_->Adapter(kArenaAllocScheduler)) {}
+
+  SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
+    SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
+    nodes_map_.Insert(std::make_pair(instr, node));
+    contains_scheduling_barrier_ |= is_scheduling_barrier;
+    AddDependencies(instr, is_scheduling_barrier);
+    return node;
+  }
+
+  void Clear() {
+    nodes_map_.Clear();
+    contains_scheduling_barrier_ = false;
+  }
+
+  SchedulingNode* GetNode(const HInstruction* instr) const {
+    auto it = nodes_map_.Find(instr);
+    if (it == nodes_map_.end()) {
+      return nullptr;
+    } else {
+      return it->second;
+    }
+  }
+
+  bool IsSchedulingBarrier(const HInstruction* instruction) const;
+
+  bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
+  bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
+  bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
+  bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
+
+  size_t Size() const {
+    return nodes_map_.Size();
+  }
+
+  // Dump the scheduling graph, in dot file format, appending it to the file
+  // `scheduling_graphs.dot`.
+  void DumpAsDotGraph(const std::string& description,
+                      const ArenaVector<SchedulingNode*>& initial_candidates);
+
+ protected:
+  void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
+  void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
+    AddDependency(node, dependency, /*is_data_dependency*/true);
+  }
+  void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
+    AddDependency(node, dependency, /*is_data_dependency*/false);
+  }
+
+  // Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
+  void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
+
+  const HScheduler* const scheduler_;
+
+  ArenaAllocator* const arena_;
+
+  bool contains_scheduling_barrier_;
+
+  ArenaHashMap<const HInstruction*, SchedulingNode*> nodes_map_;
+};
+
+/*
+ * The visitors derived from this base class are used by schedulers to evaluate
+ * the latencies of `HInstruction`s.
+ */
+class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
+ public:
+  // This class and its sub-classes will never be used to drive a visit of an
+  // `HGraph` but only to visit `HInstructions` one at a time, so we do not need
+  // to pass a valid graph to `HGraphDelegateVisitor()`.
+  SchedulingLatencyVisitor() : HGraphDelegateVisitor(nullptr) {}
+
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
+        "Architecture-specific scheduling latency visitors must handle all instructions"
+        " (potentially by overriding the generic `VisitInstruction()`.";
+    UNREACHABLE();
+  }
+
+  void Visit(HInstruction* instruction) {
+    instruction->Accept(this);
+  }
+
+  void CalculateLatency(SchedulingNode* node) {
+    // By default nodes have no internal latency.
+    last_visited_internal_latency_ = 0;
+    Visit(node->GetInstruction());
+  }
+
+  uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
+  uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
+
+ protected:
+  // The latency of the most recent visited SchedulingNode.
+  // This is for reporting the latency value to the user of this visitor.
+  uint32_t last_visited_latency_;
+  // This represents the time spent *within* the generated code for the most recent visited
+  // SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
+  uint32_t last_visited_internal_latency_;
+};
+
+class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
+ public:
+  virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                                 const SchedulingGraph& graph) = 0;
+  virtual ~SchedulingNodeSelector() {}
+ protected:
+  static void DeleteNodeAtIndex(ArenaVector<SchedulingNode*>* nodes, size_t index) {
+    (*nodes)[index] = nodes->back();
+    nodes->pop_back();
+  }
+};
+
+/*
+ * Select a `SchedulingNode` at random within the candidates.
+ */
+class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
+ public:
+  explicit RandomSchedulingNodeSelector() : seed_(0) {
+    seed_  = static_cast<uint32_t>(NanoTime());
+    srand(seed_);
+  }
+
+  SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                         const SchedulingGraph& graph) OVERRIDE {
+    UNUSED(graph);
+    DCHECK(!nodes->empty());
+    size_t select = rand_r(&seed_) % nodes->size();
+    SchedulingNode* select_node = (*nodes)[select];
+    DeleteNodeAtIndex(nodes, select);
+    return select_node;
+  }
+
+  uint32_t seed_;
+};
+
+/*
+ * Select a `SchedulingNode` according to critical path information,
+ * with heuristics to favor certain instruction patterns like materialized condition.
+ */
+class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
+ public:
+  CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
+
+  SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                         const SchedulingGraph& graph) OVERRIDE;
+
+ protected:
+  SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
+                                                  SchedulingNode* check) const;
+
+  SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode*>* nodes,
+                                               const SchedulingGraph& graph) const;
+
+ private:
+  const SchedulingNode* prev_select_;
+};
+
+class HScheduler {
+ public:
+  HScheduler(ArenaAllocator* arena,
+             SchedulingLatencyVisitor* latency_visitor,
+             SchedulingNodeSelector* selector)
+      : arena_(arena),
+        latency_visitor_(latency_visitor),
+        selector_(selector),
+        only_optimize_loop_blocks_(true),
+        scheduling_graph_(this, arena),
+        candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
+  virtual ~HScheduler() {}
+
+  void Schedule(HGraph* graph);
+
+  void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
+
+  // Instructions can not be rescheduled across a scheduling barrier.
+  virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
+
+ protected:
+  void Schedule(HBasicBlock* block);
+  void Schedule(SchedulingNode* scheduling_node);
+  void Schedule(HInstruction* instruction);
+
+  // Any instruction returning `false` via this method will prevent its
+  // containing basic block from being scheduled.
+  // This method is used to restrict scheduling to instructions that we know are
+  // safe to handle.
+  virtual bool IsSchedulable(const HInstruction* instruction) const;
+  bool IsSchedulable(const HBasicBlock* block) const;
+
+  void CalculateLatency(SchedulingNode* node) {
+    latency_visitor_->CalculateLatency(node);
+    node->SetLatency(latency_visitor_->GetLastVisitedLatency());
+    node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
+  }
+
+  ArenaAllocator* const arena_;
+  SchedulingLatencyVisitor* const latency_visitor_;
+  SchedulingNodeSelector* const selector_;
+  bool only_optimize_loop_blocks_;
+
+  // We instantiate the members below as part of this class to avoid
+  // instantiating them locally for every chunk scheduled.
+  SchedulingGraph scheduling_graph_;
+  // A pointer indicating where the next instruction to be scheduled will be inserted.
+  HInstruction* cursor_;
+  // The list of candidates for scheduling. A node becomes a candidate when all
+  // its predecessors have been scheduled.
+  ArenaVector<SchedulingNode*> candidates_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HScheduler);
+};
+
+inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
+  return scheduler_->IsSchedulingBarrier(instruction);
+}
+
+class HInstructionScheduling : public HOptimization {
+ public:
+  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
+      : HOptimization(graph, kInstructionScheduling),
+        instruction_set_(instruction_set) {}
+
+  void Run() {
+    Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false);
+  }
+  void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
+
+  static constexpr const char* kInstructionScheduling = "scheduler";
+
+  const InstructionSet instruction_set_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_H_
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
new file mode 100644
index 0000000000..e3701fbcb1
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scheduler_arm64.h"
+#include "code_generator_utils.h"
+
+namespace art {
+namespace arm64 {
+
+void SchedulingLatencyVisitorARM64::VisitBinaryOperation(HBinaryOperation* instr) {
+  last_visited_latency_ = Primitive::IsFloatingPointType(instr->GetResultType())
+      ? kArm64FloatingPointOpLatency
+      : kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitBitwiseNegatedRight(
+    HBitwiseNegatedRight* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArm64DataProcWithShifterOp(
+    HArm64DataProcWithShifterOp* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64DataProcWithShifterOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitIntermediateAddress(
+    HIntermediateAddress* ATTRIBUTE_UNUSED) {
+  // Although the code generated is a simple `add` instruction, we found through empirical results
+  // that spacing it from its use in memory accesses was beneficial.
+  last_visited_latency_ = kArm64IntegerOpLatency + 2;
+}
+
+void SchedulingLatencyVisitorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArrayGet(HArrayGet* instruction) {
+  if (!instruction->GetArray()->IsIntermediateAddress()) {
+    // Take the intermediate address computation into account.
+    last_visited_internal_latency_ = kArm64IntegerOpLatency;
+  }
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArrayLength(HArrayLength* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArraySet(HArraySet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryStoreLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64IntegerOpLatency;
+  // Users do not use any data results.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM64::VisitDiv(HDiv* instr) {
+  Primitive::Type type = instr->GetResultType();
+  switch (type) {
+    case Primitive::kPrimFloat:
+      last_visited_latency_ = kArm64DivFloatLatency;
+      break;
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArm64DivDoubleLatency;
+      break;
+    default:
+      // Follow the code path used by code generation.
+      if (instr->GetRight()->IsConstant()) {
+        int64_t imm = Int64FromConstant(instr->GetRight()->AsConstant());
+        if (imm == 0) {
+          last_visited_internal_latency_ = 0;
+          last_visited_latency_ = 0;
+        } else if (imm == 1 || imm == -1) {
+          last_visited_internal_latency_ = 0;
+          last_visited_latency_ = kArm64IntegerOpLatency;
+        } else if (IsPowerOfTwo(AbsOrMin(imm))) {
+          last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+          last_visited_latency_ = kArm64IntegerOpLatency;
+        } else {
+          DCHECK(imm <= -2 || imm >= 2);
+          last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+          last_visited_latency_ = kArm64MulIntegerLatency;
+        }
+      } else {
+        last_visited_latency_ = kArm64DivIntegerLatency;
+      }
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitInstanceFieldGet(HInstanceFieldGet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64LoadStringInternalLatency;
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitMul(HMul* instr) {
+  last_visited_latency_ = Primitive::IsFloatingPointType(instr->GetResultType())
+      ? kArm64MulFloatingPointLatency
+      : kArm64MulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64IntegerOpLatency + kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitNewInstance(HNewInstance* instruction) {
+  if (instruction->IsStringAlloc()) {
+    last_visited_internal_latency_ = 2 + kArm64MemoryLoadLatency + kArm64CallInternalLatency;
+  } else {
+    last_visited_internal_latency_ = kArm64CallInternalLatency;
+  }
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitRem(HRem* instruction) {
+  if (Primitive::IsFloatingPointType(instruction->GetResultType())) {
+    last_visited_internal_latency_ = kArm64CallInternalLatency;
+    last_visited_latency_ = kArm64CallLatency;
+  } else {
+    // Follow the code path used by code generation.
+    if (instruction->GetRight()->IsConstant()) {
+      int64_t imm = Int64FromConstant(instruction->GetRight()->AsConstant());
+      if (imm == 0) {
+        last_visited_internal_latency_ = 0;
+        last_visited_latency_ = 0;
+      } else if (imm == 1 || imm == -1) {
+        last_visited_internal_latency_ = 0;
+        last_visited_latency_ = kArm64IntegerOpLatency;
+      } else if (IsPowerOfTwo(AbsOrMin(imm))) {
+        last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+        last_visited_latency_ = kArm64IntegerOpLatency;
+      } else {
+        DCHECK(imm <= -2 || imm >= 2);
+        last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+        last_visited_latency_ = kArm64MulIntegerLatency;
+      }
+    } else {
+      last_visited_internal_latency_ = kArm64DivIntegerLatency;
+      last_visited_latency_ = kArm64MulIntegerLatency;
+    }
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitStaticFieldGet(HStaticFieldGet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitSuspendCheck(HSuspendCheck* instruction) {
+  HBasicBlock* block = instruction->GetBlock();
+  DCHECK((block->GetLoopInformation() != nullptr) ||
+         (block->IsEntryBlock() && instruction->GetNext()->IsGoto()));
+  // Users do not use any data results.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM64::VisitTypeConversion(HTypeConversion* instr) {
+  if (Primitive::IsFloatingPointType(instr->GetResultType()) ||
+      Primitive::IsFloatingPointType(instr->GetInputType())) {
+    last_visited_latency_ = kArm64TypeConversionFloatingPointIntegerLatency;
+  } else {
+    last_visited_latency_ = kArm64IntegerOpLatency;
+  }
+}
+
+}  // namespace arm64
+}  // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
new file mode 100644
index 0000000000..702027c535
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
+
+#include "scheduler.h"
+
+namespace art {
+namespace arm64 {
+
+static constexpr uint32_t kArm64MemoryLoadLatency = 5;
+static constexpr uint32_t kArm64MemoryStoreLatency = 3;
+
+static constexpr uint32_t kArm64CallInternalLatency = 10;
+static constexpr uint32_t kArm64CallLatency = 5;
+
+// AArch64 instruction latency.
+// We currently assume that all arm64 CPUs share the same instruction latency list.
+static constexpr uint32_t kArm64IntegerOpLatency = 2;
+static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
+
+
+static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
+static constexpr uint32_t kArm64DivDoubleLatency = 30;
+static constexpr uint32_t kArm64DivFloatLatency = 15;
+static constexpr uint32_t kArm64DivIntegerLatency = 5;
+static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
+static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
+static constexpr uint32_t kArm64MulIntegerLatency = 6;
+static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
+
+class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
+ public:
+  // Default visitor for instructions not handled specifically below.
+  void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) {
+    last_visited_latency_ = kArm64IntegerOpLatency;
+  }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
+  M(ArrayGet         , unused)                   \
+  M(ArrayLength      , unused)                   \
+  M(ArraySet         , unused)                   \
+  M(BinaryOperation  , unused)                   \
+  M(BoundsCheck      , unused)                   \
+  M(Div              , unused)                   \
+  M(InstanceFieldGet , unused)                   \
+  M(InstanceOf       , unused)                   \
+  M(Invoke           , unused)                   \
+  M(LoadString       , unused)                   \
+  M(Mul              , unused)                   \
+  M(NewArray         , unused)                   \
+  M(NewInstance      , unused)                   \
+  M(Rem              , unused)                   \
+  M(StaticFieldGet   , unused)                   \
+  M(SuspendCheck     , unused)                   \
+  M(TypeConversion   , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+  M(BitwiseNegatedRight, unused)                 \
+  M(MultiplyAccumulate, unused)                  \
+  M(IntermediateAddress, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused)  \
+  void Visit##type(H##type* instruction) OVERRIDE;
+
+  FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+};
+
+class HSchedulerARM64 : public HScheduler {
+ public:
+  HSchedulerARM64(ArenaAllocator* arena, SchedulingNodeSelector* selector)
+      : HScheduler(arena, &arm64_latency_visitor_, selector) {}
+  ~HSchedulerARM64() OVERRIDE {}
+
+  bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
+#define CASE_INSTRUCTION_KIND(type, unused) case \
+  HInstruction::InstructionKind::k##type:
+    switch (instruction->GetKind()) {
+      FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
+        return true;
+      FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
+        return true;
+      default:
+        return HScheduler::IsSchedulable(instruction);
+    }
+#undef CASE_INSTRUCTION_KIND
+  }
+
+ private:
+  SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
+  DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
+};
+
+}  // namespace arm64
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
new file mode 100644
index 0000000000..31d13e2a26
--- /dev/null
+++ b/compiler/optimizing/scheduler_test.cc
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/arena_allocator.h"
+#include "builder.h"
+#include "codegen_test_utils.h"
+#include "common_compiler_test.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+#include "pc_relative_fixups_x86.h"
+#include "register_allocator.h"
+#include "scheduler.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "scheduler_arm64.h"
+#endif
+
+namespace art {
+
+// Return all combinations of ISA and code generator that are executable on
+// hardware, or on simulator, and that we'd like to test.
+static ::std::vector<CodegenTargetConfig> GetTargetConfigs() {
+  ::std::vector<CodegenTargetConfig> v;
+  ::std::vector<CodegenTargetConfig> test_config_candidates = {
+#ifdef ART_ENABLE_CODEGEN_arm
+    CodegenTargetConfig(kArm, create_codegen_arm),
+    CodegenTargetConfig(kThumb2, create_codegen_arm),
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
+    CodegenTargetConfig(kArm64, create_codegen_arm64),
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
+    CodegenTargetConfig(kX86, create_codegen_x86),
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86_64
+    CodegenTargetConfig(kX86_64, create_codegen_x86_64),
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
+    CodegenTargetConfig(kMips, create_codegen_mips),
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips64
+    CodegenTargetConfig(kMips64, create_codegen_mips64)
+#endif
+  };
+
+  for (auto test_config : test_config_candidates) {
+    if (CanExecute(test_config.GetInstructionSet())) {
+      v.push_back(test_config);
+    }
+  }
+
+  return v;
+}
+
+class SchedulerTest : public CommonCompilerTest {};
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+TEST_F(SchedulerTest, DependencyGraph) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = CreateGraph(&allocator);
+  HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
+  HBasicBlock* block1 = new (&allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->AddBlock(block1);
+  graph->SetEntryBlock(entry);
+
+  // entry:
+  // array         ParameterValue
+  // c1            IntConstant
+  // c2            IntConstant
+  // block1:
+  // add1          Add [c1, c2]
+  // add2          Add [add1, c2]
+  // mul           Mul [add1, add2]
+  // div_check     DivZeroCheck [add2] (env: add2, mul)
+  // div           Div [add1, div_check]
+  // array_get1    ArrayGet [array, add1]
+  // array_set1    ArraySet [array, add1, add2]
+  // array_get2    ArrayGet [array, add1]
+  // array_set2    ArraySet [array, add1, add2]
+
+  HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(),
+                                                         dex::TypeIndex(0),
+                                                         0,
+                                                         Primitive::kPrimNot);
+  HInstruction* c1 = graph->GetIntConstant(1);
+  HInstruction* c2 = graph->GetIntConstant(10);
+  HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2);
+  HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2);
+  HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2);
+  HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0);
+  HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0);
+  HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+  HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
+  HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+  HInstruction* array_set2 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
+
+  DCHECK(div_check->CanThrow());
+
+  entry->AddInstruction(array);
+
+  HInstruction* block_instructions[] = {add1,
+                                        add2,
+                                        mul,
+                                        div_check,
+                                        div,
+                                        array_get1,
+                                        array_set1,
+                                        array_get2,
+                                        array_set2};
+  for (auto instr : block_instructions) {
+    block1->AddInstruction(instr);
+  }
+
+  HEnvironment* environment = new (&allocator) HEnvironment(&allocator,
+                                                            2,
+                                                            graph->GetArtMethod(),
+                                                            0,
+                                                            div_check);
+  div_check->SetRawEnvironment(environment);
+  environment->SetRawEnvAt(0, add2);
+  add2->AddEnvUseAt(div_check->GetEnvironment(), 0);
+  environment->SetRawEnvAt(1, mul);
+  mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
+
+  ArenaAllocator* arena = graph->GetArena();
+  CriticalPathSchedulingNodeSelector critical_path_selector;
+  arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector);
+  SchedulingGraph scheduling_graph(&scheduler, arena);
+  // Instructions must be inserted in reverse order into the scheduling graph.
+  for (auto instr : ReverseRange(block_instructions)) {
+    scheduling_graph.AddNode(instr);
+  }
+
+  // Should not have dependencies cross basic blocks.
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2));
+
+  // Define-use dependency.
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
+
+  // Read and write dependencies
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
+
+  // Env dependency.
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
+  ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
+
+  // CanThrow.
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+}
+#endif
+
+static void CompileWithRandomSchedulerAndRun(const uint16_t* data,
+                                             bool has_result,
+                                             int expected) {
+  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+    HGraph* graph = CreateCFG(&arena, data);
+
+    // Schedule the graph randomly.
+    HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
+    scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
+
+    RunCode(target_config,
+            graph,
+            [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
+            has_result, expected);
+  }
+}
+
+TEST_F(SchedulerTest, RandomScheduling) {
+  //
+  // Java source: crafted code to make sure (random) scheduling should get correct result.
+  //
+  //  int result = 0;
+  //  float fr = 10.0f;
+  //  for (int i = 1; i < 10; i++) {
+  //    fr ++;
+  //    int t1 = result >> i;
+  //    int t2 = result * i;
+  //    result = result + t1 - t2;
+  //    fr = fr / i;
+  //    result += (int)fr;
+  //  }
+  //  return result;
+  //
+  const uint16_t data[] = SIX_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 0 << 12 | 2 << 8,          // const/4 v2, #int 0
+    Instruction::CONST_HIGH16 | 0 << 8, 0x4120,       // const/high16 v0, #float 10.0 // #41200000
+    Instruction::CONST_4 | 1 << 12 | 1 << 8,          // const/4 v1, #int 1
+    Instruction::CONST_16 | 5 << 8, 0x000a,           // const/16 v5, #int 10
+    Instruction::IF_GE | 5 << 12 | 1 << 8, 0x0014,    // if-ge v1, v5, 001a // +0014
+    Instruction::CONST_HIGH16 | 5 << 8, 0x3f80,       // const/high16 v5, #float 1.0 // #3f800000
+    Instruction::ADD_FLOAT_2ADDR | 5 << 12 | 0 << 8,  // add-float/2addr v0, v5
+    Instruction::SHR_INT | 3 << 8, 1 << 8 | 2 ,       // shr-int v3, v2, v1
+    Instruction::MUL_INT | 4 << 8, 1 << 8 | 2,        // mul-int v4, v2, v1
+    Instruction::ADD_INT | 5 << 8, 3 << 8 | 2,        // add-int v5, v2, v3
+    Instruction::SUB_INT | 2 << 8, 4 << 8 | 5,        // sub-int v2, v5, v4
+    Instruction::INT_TO_FLOAT | 1 << 12 | 5 << 8,     // int-to-float v5, v1
+    Instruction::DIV_FLOAT_2ADDR | 5 << 12 | 0 << 8,  // div-float/2addr v0, v5
+    Instruction::FLOAT_TO_INT | 0 << 12 | 5 << 8,     // float-to-int v5, v0
+    Instruction::ADD_INT_2ADDR | 5 << 12 | 2 << 8,    // add-int/2addr v2, v5
+    Instruction::ADD_INT_LIT8 | 1 << 8, 1 << 8 | 1,   // add-int/lit8 v1, v1, #int 1 // #01
+    Instruction::GOTO | 0xeb << 8,                    // goto 0004 // -0015
+    Instruction::RETURN | 2 << 8);                    // return v2
+
+  constexpr int kNumberOfRuns = 10;
+  for (int i = 0; i < kNumberOfRuns; ++i) {
+    CompileWithRandomSchedulerAndRun(data, true, 138774);
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index c5294107ae..f07f02a719 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -42,8 +42,6 @@ void HSharpening::Run() {
       HInstruction* instruction = it.Current();
       if (instruction->IsInvokeStaticOrDirect()) {
         ProcessInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect());
-      } else if (instruction->IsLoadClass()) {
-        ProcessLoadClass(instruction->AsLoadClass());
       } else if (instruction->IsLoadString()) {
         ProcessLoadString(instruction->AsLoadString());
       }
@@ -97,7 +95,9 @@ void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   // class is initialized already or being initialized, and the call will not
   // be invoked once the method is deoptimized.
 
-  if (callee == codegen_->GetGraph()->GetArtMethod()) {
+  // We don't optimize for debuggable as it would prevent us from obsoleting the method in some
+  // situations.
+  if (callee == codegen_->GetGraph()->GetArtMethod() && !codegen_->GetGraph()->IsDebuggable()) {
     // Recursive call.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kRecursive;
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallSelf;
@@ -131,104 +131,93 @@ void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   invoke->SetDispatchInfo(dispatch_info);
 }
 
-void HSharpening::ProcessLoadClass(HLoadClass* load_class) {
-  ScopedObjectAccess soa(Thread::Current());
-  SharpenClass(load_class, codegen_, compiler_driver_);
-}
-
-void HSharpening::SharpenClass(HLoadClass* load_class,
-                               CodeGenerator* codegen,
-                               CompilerDriver* compiler_driver) {
+HLoadClass::LoadKind HSharpening::SharpenClass(HLoadClass* load_class,
+                                               CodeGenerator* codegen,
+                                               CompilerDriver* compiler_driver,
+                                               const DexCompilationUnit& dex_compilation_unit) {
   Handle<mirror::Class> klass = load_class->GetClass();
   DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCacheViaMethod ||
          load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass)
       << load_class->GetLoadKind();
   DCHECK(!load_class->IsInBootImage()) << "HLoadClass should not be optimized before sharpening.";
 
+  HLoadClass::LoadKind load_kind = load_class->GetLoadKind();
+
   if (load_class->NeedsAccessCheck()) {
     // We need to call the runtime anyway, so we simply get the class as that call's return value.
-    return;
-  }
-
-  if (load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass) {
+  } else if (load_kind == HLoadClass::LoadKind::kReferrersClass) {
     // Loading from the ArtMethod* is the most efficient retrieval in code size.
     // TODO: This may not actually be true for all architectures and
     // locations of target classes. The additional register pressure
     // for using the ArtMethod* should be considered.
-    return;
-  }
-
-  const DexFile& dex_file = load_class->GetDexFile();
-  dex::TypeIndex type_index = load_class->GetTypeIndex();
+  } else {
+    const DexFile& dex_file = load_class->GetDexFile();
+    dex::TypeIndex type_index = load_class->GetTypeIndex();
 
-  bool is_in_boot_image = false;
-  HLoadClass::LoadKind desired_load_kind = static_cast<HLoadClass::LoadKind>(-1);
-  Runtime* runtime = Runtime::Current();
-  if (codegen->GetCompilerOptions().IsBootImage()) {
-    // Compiling boot image. Check if the class is a boot image class.
-    DCHECK(!runtime->UseJitCompilation());
-    if (!compiler_driver->GetSupportBootImageFixup()) {
-      // compiler_driver_test. Do not sharpen.
-      desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
-    } else if ((klass.Get() != nullptr) && compiler_driver->IsImageClass(
-        dex_file.StringDataByIdx(dex_file.GetTypeId(type_index).descriptor_idx_))) {
-      is_in_boot_image = true;
-      desired_load_kind = codegen->GetCompilerOptions().GetCompilePic()
-          ? HLoadClass::LoadKind::kBootImageLinkTimePcRelative
-          : HLoadClass::LoadKind::kBootImageLinkTimeAddress;
+    bool is_in_boot_image = false;
+    HLoadClass::LoadKind desired_load_kind = HLoadClass::LoadKind::kInvalid;
+    Runtime* runtime = Runtime::Current();
+    if (codegen->GetCompilerOptions().IsBootImage()) {
+      // Compiling boot image. Check if the class is a boot image class.
+      DCHECK(!runtime->UseJitCompilation());
+      if (!compiler_driver->GetSupportBootImageFixup()) {
+        // compiler_driver_test. Do not sharpen.
+        desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+      } else if ((klass.Get() != nullptr) && compiler_driver->IsImageClass(
+          dex_file.StringDataByIdx(dex_file.GetTypeId(type_index).descriptor_idx_))) {
+        is_in_boot_image = true;
+        desired_load_kind = codegen->GetCompilerOptions().GetCompilePic()
+            ? HLoadClass::LoadKind::kBootImageLinkTimePcRelative
+            : HLoadClass::LoadKind::kBootImageLinkTimeAddress;
+      } else {
+        // Not a boot image class.
+        DCHECK(ContainsElement(compiler_driver->GetDexFilesForOatFile(), &dex_file));
+        desired_load_kind = HLoadClass::LoadKind::kBssEntry;
+      }
     } else {
-      // Not a boot image class.
-      DCHECK(ContainsElement(compiler_driver->GetDexFilesForOatFile(), &dex_file));
-      desired_load_kind = HLoadClass::LoadKind::kBssEntry;
-    }
-  } else {
-    is_in_boot_image = (klass.Get() != nullptr) &&
-        runtime->GetHeap()->ObjectIsInBootImageSpace(klass.Get());
-    if (runtime->UseJitCompilation()) {
-      // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
-      // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
-      if (is_in_boot_image) {
-        // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
+      is_in_boot_image = (klass.Get() != nullptr) &&
+          runtime->GetHeap()->ObjectIsInBootImageSpace(klass.Get());
+      if (runtime->UseJitCompilation()) {
+        // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
+        // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
+        if (is_in_boot_image) {
+          // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
+          desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
+        } else if (klass.Get() != nullptr) {
+          desired_load_kind = HLoadClass::LoadKind::kJitTableAddress;
+        } else {
+          // Class not loaded yet. This happens when the dex code requesting
+          // this `HLoadClass` hasn't been executed in the interpreter.
+          // Fallback to the dex cache.
+          // TODO(ngeoffray): Generate HDeoptimize instead.
+          desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+        }
+      } else if (is_in_boot_image && !codegen->GetCompilerOptions().GetCompilePic()) {
+        // AOT app compilation. Check if the class is in the boot image.
         desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
-      } else if (klass.Get() != nullptr) {
-        desired_load_kind = HLoadClass::LoadKind::kJitTableAddress;
       } else {
-        // Class not loaded yet. This happens when the dex code requesting
-        // this `HLoadClass` hasn't been executed in the interpreter.
-        // Fallback to the dex cache.
-        // TODO(ngeoffray): Generate HDeoptimize instead.
-        desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
+        // Not JIT and either the klass is not in boot image or we are compiling in PIC mode.
+        desired_load_kind = HLoadClass::LoadKind::kBssEntry;
       }
-    } else if (is_in_boot_image && !codegen->GetCompilerOptions().GetCompilePic()) {
-      // AOT app compilation. Check if the class is in the boot image.
-      desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
-    } else {
-      // Not JIT and either the klass is not in boot image or we are compiling in PIC mode.
-      desired_load_kind = HLoadClass::LoadKind::kBssEntry;
     }
-  }
-  DCHECK_NE(desired_load_kind, static_cast<HLoadClass::LoadKind>(-1));
+    DCHECK_NE(desired_load_kind, HLoadClass::LoadKind::kInvalid);
 
-  if (is_in_boot_image) {
-    load_class->MarkInBootImage();
+    if (is_in_boot_image) {
+      load_class->MarkInBootImage();
+    }
+    load_kind = codegen->GetSupportedLoadClassKind(desired_load_kind);
   }
 
-  HLoadClass::LoadKind load_kind = codegen->GetSupportedLoadClassKind(desired_load_kind);
-  switch (load_kind) {
-    case HLoadClass::LoadKind::kBootImageLinkTimeAddress:
-    case HLoadClass::LoadKind::kBootImageLinkTimePcRelative:
-    case HLoadClass::LoadKind::kBssEntry:
-    case HLoadClass::LoadKind::kDexCacheViaMethod:
-      load_class->SetLoadKindWithTypeReference(load_kind, dex_file, type_index);
-      break;
-    case HLoadClass::LoadKind::kBootImageAddress:
-    case HLoadClass::LoadKind::kJitTableAddress:
-      load_class->SetLoadKind(load_kind);
-      break;
-    default:
-      LOG(FATAL) << "Unexpected load kind: " << load_kind;
-      UNREACHABLE();
+  if (!IsSameDexFile(load_class->GetDexFile(), *dex_compilation_unit.GetDexFile())) {
+    if ((load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) ||
+        (load_kind == HLoadClass::LoadKind::kBssEntry)) {
+      // We actually cannot reference this class, we're forced to bail.
+      // We cannot reference this class with Bss, as the entrypoint will lookup the class
+      // in the caller's dex file, but that dex file does not reference the class.
+      return HLoadClass::LoadKind::kInvalid;
+    }
   }
+  return load_kind;
 }
 
 void HSharpening::ProcessLoadString(HLoadString* load_string) {
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index ae3d83ef2c..4240b2f339 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_SHARPENING_H_
 #define ART_COMPILER_OPTIMIZING_SHARPENING_H_
 
+#include "nodes.h"
 #include "optimization.h"
 
 namespace art {
@@ -24,7 +25,6 @@ namespace art {
 class CodeGenerator;
 class CompilerDriver;
 class DexCompilationUnit;
-class HInvokeStaticOrDirect;
 
 // Optimization that tries to improve the way we dispatch methods and access types,
 // fields, etc. Besides actual method sharpening based on receiver type (for example
@@ -47,15 +47,15 @@ class HSharpening : public HOptimization {
 
   static constexpr const char* kSharpeningPassName = "sharpening";
 
-  // Used internally but also by the inliner.
-  static void SharpenClass(HLoadClass* load_class,
-                           CodeGenerator* codegen,
-                           CompilerDriver* compiler_driver)
+  // Used by the builder and the inliner.
+  static HLoadClass::LoadKind SharpenClass(HLoadClass* load_class,
+                                           CodeGenerator* codegen,
+                                           CompilerDriver* compiler_driver,
+                                           const DexCompilationUnit& dex_compilation_unit)
     REQUIRES_SHARED(Locks::mutator_lock_);
 
  private:
   void ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke);
-  void ProcessLoadClass(HLoadClass* load_class);
   void ProcessLoadString(HLoadString* load_string);
 
   CodeGenerator* codegen_;
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index ae1e369999..487e4dd498 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -17,8 +17,10 @@
 #include "ssa_builder.h"
 
 #include "bytecode_utils.h"
+#include "mirror/class-inl.h"
 #include "nodes.h"
 #include "reference_type_propagation.h"
+#include "scoped_thread_state_change-inl.h"
 #include "ssa_phi_elimination.h"
 
 namespace art {
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index a9a1e6f592..10f5cab907 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -16,7 +16,11 @@
 
 #include "stack_map_stream.h"
 
-#include "art_method.h"
+#include <unordered_map>
+
+#include "art_method-inl.h"
+#include "base/stl_util.h"
+#include "optimizing/optimizing_compiler.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 
@@ -40,6 +44,7 @@ void StackMapStream::BeginStackMapEntry(uint32_t dex_pc,
   current_entry_.inline_infos_start_index = inline_infos_.size();
   current_entry_.dex_register_map_hash = 0;
   current_entry_.same_dex_register_map_as_ = kNoSameDexMapFound;
+  current_entry_.stack_mask_index = 0;
   if (num_dex_registers != 0) {
     current_entry_.live_dex_registers_mask =
         ArenaBitVector::Create(allocator_, num_dex_registers, true, kArenaAllocStackMapStream);
@@ -103,11 +108,6 @@ void StackMapStream::AddDexRegisterEntry(DexRegisterLocation::Kind kind, int32_t
   current_dex_register_++;
 }
 
-static bool EncodeArtMethodInInlineInfo(ArtMethod* method ATTRIBUTE_UNUSED) {
-  // Note: the runtime is null only for unit testing.
-  return Runtime::Current() == nullptr || !Runtime::Current()->IsAotCompiler();
-}
-
 void StackMapStream::BeginInlineInfoEntry(ArtMethod* method,
                                           uint32_t dex_pc,
                                           uint32_t num_dex_registers,
@@ -153,32 +153,43 @@ CodeOffset StackMapStream::ComputeMaxNativePcCodeOffset() const {
 }
 
 size_t StackMapStream::PrepareForFillIn() {
-  int stack_mask_number_of_bits = stack_mask_max_ + 1;  // Need room for max element too.
+  const size_t stack_mask_size_in_bits = stack_mask_max_ + 1;  // Need room for max element too.
+  const size_t number_of_stack_masks = PrepareStackMasks(stack_mask_size_in_bits);
+  const size_t register_mask_size_in_bits = MinimumBitsToStore(register_mask_max_);
+  const size_t number_of_register_masks = PrepareRegisterMasks();
   dex_register_maps_size_ = ComputeDexRegisterMapsSize();
   ComputeInlineInfoEncoding();  // needs dex_register_maps_size_.
   inline_info_size_ = inline_infos_.size() * inline_info_encoding_.GetEntrySize();
   CodeOffset max_native_pc_offset = ComputeMaxNativePcCodeOffset();
-  // The stack map contains compressed native offsets.
-  size_t stack_map_size = stack_map_encoding_.SetFromSizes(max_native_pc_offset.CompressedValue(),
-                                                           dex_pc_max_,
-                                                           dex_register_maps_size_,
-                                                           inline_info_size_,
-                                                           register_mask_max_,
-                                                           stack_mask_number_of_bits);
-  stack_maps_size_ = stack_maps_.size() * stack_map_size;
+  // The stack map contains compressed native PC offsets.
+  const size_t stack_map_size = stack_map_encoding_.SetFromSizes(
+      max_native_pc_offset.CompressedValue(),
+      dex_pc_max_,
+      dex_register_maps_size_,
+      inline_info_size_,
+      number_of_register_masks,
+      number_of_stack_masks);
+  stack_maps_size_ = RoundUp(stack_maps_.size() * stack_map_size, kBitsPerByte) / kBitsPerByte;
   dex_register_location_catalog_size_ = ComputeDexRegisterLocationCatalogSize();
-
-  size_t non_header_size =
+  const size_t stack_masks_bits = number_of_stack_masks * stack_mask_size_in_bits;
+  const size_t register_masks_bits = number_of_register_masks * register_mask_size_in_bits;
+  // Register masks are last, stack masks are right before that last.
+  // They are both bit packed / aligned.
+  const size_t non_header_size =
       stack_maps_size_ +
       dex_register_location_catalog_size_ +
       dex_register_maps_size_ +
-      inline_info_size_;
+      inline_info_size_ +
+      RoundUp(stack_masks_bits + register_masks_bits, kBitsPerByte) / kBitsPerByte;
 
   // Prepare the CodeInfo variable-sized encoding.
   CodeInfoEncoding code_info_encoding;
   code_info_encoding.non_header_size = non_header_size;
   code_info_encoding.number_of_stack_maps = stack_maps_.size();
-  code_info_encoding.stack_map_size_in_bytes = stack_map_size;
+  code_info_encoding.number_of_stack_masks = number_of_stack_masks;
+  code_info_encoding.number_of_register_masks = number_of_register_masks;
+  code_info_encoding.stack_mask_size_in_bits = stack_mask_size_in_bits;
+  code_info_encoding.register_mask_size_in_bits = register_mask_size_in_bits;
   code_info_encoding.stack_map_encoding = stack_map_encoding_;
   code_info_encoding.inline_info_encoding = inline_info_encoding_;
   code_info_encoding.number_of_location_catalog_entries = location_catalog_entries_.size();
@@ -321,18 +332,8 @@ void StackMapStream::FillIn(MemoryRegion region) {
 
     stack_map.SetDexPc(stack_map_encoding_, entry.dex_pc);
     stack_map.SetNativePcCodeOffset(stack_map_encoding_, entry.native_pc_code_offset);
-    stack_map.SetRegisterMask(stack_map_encoding_, entry.register_mask);
-    size_t number_of_stack_mask_bits = stack_map.GetNumberOfStackMaskBits(stack_map_encoding_);
-    if (entry.sp_mask != nullptr) {
-      for (size_t bit = 0; bit < number_of_stack_mask_bits; bit++) {
-        stack_map.SetStackMaskBit(stack_map_encoding_, bit, entry.sp_mask->IsBitSet(bit));
-      }
-    } else {
-      // The MemoryRegion does not have to be zeroed, so make sure we clear the bits.
-      for (size_t bit = 0; bit < number_of_stack_mask_bits; bit++) {
-        stack_map.SetStackMaskBit(stack_map_encoding_, bit, false);
-      }
-    }
+    stack_map.SetRegisterMaskIndex(stack_map_encoding_, entry.register_mask_index);
+    stack_map.SetStackMaskIndex(stack_map_encoding_, entry.stack_mask_index);
 
     if (entry.num_dex_registers == 0 || (entry.live_dex_registers_mask->NumSetBits() == 0)) {
       // No dex map available.
@@ -353,7 +354,7 @@ void StackMapStream::FillIn(MemoryRegion region) {
         next_dex_register_map_offset += register_region.size();
         DexRegisterMap dex_register_map(register_region);
         stack_map.SetDexRegisterMapOffset(
-            stack_map_encoding_, register_region.start() - dex_register_locations_region.start());
+            stack_map_encoding_, register_region.begin() - dex_register_locations_region.begin());
 
         // Set the dex register location.
         FillInDexRegisterMap(dex_register_map,
@@ -373,7 +374,7 @@ void StackMapStream::FillIn(MemoryRegion region) {
 
       // Currently relative to the dex register map.
       stack_map.SetInlineDescriptorOffset(
-          stack_map_encoding_, inline_region.start() - dex_register_locations_region.start());
+          stack_map_encoding_, inline_region.begin() - dex_register_locations_region.begin());
 
       inline_info.SetDepth(inline_info_encoding_, entry.inlining_depth);
       DCHECK_LE(entry.inline_infos_start_index + entry.inlining_depth, inline_infos_.size());
@@ -408,7 +409,7 @@ void StackMapStream::FillIn(MemoryRegion region) {
           DexRegisterMap dex_register_map(register_region);
           inline_info.SetDexRegisterMapOffsetAtDepth(
               inline_info_encoding_,
-              depth, register_region.start() - dex_register_locations_region.start());
+              depth, register_region.begin() - dex_register_locations_region.begin());
 
           FillInDexRegisterMap(dex_register_map,
                                inline_entry.num_dex_registers,
@@ -423,6 +424,25 @@ void StackMapStream::FillIn(MemoryRegion region) {
     }
   }
 
+  // Write stack masks table.
+  size_t stack_mask_bits = encoding.stack_mask_size_in_bits;
+  if (stack_mask_bits > 0) {
+    size_t stack_mask_bytes = RoundUp(stack_mask_bits, kBitsPerByte) / kBitsPerByte;
+    for (size_t i = 0; i < encoding.number_of_stack_masks; ++i) {
+      MemoryRegion source(&stack_masks_[i * stack_mask_bytes], stack_mask_bytes);
+      BitMemoryRegion stack_mask = code_info.GetStackMask(encoding, i);
+      for (size_t bit_index = 0; bit_index < encoding.stack_mask_size_in_bits; ++bit_index) {
+        stack_mask.StoreBit(bit_index, source.LoadBit(bit_index));
+      }
+    }
+  }
+
+  // Write register masks table.
+  for (size_t i = 0; i < encoding.number_of_register_masks; ++i) {
+    BitMemoryRegion register_mask = code_info.GetRegisterMask(encoding, i);
+    register_mask.StoreBits(0, register_masks_[i], encoding.register_mask_size_in_bits);
+  }
+
   // Verify all written data in debug build.
   if (kIsDebugBuild) {
     CheckCodeInfo(region);
@@ -536,6 +556,38 @@ void StackMapStream::CheckDexRegisterMap(const CodeInfo& code_info,
   }
 }
 
+size_t StackMapStream::PrepareRegisterMasks() {
+  register_masks_.resize(stack_maps_.size(), 0u);
+  std::unordered_map<uint32_t, size_t> dedupe;
+  for (StackMapEntry& stack_map : stack_maps_) {
+    const size_t index = dedupe.size();
+    stack_map.register_mask_index = dedupe.emplace(stack_map.register_mask, index).first->second;
+    register_masks_[index] = stack_map.register_mask;
+  }
+  return dedupe.size();
+}
+
+size_t StackMapStream::PrepareStackMasks(size_t entry_size_in_bits) {
+  // Preallocate memory since we do not want it to move (the dedup map will point into it).
+  const size_t byte_entry_size = RoundUp(entry_size_in_bits, kBitsPerByte) / kBitsPerByte;
+  stack_masks_.resize(byte_entry_size * stack_maps_.size(), 0u);
+  // For deduplicating we store the stack masks as byte packed for simplicity. We can bit pack later
+  // when copying out from stack_masks_.
+  std::unordered_map<MemoryRegion,
+                     size_t,
+                     FNVHash<MemoryRegion>,
+                     MemoryRegion::ContentEquals> dedup(stack_maps_.size());
+  for (StackMapEntry& stack_map : stack_maps_) {
+    size_t index = dedup.size();
+    MemoryRegion stack_mask(stack_masks_.data() + index * byte_entry_size, byte_entry_size);
+    for (size_t i = 0; i < entry_size_in_bits; i++) {
+      stack_mask.StoreBit(i, stack_map.sp_mask != nullptr && stack_map.sp_mask->IsBitSet(i));
+    }
+    stack_map.stack_mask_index = dedup.emplace(stack_mask, index).first->second;
+  }
+  return dedup.size();
+}
+
 // Check that all StackMapStream inputs are correctly encoded by trying to read them back.
 void StackMapStream::CheckCodeInfo(MemoryRegion region) const {
   CodeInfo code_info(region);
@@ -550,16 +602,19 @@ void StackMapStream::CheckCodeInfo(MemoryRegion region) const {
     DCHECK_EQ(stack_map.GetNativePcOffset(stack_map_encoding, instruction_set_),
               entry.native_pc_code_offset.Uint32Value(instruction_set_));
     DCHECK_EQ(stack_map.GetDexPc(stack_map_encoding), entry.dex_pc);
-    DCHECK_EQ(stack_map.GetRegisterMask(stack_map_encoding), entry.register_mask);
-    size_t num_stack_mask_bits = stack_map.GetNumberOfStackMaskBits(stack_map_encoding);
+    DCHECK_EQ(stack_map.GetRegisterMaskIndex(stack_map_encoding), entry.register_mask_index);
+    DCHECK_EQ(code_info.GetRegisterMaskOf(encoding, stack_map), entry.register_mask);
+    const size_t num_stack_mask_bits = code_info.GetNumberOfStackMaskBits(encoding);
+    DCHECK_EQ(stack_map.GetStackMaskIndex(stack_map_encoding), entry.stack_mask_index);
+    BitMemoryRegion stack_mask = code_info.GetStackMaskOf(encoding, stack_map);
     if (entry.sp_mask != nullptr) {
-      DCHECK_GE(num_stack_mask_bits, entry.sp_mask->GetNumberOfBits());
+      DCHECK_GE(stack_mask.size_in_bits(), entry.sp_mask->GetNumberOfBits());
       for (size_t b = 0; b < num_stack_mask_bits; b++) {
-        DCHECK_EQ(stack_map.GetStackMaskBit(stack_map_encoding, b), entry.sp_mask->IsBitSet(b));
+        DCHECK_EQ(stack_mask.LoadBit(b), entry.sp_mask->IsBitSet(b));
       }
     } else {
       for (size_t b = 0; b < num_stack_mask_bits; b++) {
-        DCHECK_EQ(stack_map.GetStackMaskBit(stack_map_encoding, b), 0u);
+        DCHECK_EQ(stack_mask.LoadBit(b), 0u);
       }
     }
 
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index 8fec472437..b1069a17be 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -68,6 +68,8 @@ class StackMapStream : public ValueObject {
         location_catalog_entries_indices_(allocator->Adapter(kArenaAllocStackMapStream)),
         dex_register_locations_(allocator->Adapter(kArenaAllocStackMapStream)),
         inline_infos_(allocator->Adapter(kArenaAllocStackMapStream)),
+        stack_masks_(allocator->Adapter(kArenaAllocStackMapStream)),
+        register_masks_(allocator->Adapter(kArenaAllocStackMapStream)),
         stack_mask_max_(-1),
         dex_pc_max_(0),
         register_mask_max_(0),
@@ -107,6 +109,8 @@ class StackMapStream : public ValueObject {
     BitVector* live_dex_registers_mask;
     uint32_t dex_register_map_hash;
     size_t same_dex_register_map_as_;
+    uint32_t stack_mask_index;
+    uint32_t register_mask_index;
   };
 
   struct InlineInfoEntry {
@@ -160,6 +164,12 @@ class StackMapStream : public ValueObject {
 
   CodeOffset ComputeMaxNativePcCodeOffset() const;
 
+  // Returns the number of unique stack masks.
+  size_t PrepareStackMasks(size_t entry_size_in_bits);
+
+  // Returns the number of unique register masks.
+  size_t PrepareRegisterMasks();
+
   // Returns the index of an entry with the same dex register map as the current_entry,
   // or kNoSameDexMapFound if no such entry exists.
   size_t FindEntryWithTheSameDexMap();
@@ -193,6 +203,8 @@ class StackMapStream : public ValueObject {
   // A set of concatenated maps of Dex register locations indices to `location_catalog_entries_`.
   ArenaVector<size_t> dex_register_locations_;
   ArenaVector<InlineInfoEntry> inline_infos_;
+  ArenaVector<uint8_t> stack_masks_;
+  ArenaVector<uint32_t> register_masks_;
   int stack_mask_max_;
   uint32_t dex_pc_max_;
   uint32_t register_mask_max_;
diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc
index f68695bcbc..ce6d5c2b22 100644
--- a/compiler/optimizing/stack_map_test.cc
+++ b/compiler/optimizing/stack_map_test.cc
@@ -27,15 +27,16 @@ namespace art {
 // Check that the stack mask of given stack map is identical
 // to the given bit vector. Returns true if they are same.
 static bool CheckStackMask(
+    const CodeInfo& code_info,
+    const CodeInfoEncoding& encoding,
     const StackMap& stack_map,
-    StackMapEncoding& encoding,
     const BitVector& bit_vector) {
-  int number_of_bits = stack_map.GetNumberOfStackMaskBits(encoding);
-  if (bit_vector.GetHighestBitSet() >= number_of_bits) {
+  BitMemoryRegion stack_mask = code_info.GetStackMaskOf(encoding, stack_map);
+  if (bit_vector.GetNumberOfBits() > encoding.stack_mask_size_in_bits) {
     return false;
   }
-  for (int i = 0; i < number_of_bits; ++i) {
-    if (stack_map.GetStackMaskBit(encoding, i) != bit_vector.IsBitSet(i)) {
+  for (size_t i = 0; i < encoding.stack_mask_size_in_bits; ++i) {
+    if (stack_mask.LoadBit(i) != bit_vector.IsBitSet(i)) {
       return false;
     }
   }
@@ -79,9 +80,9 @@ TEST(StackMapTest, Test1) {
   ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64, encoding)));
   ASSERT_EQ(0u, stack_map.GetDexPc(encoding.stack_map_encoding));
   ASSERT_EQ(64u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-  ASSERT_EQ(0x3u, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+  ASSERT_EQ(0x3u, code_info.GetRegisterMaskOf(encoding, stack_map));
 
-  ASSERT_TRUE(CheckStackMask(stack_map, encoding.stack_map_encoding, sp_mask));
+  ASSERT_TRUE(CheckStackMask(code_info, encoding, stack_map, sp_mask));
 
   ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
   DexRegisterMap dex_register_map =
@@ -194,9 +195,9 @@ TEST(StackMapTest, Test2) {
     ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64, encoding)));
     ASSERT_EQ(0u, stack_map.GetDexPc(encoding.stack_map_encoding));
     ASSERT_EQ(64u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-    ASSERT_EQ(0x3u, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+    ASSERT_EQ(0x3u, code_info.GetRegisterMaskOf(encoding, stack_map));
 
-    ASSERT_TRUE(CheckStackMask(stack_map, encoding.stack_map_encoding, sp_mask1));
+    ASSERT_TRUE(CheckStackMask(code_info, encoding, stack_map, sp_mask1));
 
     ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
     DexRegisterMap dex_register_map =
@@ -253,9 +254,9 @@ TEST(StackMapTest, Test2) {
     ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(128u, encoding)));
     ASSERT_EQ(1u, stack_map.GetDexPc(encoding.stack_map_encoding));
     ASSERT_EQ(128u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-    ASSERT_EQ(0xFFu, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+    ASSERT_EQ(0xFFu, code_info.GetRegisterMaskOf(encoding, stack_map));
 
-    ASSERT_TRUE(CheckStackMask(stack_map, encoding.stack_map_encoding, sp_mask2));
+    ASSERT_TRUE(CheckStackMask(code_info, encoding, stack_map, sp_mask2));
 
     ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
     DexRegisterMap dex_register_map =
@@ -307,9 +308,9 @@ TEST(StackMapTest, Test2) {
     ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(192u, encoding)));
     ASSERT_EQ(2u, stack_map.GetDexPc(encoding.stack_map_encoding));
     ASSERT_EQ(192u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-    ASSERT_EQ(0xABu, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+    ASSERT_EQ(0xABu, code_info.GetRegisterMaskOf(encoding, stack_map));
 
-    ASSERT_TRUE(CheckStackMask(stack_map, encoding.stack_map_encoding, sp_mask3));
+    ASSERT_TRUE(CheckStackMask(code_info, encoding, stack_map, sp_mask3));
 
     ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
     DexRegisterMap dex_register_map =
@@ -361,9 +362,9 @@ TEST(StackMapTest, Test2) {
     ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(256u, encoding)));
     ASSERT_EQ(3u, stack_map.GetDexPc(encoding.stack_map_encoding));
     ASSERT_EQ(256u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-    ASSERT_EQ(0xCDu, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+    ASSERT_EQ(0xCDu, code_info.GetRegisterMaskOf(encoding, stack_map));
 
-    ASSERT_TRUE(CheckStackMask(stack_map, encoding.stack_map_encoding, sp_mask4));
+    ASSERT_TRUE(CheckStackMask(code_info, encoding, stack_map, sp_mask4));
 
     ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
     DexRegisterMap dex_register_map =
@@ -443,7 +444,7 @@ TEST(StackMapTest, TestNonLiveDexRegisters) {
   ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64, encoding)));
   ASSERT_EQ(0u, stack_map.GetDexPc(encoding.stack_map_encoding));
   ASSERT_EQ(64u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-  ASSERT_EQ(0x3u, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+  ASSERT_EQ(0x3u, code_info.GetRegisterMaskOf(encoding, stack_map));
 
   ASSERT_TRUE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
   DexRegisterMap dex_register_map =
@@ -642,7 +643,7 @@ TEST(StackMapTest, TestNoDexRegisterMap) {
   ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64, encoding)));
   ASSERT_EQ(0u, stack_map.GetDexPc(encoding.stack_map_encoding));
   ASSERT_EQ(64u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-  ASSERT_EQ(0x3u, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+  ASSERT_EQ(0x3u, code_info.GetRegisterMaskOf(encoding, stack_map));
 
   ASSERT_FALSE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
   ASSERT_FALSE(stack_map.HasInlineInfo(encoding.stack_map_encoding));
@@ -652,7 +653,7 @@ TEST(StackMapTest, TestNoDexRegisterMap) {
   ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(68, encoding)));
   ASSERT_EQ(1u, stack_map.GetDexPc(encoding.stack_map_encoding));
   ASSERT_EQ(68u, stack_map.GetNativePcOffset(encoding.stack_map_encoding, kRuntimeISA));
-  ASSERT_EQ(0x4u, stack_map.GetRegisterMask(encoding.stack_map_encoding));
+  ASSERT_EQ(0x4u, code_info.GetRegisterMaskOf(encoding, stack_map));
 
   ASSERT_FALSE(stack_map.HasDexRegisterMap(encoding.stack_map_encoding));
   ASSERT_FALSE(stack_map.HasInlineInfo(encoding.stack_map_encoding));
@@ -839,4 +840,33 @@ TEST(StackMapTest, CodeOffsetTest) {
   EXPECT_EQ(offset_mips64.Uint32Value(kMips64), kMips64InstructionAlignment);
 }
 
+
+TEST(StackMapTest, TestDeduplicateStackMask) {
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+  StackMapStream stream(&arena, kRuntimeISA);
+
+  ArenaBitVector sp_mask(&arena, 0, true);
+  sp_mask.SetBit(1);
+  sp_mask.SetBit(4);
+  stream.BeginStackMapEntry(0, 4, 0x3, &sp_mask, 0, 0);
+  stream.EndStackMapEntry();
+  stream.BeginStackMapEntry(0, 8, 0x3, &sp_mask, 0, 0);
+  stream.EndStackMapEntry();
+
+  size_t size = stream.PrepareForFillIn();
+  void* memory = arena.Alloc(size, kArenaAllocMisc);
+  MemoryRegion region(memory, size);
+  stream.FillIn(region);
+
+  CodeInfo code_info(region);
+  CodeInfoEncoding encoding = code_info.ExtractEncoding();
+  ASSERT_EQ(2u, code_info.GetNumberOfStackMaps(encoding));
+
+  StackMap stack_map1 = code_info.GetStackMapForNativePcOffset(4, encoding);
+  StackMap stack_map2 = code_info.GetStackMapForNativePcOffset(8, encoding);
+  EXPECT_EQ(stack_map1.GetStackMaskIndex(encoding.stack_map_encoding),
+            stack_map2.GetStackMaskIndex(encoding.stack_map_encoding));
+}
+
 }  // namespace art
diff --git a/compiler/utils/assembler_test_base.h b/compiler/utils/assembler_test_base.h
index e7edf96722..d76cb1c1df 100644
--- a/compiler/utils/assembler_test_base.h
+++ b/compiler/utils/assembler_test_base.h
@@ -26,6 +26,7 @@
 #include "android-base/strings.h"
 
 #include "common_runtime_test.h"  // For ScratchFile
+#include "exec_utils.h"
 #include "utils.h"
 
 namespace art {
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index f132e2737d..071cd575e1 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -5610,7 +5610,7 @@ const char* const VixlJniHelpersResults[] = {
   " 214:	ecbd 8a10 	vpop	{s16-s31}\n",
   " 218:	e8bd 8de0 	ldmia.w	sp!, {r5, r6, r7, r8, sl, fp, pc}\n",
   " 21c:	4660      	mov	r0, ip\n",
-  " 21e:	f8d9 c2a4 	ldr.w	ip, [r9, #676]	; 0x2a4\n",
+  " 21e:	f8d9 c2b4 	ldr.w	ip, [r9, #692]	; 0x2b4\n",
   " 222:	47e0      	blx	ip\n",
   nullptr
 };
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index d3b15ac8cf..a24d49e08d 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1057,6 +1057,25 @@ void X86Assembler::andpd(XmmRegister dst, const Address& src) {
 }
 
 
+void X86Assembler::shufpd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst, src);
+  EmitUint8(imm.value());
+}
+
+
+void X86Assembler::shufps(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst, src);
+  EmitUint8(imm.value());
+}
+
+
 void X86Assembler::fldl(const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xDD);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index a93616c3e5..4056ca67fb 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -472,6 +472,9 @@ class X86Assembler FINAL : public Assembler {
   void orpd(XmmRegister dst, XmmRegister src);
   void orps(XmmRegister dst, XmmRegister src);
 
+  void shufpd(XmmRegister dst, XmmRegister src, const Immediate& imm);
+  void shufps(XmmRegister dst, XmmRegister src, const Immediate& imm);
+
   void flds(const Address& src);
   void fstps(const Address& dst);
   void fsts(const Address& dst);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 4d60a12cb9..1768d8b715 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -468,51 +468,43 @@ TEST_F(AssemblerX86Test, MovupdAddr) {
 }
 
 TEST_F(AssemblerX86Test, AddPS) {
-  GetAssembler()->addps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "addps %xmm1, %xmm0\n";
-  DriverStr(expected, "addps");
+  DriverStr(RepeatFF(&x86::X86Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
 }
 
 TEST_F(AssemblerX86Test, AddPD) {
-  GetAssembler()->addpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "addpd %xmm1, %xmm0\n";
-  DriverStr(expected, "addpd");
+  DriverStr(RepeatFF(&x86::X86Assembler::addpd, "addpd %{reg2}, %{reg1}"), "addpd");
 }
 
 TEST_F(AssemblerX86Test, SubPS) {
-  GetAssembler()->subps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "subps %xmm1, %xmm0\n";
-  DriverStr(expected, "subps");
+  DriverStr(RepeatFF(&x86::X86Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
 }
 
 TEST_F(AssemblerX86Test, SubPD) {
-  GetAssembler()->subpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "subpd %xmm1, %xmm0\n";
-  DriverStr(expected, "subpd");
+  DriverStr(RepeatFF(&x86::X86Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
 }
 
 TEST_F(AssemblerX86Test, MulPS) {
-  GetAssembler()->mulps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "mulps %xmm1, %xmm0\n";
-  DriverStr(expected, "mulps");
+  DriverStr(RepeatFF(&x86::X86Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
 }
 
 TEST_F(AssemblerX86Test, MulPD) {
-  GetAssembler()->mulpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "mulpd %xmm1, %xmm0\n";
-  DriverStr(expected, "mulpd");
+  DriverStr(RepeatFF(&x86::X86Assembler::mulpd, "mulpd %{reg2}, %{reg1}"), "mulpd");
 }
 
 TEST_F(AssemblerX86Test, DivPS) {
-  GetAssembler()->divps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "divps %xmm1, %xmm0\n";
-  DriverStr(expected, "divps");
+  DriverStr(RepeatFF(&x86::X86Assembler::divps, "divps %{reg2}, %{reg1}"), "divps");
 }
 
 TEST_F(AssemblerX86Test, DivPD) {
-  GetAssembler()->divpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
-  const char* expected = "divpd %xmm1, %xmm0\n";
-  DriverStr(expected, "divpd");
+  DriverStr(RepeatFF(&x86::X86Assembler::divpd, "divpd %{reg2}, %{reg1}"), "divpd");
+}
+
+TEST_F(AssemblerX86Test, ShufPS) {
+  DriverStr(RepeatFFI(&x86::X86Assembler::shufps, 1, "shufps ${imm}, %{reg2}, %{reg1}"), "shufps");
+}
+
+TEST_F(AssemblerX86Test, ShufPD) {
+  DriverStr(RepeatFFI(&x86::X86Assembler::shufpd, 1, "shufpd ${imm}, %{reg2}, %{reg1}"), "shufpd");
 }
 
 /////////////////
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 2366b68f11..c2c44ab58c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1213,6 +1213,28 @@ void X86_64Assembler::orps(XmmRegister dst, XmmRegister src) {
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+
+void X86_64Assembler::shufpd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+  EmitUint8(imm.value());
+}
+
+
+void X86_64Assembler::shufps(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xC6);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+  EmitUint8(imm.value());
+}
+
+
 void X86_64Assembler::fldl(const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xDD);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 5923a41fe3..e140b45a00 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -495,6 +495,9 @@ class X86_64Assembler FINAL : public Assembler {
   void orpd(XmmRegister dst, XmmRegister src);
   void orps(XmmRegister dst, XmmRegister src);
 
+  void shufpd(XmmRegister dst, XmmRegister src, const Immediate& imm);
+  void shufps(XmmRegister dst, XmmRegister src, const Immediate& imm);
+
   void flds(const Address& src);
   void fstps(const Address& dst);
   void fsts(const Address& dst);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 2812c34406..efa5cc97ea 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1203,6 +1203,14 @@ TEST_F(AssemblerX86_64Test, Orpd) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::orpd, "orpd %{reg2}, %{reg1}"), "orpd");
 }
 
+TEST_F(AssemblerX86_64Test, Shufps) {
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufps, 1, "shufps ${imm}, %{reg2}, %{reg1}"), "shufps");
+}
+
+TEST_F(AssemblerX86_64Test, Shufpd) {
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufpd, 1, "shufpd ${imm}, %{reg2}, %{reg1}"), "shufpd");
+}
+
 TEST_F(AssemblerX86_64Test, UcomissAddress) {
   GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(
       x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));