Diffstat (limited to 'compiler/optimizing')
70 files changed, 9852 insertions, 4176 deletions
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index 1fc247faf1..8aefd9ea1f 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -533,9 +533,6 @@ class BCEVisitor : public HGraphVisitor { first_index_bounds_check_map_( std::less<int>(), graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), - dynamic_bce_standby_( - graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), - record_dynamic_bce_standby_(true), early_exit_loop_( std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)), @@ -560,14 +557,6 @@ class BCEVisitor : public HGraphVisitor { } void Finish() { - // Retry dynamic bce candidates on standby that are still in the graph. - record_dynamic_bce_standby_ = false; - for (HBoundsCheck* bounds_check : dynamic_bce_standby_) { - if (bounds_check->IsInBlock()) { - TryDynamicBCE(bounds_check); - } - } - // Preserve SSA structure which may have been broken by adding one or more // new taken-test structures (see TransformLoopForDeoptimizationIfNeeded()). InsertPhiNodes(); @@ -576,7 +565,6 @@ class BCEVisitor : public HGraphVisitor { early_exit_loop_.clear(); taken_test_loop_.clear(); finite_loop_.clear(); - dynamic_bce_standby_.clear(); } private: @@ -832,7 +820,6 @@ class BCEVisitor : public HGraphVisitor { array_length->IsArrayLength() || array_length->IsPhi()); bool try_dynamic_bce = true; - // Analyze index range. if (!index->IsIntConstant()) { // Non-constant index. @@ -896,10 +883,20 @@ class BCEVisitor : public HGraphVisitor { // If static analysis fails, and OOB is not certain, try dynamic elimination. if (try_dynamic_bce) { // Try loop-based dynamic elimination. - if (TryDynamicBCE(bounds_check)) { + HLoopInformation* loop = bounds_check->GetBlock()->GetLoopInformation(); + bool needs_finite_test = false; + bool needs_taken_test = false; + if (DynamicBCESeemsProfitable(loop, bounds_check->GetBlock()) && + induction_range_.CanGenerateCode( + bounds_check, index, &needs_finite_test, &needs_taken_test) && + CanHandleInfiniteLoop(loop, index, needs_finite_test) && + // Do this test last, since it may generate code. + CanHandleLength(loop, array_length, needs_taken_test)) { + TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + TransformLoopForDynamicBCE(loop, bounds_check); return; } - // Prepare dominator-based dynamic elimination. + // Otherwise, prepare dominator-based dynamic elimination. if (first_index_bounds_check_map_.find(array_length->GetId()) == first_index_bounds_check_map_.end()) { // Remember the first bounds check against each array_length. That bounds check @@ -1180,7 +1177,7 @@ class BCEVisitor : public HGraphVisitor { } } - // Perform dominator-based dynamic elimination on suitable set of bounds checks. + /** Performs dominator-based dynamic elimination on suitable set of bounds checks. */ void AddCompareWithDeoptimization(HBasicBlock* block, HInstruction* array_length, HInstruction* base, @@ -1190,6 +1187,12 @@ class BCEVisitor : public HGraphVisitor { // Construct deoptimization on single or double bounds on range [base-min_c,base+max_c], // for example either for a[0]..a[3] just 3 or for a[base-1]..a[base+3] both base-1 // and base+3, since we made the assumption any in between value may occur too. 
+ // In code, using unsigned comparisons: + // (1) constants only + // if (max_c >= a.length) deoptimize; + // (2) general case + // if (base-min_c > base+max_c) deoptimize; + // if (base+max_c >= a.length ) deoptimize; static_assert(kMaxLengthForAddingDeoptimize < std::numeric_limits<int32_t>::max(), "Incorrect max length may be subject to arithmetic wrap-around"); HInstruction* upper = GetGraph()->GetIntConstant(max_c); @@ -1208,7 +1211,7 @@ class BCEVisitor : public HGraphVisitor { has_dom_based_dynamic_bce_ = true; } - // Attempt dominator-based dynamic elimination on remaining candidates. + /** Attempts dominator-based dynamic elimination on remaining candidates. */ void AddComparesWithDeoptimization(HBasicBlock* block) { for (const auto& entry : first_index_bounds_check_map_) { HBoundsCheck* bounds_check = entry.second; @@ -1272,17 +1275,19 @@ class BCEVisitor : public HGraphVisitor { candidates.push_back(other_bounds_check); } } - // Perform dominator-based deoptimization if it seems profitable. Note that we reject cases - // where the distance min_c:max_c range gets close to the maximum possible array length, - // since those cases are likely to always deopt (such situations do not necessarily go - // OOB, though, since the programmer could rely on wrap-around from max to min). + // Perform dominator-based deoptimization if it seems profitable, where we eliminate + // bounds checks and replace these with deopt checks that guard against any possible + // OOB. Note that we reject cases where the distance min_c:max_c range gets close to + // the maximum possible array length, since those cases are likely to always deopt + // (such situations do not necessarily go OOB, though, since the array could be really + // large, or the programmer could rely on arithmetic wrap-around from max to min). size_t threshold = kThresholdForAddingDeoptimize + (base == nullptr ? 0 : 1); // extra test? uint32_t distance = static_cast<uint32_t>(max_c) - static_cast<uint32_t>(min_c); if (candidates.size() >= threshold && (base != nullptr || min_c >= 0) && // reject certain OOB distance <= kMaxLengthForAddingDeoptimize) { // reject likely/certain deopt AddCompareWithDeoptimization(block, array_length, base, min_c, max_c); - for (HInstruction* other_bounds_check : candidates) { + for (HBoundsCheck* other_bounds_check : candidates) { // Only replace if still in the graph. This avoids visiting the same // bounds check twice if it occurred multiple times in the use list. if (other_bounds_check->IsInBlock()) { @@ -1328,45 +1333,127 @@ class BCEVisitor : public HGraphVisitor { } /** - * When the compiler fails to remove a bounds check statically, we try to remove the bounds - * check dynamically by adding runtime tests that trigger a deoptimization in case bounds - * will go out of range (we want to be rather certain of that given the slowdown of - * deoptimization). If no deoptimization occurs, the loop is executed with all corresponding - * bounds checks and related null checks removed. + * Performs loop-based dynamic elimination on a bounds check. In order to minimize the + * number of eventually generated tests, related bounds checks with tests that can be + * combined with tests for the given bounds check are collected first. 
*/ - bool TryDynamicBCE(HBoundsCheck* instruction) { - HLoopInformation* loop = instruction->GetBlock()->GetLoopInformation(); - HInstruction* index = instruction->InputAt(0); - HInstruction* length = instruction->InputAt(1); - // If dynamic bounds check elimination seems profitable and is possible, then proceed. - bool needs_finite_test = false; - bool needs_taken_test = false; - if (DynamicBCESeemsProfitable(loop, instruction->GetBlock()) && - induction_range_.CanGenerateCode( - instruction, index, &needs_finite_test, &needs_taken_test) && - CanHandleInfiniteLoop(loop, instruction, index, needs_finite_test) && - CanHandleLength(loop, length, needs_taken_test)) { // do this test last (may code gen) - HInstruction* lower = nullptr; - HInstruction* upper = nullptr; - // Generate the following unsigned comparisons - // if (lower > upper) deoptimize; - // if (upper >= length) deoptimize; - // or, for a non-induction index, just the unsigned comparison on its 'upper' value - // if (upper >= length) deoptimize; - // as runtime test. By restricting dynamic bce to unit strides (with a maximum of 32-bit - // iterations) and by not combining access (e.g. a[i], a[i-3], a[i+5] etc.), these tests - // correctly guard against any possible OOB (including arithmetic wrap-around cases). - TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); - HBasicBlock* block = GetPreHeader(loop, instruction); - induction_range_.GenerateRangeCode(instruction, index, GetGraph(), block, &lower, &upper); - if (lower != nullptr) { - InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(lower, upper)); - } - InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAboveOrEqual(upper, length)); - ReplaceInstruction(instruction, index); - return true; + void TransformLoopForDynamicBCE(HLoopInformation* loop, HBoundsCheck* bounds_check) { + HInstruction* index = bounds_check->InputAt(0); + HInstruction* array_length = bounds_check->InputAt(1); + DCHECK(loop->IsDefinedOutOfTheLoop(array_length)); // pre-checked + DCHECK(loop->DominatesAllBackEdges(bounds_check->GetBlock())); + // Collect all bounds checks in the same loop that are related as "a[base + constant]" + // for a base instruction (possibly absent) and various constants. + ValueBound value = ValueBound::AsValueBound(index); + HInstruction* base = value.GetInstruction(); + int32_t min_c = base == nullptr ? 0 : value.GetConstant(); + int32_t max_c = value.GetConstant(); + ArenaVector<HBoundsCheck*> candidates( + GetGraph()->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)); + ArenaVector<HBoundsCheck*> standby( + GetGraph()->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)); + for (const HUseListNode<HInstruction*>& use : array_length->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsBoundsCheck() && loop == user->GetBlock()->GetLoopInformation()) { + HBoundsCheck* other_bounds_check = user->AsBoundsCheck(); + HInstruction* other_index = other_bounds_check->InputAt(0); + HInstruction* other_array_length = other_bounds_check->InputAt(1); + ValueBound other_value = ValueBound::AsValueBound(other_index); + int32_t other_c = other_value.GetConstant(); + if (array_length == other_array_length && base == other_value.GetInstruction()) { + // Does the current basic block dominate all back edges? If not, + // add this candidate later only if it falls into the range. 
+ if (!loop->DominatesAllBackEdges(user->GetBlock())) { + standby.push_back(other_bounds_check); + continue; + } + min_c = std::min(min_c, other_c); + max_c = std::max(max_c, other_c); + candidates.push_back(other_bounds_check); + } + } + } + // Add standby candidates that fall in selected range. + for (HBoundsCheck* other_bounds_check : standby) { + HInstruction* other_index = other_bounds_check->InputAt(0); + int32_t other_c = ValueBound::AsValueBound(other_index).GetConstant(); + if (min_c <= other_c && other_c <= max_c) { + candidates.push_back(other_bounds_check); + } + } + // Perform loop-based deoptimization if it seems profitable, where we eliminate bounds + // checks and replace these with deopt checks that guard against any possible OOB. + DCHECK_LT(0u, candidates.size()); + uint32_t distance = static_cast<uint32_t>(max_c) - static_cast<uint32_t>(min_c); + if ((base != nullptr || min_c >= 0) && // reject certain OOB + distance <= kMaxLengthForAddingDeoptimize) { // reject likely/certain deopt + HBasicBlock* block = GetPreHeader(loop, bounds_check); + HInstruction* min_lower = nullptr; + HInstruction* min_upper = nullptr; + HInstruction* max_lower = nullptr; + HInstruction* max_upper = nullptr; + // Iterate over all bounds checks. + for (HBoundsCheck* other_bounds_check : candidates) { + // Only handle if still in the graph. This avoids visiting the same + // bounds check twice if it occurred multiple times in the use list. + if (other_bounds_check->IsInBlock()) { + HInstruction* other_index = other_bounds_check->InputAt(0); + int32_t other_c = ValueBound::AsValueBound(other_index).GetConstant(); + // Generate code for either the maximum or minimum. Range analysis already was queried + // whether code generation on the original and, thus, related bounds check was possible. + // It handles either loop invariants (lower is not set) or unit strides. + if (other_c == max_c) { + induction_range_.GenerateRangeCode( + other_bounds_check, other_index, GetGraph(), block, &max_lower, &max_upper); + } else if (other_c == min_c && base != nullptr) { + induction_range_.GenerateRangeCode( + other_bounds_check, other_index, GetGraph(), block, &min_lower, &min_upper); + } + ReplaceInstruction(other_bounds_check, other_index); + } + } + // In code, using unsigned comparisons: + // (1) constants only + // if (max_upper >= a.length ) deoptimize; + // (2) two symbolic invariants + // if (min_upper > max_upper) deoptimize; unless min_c == max_c + // if (max_upper >= a.length ) deoptimize; + // (3) general case, unit strides (where lower would exceed upper for arithmetic wrap-around) + // if (min_lower > max_lower) deoptimize; unless min_c == max_c + // if (max_lower > max_upper) deoptimize; + // if (max_upper >= a.length ) deoptimize; + if (base == nullptr) { + // Constants only. + DCHECK_GE(min_c, 0); + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower == nullptr && max_upper != nullptr); + } else if (max_lower == nullptr) { + // Two symbolic invariants. + if (min_c != max_c) { + DCHECK(min_lower == nullptr && min_upper != nullptr && + max_lower == nullptr && max_upper != nullptr); + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(min_upper, max_upper)); + } else { + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower == nullptr && max_upper != nullptr); + } + } else { + // General case, unit strides. 
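Both the dominator-based guards in AddCompareWithDeoptimization above and the loop-based guards being built here rely on the same unsigned-comparison trick: a single unsigned `x >= length` test also rejects negative `x`, and the `lower > upper` test catches arithmetic wrap-around. A minimal standalone sketch of that reasoning, not ART code, with all concrete values chosen purely for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Standalone sketch (not ART code) of the guards in case (2) above, with
// hypothetical base/min_c/max_c/length values.
bool WouldDeoptimize(int32_t base, int32_t min_c, int32_t max_c, uint32_t length) {
  // Unsigned arithmetic mirrors the generated code and keeps wrap-around well defined.
  uint32_t lower = static_cast<uint32_t>(base) + static_cast<uint32_t>(min_c);  // base+min_c
  uint32_t upper = static_cast<uint32_t>(base) + static_cast<uint32_t>(max_c);  // base+max_c
  // if (base+min_c > base+max_c) deoptimize;  -- catches arithmetic wrap-around.
  if (lower > upper) {
    return true;
  }
  // if (base+max_c >= a.length) deoptimize;   -- a "negative" upper bound shows up as a
  // huge unsigned value, so this single test also rejects negative indices.
  return upper >= length;
}

int main() {
  std::printf("%d\n", WouldDeoptimize(5, -1, 3, 10));   // 0: a[4]..a[8] fit in length 10
  std::printf("%d\n", WouldDeoptimize(8, -1, 3, 10));   // 1: a[11] would be out of bounds
  std::printf("%d\n", WouldDeoptimize(-2, 0, 3, 10));   // 1: wrap-around, caught by lower > upper
  return 0;
}
```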
+ if (min_c != max_c) { + DCHECK(min_lower != nullptr && min_upper != nullptr && + max_lower != nullptr && max_upper != nullptr); + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(min_lower, max_lower)); + } else { + DCHECK(min_lower == nullptr && min_upper == nullptr && + max_lower != nullptr && max_upper != nullptr); + } + InsertDeoptInLoop(loop, block, new (GetGraph()->GetArena()) HAbove(max_lower, max_upper)); + } + InsertDeoptInLoop( + loop, block, new (GetGraph()->GetArena()) HAboveOrEqual(max_upper, array_length)); + } else { + // TODO: if rejected, avoid doing this again for subsequent instructions in this set? } - return false; } /** @@ -1474,8 +1561,7 @@ class BCEVisitor : public HGraphVisitor { * of the loop to use, dynamic bce in such cases is only allowed if other tests * ensure the loop is finite. */ - bool CanHandleInfiniteLoop( - HLoopInformation* loop, HBoundsCheck* check, HInstruction* index, bool needs_infinite_test) { + bool CanHandleInfiniteLoop(HLoopInformation* loop, HInstruction* index, bool needs_infinite_test) { if (needs_infinite_test) { // If we already forced the loop to be finite, allow directly. const uint32_t loop_id = loop->GetHeader()->GetBlockId(); @@ -1497,11 +1583,6 @@ class BCEVisitor : public HGraphVisitor { } } } - // If bounds check made it this far, it is worthwhile to check later if - // the loop was forced finite by another candidate. - if (record_dynamic_bce_standby_) { - dynamic_bce_standby_.push_back(check); - } return false; } return true; @@ -1727,10 +1808,6 @@ class BCEVisitor : public HGraphVisitor { // in a block that checks an index against that HArrayLength. ArenaSafeMap<int, HBoundsCheck*> first_index_bounds_check_map_; - // Stand by list for dynamic bce. - ArenaVector<HBoundsCheck*> dynamic_bce_standby_; - bool record_dynamic_bce_standby_; - // Early-exit loop bookkeeping. ArenaSafeMap<uint32_t, bool> early_exit_loop_; diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 4520f9b3e3..c532e72465 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -137,7 +137,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); - return pointer_size * index; + return static_cast<size_t>(pointer_size) * index; } uint32_t CodeGenerator::GetArrayLengthOffset(HArrayLength* array_length) { @@ -291,7 +291,8 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, DCHECK(!block_order.empty()); DCHECK(block_order[0] == GetGraph()->GetEntryBlock()); ComputeSpillMask(); - first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize; + first_register_slot_in_slow_path_ = RoundUp( + (number_of_out_slots + number_of_spill_slots) * kVRegSize, GetPreferredSlotsAlignment()); if (number_of_spill_slots == 0 && !HasAllocatedCalleeSaveRegisters() @@ -302,8 +303,7 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, SetFrameSize(CallPushesPC() ? 
GetWordSize() : 0); } else { SetFrameSize(RoundUp( - number_of_spill_slots * kVRegSize - + number_of_out_slots * kVRegSize + first_register_slot_in_slow_path_ + maximum_number_of_live_core_registers * GetWordSize() + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize() + FrameEntrySpillSize(), @@ -314,7 +314,8 @@ void CodeGenerator::InitializeCodeGeneration(size_t number_of_spill_slots, void CodeGenerator::CreateCommonInvokeLocationSummary( HInvoke* invoke, InvokeDexCallingConventionVisitor* visitor) { ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetArena(); - LocationSummary* locations = new (allocator) LocationSummary(invoke, LocationSummary::kCall); + LocationSummary* locations = new (allocator) LocationSummary(invoke, + LocationSummary::kCallOnMainOnly); for (size_t i = 0; i < invoke->GetNumberOfArguments(); i++) { HInstruction* input = invoke->InputAt(i); @@ -378,7 +379,7 @@ void CodeGenerator::CreateUnresolvedFieldLocationSummary( ArenaAllocator* allocator = field_access->GetBlock()->GetGraph()->GetArena(); LocationSummary* locations = - new (allocator) LocationSummary(field_access, LocationSummary::kCall); + new (allocator) LocationSummary(field_access, LocationSummary::kCallOnMainOnly); locations->AddTemp(calling_convention.GetFieldIndexLocation()); @@ -499,7 +500,7 @@ void CodeGenerator::CreateLoadClassLocationSummary(HLoadClass* cls, bool code_generator_supports_read_barrier) { ArenaAllocator* allocator = cls->GetBlock()->GetGraph()->GetArena(); LocationSummary::CallKind call_kind = cls->NeedsAccessCheck() - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : (((code_generator_supports_read_barrier && kEmitCompilerReadBarrier) || cls->CanCallRuntime()) ? LocationSummary::kCallOnSlowPath @@ -764,16 +765,24 @@ void CodeGenerator::RecordPcInfo(HInstruction* instruction, LocationSummary* locations = instruction->GetLocations(); uint32_t register_mask = locations->GetRegisterMask(); - if (locations->OnlyCallsOnSlowPath()) { - // In case of slow path, we currently set the location of caller-save registers - // to register (instead of their stack location when pushed before the slow-path - // call). Therefore register_mask contains both callee-save and caller-save - // registers that hold objects. We must remove the caller-save from the mask, since - // they will be overwritten by the callee. - register_mask &= core_callee_save_mask_; + if (instruction->IsSuspendCheck()) { + // Suspend check has special ABI that saves the caller-save registers in callee, + // so we want to emit stack maps containing the registers. + // TODO: Register allocator still reserves space for the caller-save registers. + // We should add slow-path-specific caller-save information into LocationSummary + // and refactor the code here as well as in the register allocator to use it. + } else { + if (locations->OnlyCallsOnSlowPath()) { + // In case of slow path, we currently set the location of caller-save registers + // to register (instead of their stack location when pushed before the slow-path + // call). Therefore register_mask contains both callee-save and caller-save + // registers that hold objects. We must remove the caller-save from the mask, since + // they will be overwritten by the callee. + register_mask &= core_callee_save_mask_; + } + // The register mask must be a subset of callee-save registers. + DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask); } - // The register mask must be a subset of callee-save registers. 
- DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask); stack_map_stream_.BeginStackMapEntry(outer_dex_pc, native_pc, register_mask, @@ -1173,23 +1182,23 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod << "instruction->DebugName()=" << instruction->DebugName() << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString(); } else { - DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal()) + DCHECK(instruction->GetLocations()->CallsOnSlowPath() || slow_path->IsFatal()) << "instruction->DebugName()=" << instruction->DebugName() << " slow_path->GetDescription()=" << slow_path->GetDescription(); DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()) || - // When read barriers are enabled, some instructions use a - // slow path to emit a read barrier, which does not trigger - // GC, is not fatal, nor is emitted by HDeoptimize - // instructions. + // When (non-Baker) read barriers are enabled, some instructions + // use a slow path to emit a read barrier, which does not trigger + // GC. (kEmitCompilerReadBarrier && + !kUseBakerReadBarrier && (instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet() || - instruction->IsArraySet() || instruction->IsArrayGet() || instruction->IsLoadClass() || instruction->IsLoadString() || instruction->IsInstanceOf() || - instruction->IsCheckCast()))) + instruction->IsCheckCast() || + (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified())))) << "instruction->DebugName()=" << instruction->DebugName() << " instruction->GetSideEffects().ToString()=" << instruction->GetSideEffects().ToString() << " slow_path->GetDescription()=" << slow_path->GetDescription(); @@ -1203,6 +1212,28 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod << instruction->DebugName() << ((slow_path != nullptr) ? slow_path->GetDescription() : ""); } +void CodeGenerator::ValidateInvokeRuntimeWithoutRecordingPcInfo(HInstruction* instruction, + SlowPathCode* slow_path) { + DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath()) + << "instruction->DebugName()=" << instruction->DebugName() + << " slow_path->GetDescription()=" << slow_path->GetDescription(); + // Only the Baker read barrier marking slow path used by certains + // instructions is expected to invoke the runtime without recording + // PC-related information. 
+ DCHECK(kUseBakerReadBarrier); + DCHECK(instruction->IsInstanceFieldGet() || + instruction->IsStaticFieldGet() || + instruction->IsArrayGet() || + instruction->IsLoadClass() || + instruction->IsLoadString() || + instruction->IsInstanceOf() || + instruction->IsCheckCast() || + (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified()) || + (instruction->IsInvokeStaticOrDirect() && instruction->GetLocations()->Intrinsified())) + << "instruction->DebugName()=" << instruction->DebugName() + << " slow_path->GetDescription()=" << slow_path->GetDescription(); +} + void SlowPathCode::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { RegisterSet* live_registers = locations->GetLiveRegisters(); size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath(); diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 9364be35ff..fd396c474c 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -22,6 +22,7 @@ #include "base/arena_containers.h" #include "base/arena_object.h" #include "base/bit_field.h" +#include "base/enums.h" #include "compiled_method.h" #include "driver/compiler_options.h" #include "globals.h" @@ -80,7 +81,11 @@ class SlowPathCode : public DeletableArenaObject<kArenaAllocSlowPaths> { virtual void EmitNativeCode(CodeGenerator* codegen) = 0; + // Save live core and floating-point caller-save registers and + // update the stack mask in `locations` for registers holding object + // references. virtual void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations); + // Restore live core and floating-point caller-save registers. virtual void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations); bool IsCoreRegisterSaved(int reg) const { @@ -187,7 +192,7 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { size_t GetStackSlotOfParameter(HParameterValue* parameter) const { // Note that this follows the current calling convention. return GetFrameSize() - + InstructionSetPointerSize(GetInstructionSet()) // Art method + + static_cast<size_t>(InstructionSetPointerSize(GetInstructionSet())) // Art method + parameter->GetIndex() * kVRegSize; } @@ -211,6 +216,8 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { size_t maximum_number_of_live_fpu_registers, size_t number_of_out_slots, const ArenaVector<HBasicBlock*>& block_order); + // Backends can override this as necessary. For most, no special alignment is required. + virtual uint32_t GetPreferredSlotsAlignment() const { return 1; } uint32_t GetFrameSize() const { return frame_size_; } void SetFrameSize(uint32_t size) { frame_size_ = size; } @@ -333,6 +340,9 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { bool* GetBlockedCoreRegisters() const { return blocked_core_registers_; } bool* GetBlockedFloatingPointRegisters() const { return blocked_fpu_registers_; } + bool IsBlockedCoreRegister(size_t i) { return blocked_core_registers_[i]; } + bool IsBlockedFloatingPointRegister(size_t i) { return blocked_fpu_registers_[i]; } + // Helper that returns the pointer offset of an index in an object array. // Note: this method assumes we always have the same pointer size, regardless // of the architecture. @@ -350,6 +360,17 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { // accessing the String's `value` field in String intrinsics. 
static uint32_t GetArrayDataOffset(HArrayGet* array_get); + // Return the entry point offset for ReadBarrierMarkRegX, where X is `reg`. + template <PointerSize pointer_size> + static int32_t GetReadBarrierMarkEntryPointsOffset(size_t reg) { + // The entry point list defines 30 ReadBarrierMarkRegX entry points. + DCHECK_LT(reg, 30u); + // The ReadBarrierMarkRegX entry points are ordered by increasing + // register number in Thread::tls_Ptr_.quick_entrypoints. + return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value() + + static_cast<size_t>(pointer_size) * reg; + } + void EmitParallelMoves(Location from1, Location to1, Primitive::Type type1, @@ -363,8 +384,14 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { return type == Primitive::kPrimNot && !value->IsNullConstant(); } + + // Perfoms checks pertaining to an InvokeRuntime call. void ValidateInvokeRuntime(HInstruction* instruction, SlowPathCode* slow_path); + // Perfoms checks pertaining to an InvokeRuntimeWithoutRecordingPcInfo call. + static void ValidateInvokeRuntimeWithoutRecordingPcInfo(HInstruction* instruction, + SlowPathCode* slow_path); + void AddAllocatedRegister(Location location) { allocated_registers_.Add(location); } @@ -677,7 +704,7 @@ class CallingConvention { size_t number_of_registers, const F* fpu_registers, size_t number_of_fpu_registers, - size_t pointer_size) + PointerSize pointer_size) : registers_(registers), number_of_registers_(number_of_registers), fpu_registers_(fpu_registers), @@ -700,7 +727,7 @@ class CallingConvention { size_t GetStackOffsetOf(size_t index) const { // We still reserve the space for parameters passed by registers. // Add space for the method pointer. - return pointer_size_ + index * kVRegSize; + return static_cast<size_t>(pointer_size_) + index * kVRegSize; } private: @@ -708,7 +735,7 @@ class CallingConvention { const size_t number_of_registers_; const F* fpu_registers_; const size_t number_of_fpu_registers_; - const size_t pointer_size_; + const PointerSize pointer_size_; DISALLOW_COPY_AND_ASSIGN(CallingConvention); }; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index e441f825bc..404f044cef 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -59,9 +59,9 @@ static constexpr DRegister DTMP = D31; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value() class NullCheckSlowPathARM : public SlowPathCode { public: @@ -119,11 +119,9 @@ class SuspendCheckSlowPathARM : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); arm_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ b(GetReturnLabel()); } else { @@ -316,7 +314,7 @@ class TypeCheckSlowPathARM : public SlowPathCode { instruction_->GetDexPc(), this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0)); } else { DCHECK(instruction_->IsCheckCast()); @@ -412,8 +410,8 @@ class ArraySetSlowPathARM : public SlowPathCode { // Slow path marking an object during a read barrier. class ReadBarrierMarkSlowPathARM : public SlowPathCode { public: - ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -421,9 +419,9 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -431,30 +429,45 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. 
CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - arm_codegen->Move32(out_, Location::RegisterLocation(R0)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, SP); + DCHECK_NE(reg, LR); + DCHECK_NE(reg, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(reg, IP); + DCHECK(0 <= reg && reg < kNumberOfCoreRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // R0 <- obj + // R0 <- ReadBarrierMark(R0) + // obj <- R0 + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(reg); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ b(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); @@ -500,8 +513,7 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -688,8 +700,8 @@ class ReadBarrierForRootSlowPathARM : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT inline Condition ARMCondition(IfCondition cond) { switch (cond) { @@ -956,7 +968,7 @@ void CodeGeneratorARM::GenerateFrameExit() { if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpops(start_register, POPCOUNT(fpu_spill_mask_)); - __ cfi().AdjustCFAOffset(-kArmPointerSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(-static_cast<int>(kArmPointerSize) * POPCOUNT(fpu_spill_mask_)); __ cfi().RestoreMany(DWARFReg(SRegister(0)), fpu_spill_mask_); } // Pop LR into PC to return. 
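The compact ReadBarrierMarkRegX calling convention above, together with GetReadBarrierMarkEntryPointsOffset added in code_generator.h, works because the mark entry points are laid out contiguously by register number, so the offset is simply base + pointer_size * reg. A minimal sketch of that arithmetic with a placeholder base offset (the real value comes from QUICK_ENTRYPOINT_OFFSET and the Thread entrypoint layout, which is not reproduced here):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Placeholder for QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00);
// the real value depends on the Thread::tlsPtr_ layout and is assumed here.
constexpr int32_t kAssumedMarkReg00Offset = 0x200;

constexpr int32_t ReadBarrierMarkEntryPointOffset(size_t pointer_size, size_t reg) {
  // Entry points are ordered by increasing register number, so RegX lives at
  // base + X * pointer_size.
  return kAssumedMarkReg00Offset + static_cast<int32_t>(pointer_size * reg);
}

int main() {
  // On a 32-bit target (pointer size 4), marking the object held in r5 would
  // call the entry point at base + 20.
  std::printf("0x%x\n", static_cast<unsigned>(ReadBarrierMarkEntryPointOffset(4u, 5u)));  // 0x214
  return 0;
}
```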
@@ -1208,7 +1220,7 @@ void CodeGeneratorARM::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kArmWordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kArmPointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1224,6 +1236,14 @@ void CodeGeneratorARM::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorARM::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ LoadFromOffset(kLoadWord, LR, TR, entry_point_offset); + __ blx(LR); +} + void InstructionCodeGeneratorARM::HandleGoto(HInstruction* got, HBasicBlock* successor) { DCHECK(!successor->IsExitBlock()); @@ -1271,6 +1291,44 @@ void LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } +void InstructionCodeGeneratorARM::GenerateVcmp(HInstruction* instruction) { + Primitive::Type type = instruction->InputAt(0)->GetType(); + Location lhs_loc = instruction->GetLocations()->InAt(0); + Location rhs_loc = instruction->GetLocations()->InAt(1); + if (rhs_loc.IsConstant()) { + // 0.0 is the only immediate that can be encoded directly in + // a VCMP instruction. + // + // Both the JLS (section 15.20.1) and the JVMS (section 6.5) + // specify that in a floating-point comparison, positive zero + // and negative zero are considered equal, so we can use the + // literal 0.0 for both cases here. + // + // Note however that some methods (Float.equal, Float.compare, + // Float.compareTo, Double.equal, Double.compare, + // Double.compareTo, Math.max, Math.min, StrictMath.max, + // StrictMath.min) consider 0.0 to be (strictly) greater than + // -0.0. So if we ever translate calls to these methods into a + // HCompare instruction, we must handle the -0.0 case with + // care here. + DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); + if (type == Primitive::kPrimFloat) { + __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } else { + if (type == Primitive::kPrimFloat) { + __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()), + FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } +} + void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label ATTRIBUTE_UNUSED) { @@ -1371,22 +1429,14 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condi Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in; Label* false_target = false_target_in == nullptr ? 
&fallthrough_target : false_target_in; - LocationSummary* locations = condition->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - Primitive::Type type = condition->InputAt(0)->GetType(); switch (type) { case Primitive::kPrimLong: GenerateLongComparesAndJumps(condition, true_target, false_target); break; case Primitive::kPrimFloat: - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - GenerateFPJumps(condition, true_target, false_target); - break; case Primitive::kPrimDouble: - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); + GenerateVcmp(condition); GenerateFPJumps(condition, true_target, false_target); break; default: @@ -1567,7 +1617,7 @@ void LocationsBuilderARM::HandleCondition(HCondition* cond) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, ArithmeticZeroOrFpuRegister(cond->InputAt(1))); if (!cond->IsEmittedAtUseSite()) { locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } @@ -1614,12 +1664,8 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { GenerateLongComparesAndJumps(cond, &true_label, &false_label); break; case Primitive::kPrimFloat: - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - GenerateFPJumps(cond, &true_label, &false_label); - break; case Primitive::kPrimDouble: - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); + GenerateVcmp(cond); GenerateFPJumps(cond, &true_label, &false_label); break; } @@ -1889,8 +1935,6 @@ void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) LocationSummary* locations = invoke->GetLocations(); Register temp = locations->GetTemp(0).AsRegister<Register>(); Register hidden_reg = locations->GetTemp(1).AsRegister<Register>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value(); Location receiver = locations->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); @@ -1916,10 +1960,14 @@ void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). __ MaybeUnpoisonHeapReference(temp); + __ LoadFromOffset(kLoadWord, temp, temp, + mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kArmPointerSize)); // temp = temp->GetImtEntryAt(method_offset); - uint32_t entry_point = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value(); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); + uint32_t entry_point = + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value(); // LR = temp->GetEntryPoint(); __ LoadFromOffset(kLoadWord, LR, temp, entry_point); // LR(); @@ -2012,7 +2060,7 @@ void LocationsBuilderARM::VisitTypeConversion(HTypeConversion* conversion) { (((input_type == Primitive::kPrimFloat || input_type == Primitive::kPrimDouble) && result_type == Primitive::kPrimLong) || (input_type == Primitive::kPrimLong && result_type == Primitive::kPrimFloat)) - ? LocationSummary::kCall + ? 
LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -2477,7 +2525,7 @@ void LocationsBuilderARM::VisitAdd(HAdd* add) { case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, ArmEncodableConstantOrRegister(add->InputAt(1), ADD)); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); break; } @@ -2514,13 +2562,18 @@ void InstructionCodeGeneratorARM::VisitAdd(HAdd* add) { break; case Primitive::kPrimLong: { - DCHECK(second.IsRegisterPair()); - __ adds(out.AsRegisterPairLow<Register>(), - first.AsRegisterPairLow<Register>(), - ShifterOperand(second.AsRegisterPairLow<Register>())); - __ adc(out.AsRegisterPairHigh<Register>(), - first.AsRegisterPairHigh<Register>(), - ShifterOperand(second.AsRegisterPairHigh<Register>())); + if (second.IsConstant()) { + uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant())); + GenerateAddLongConst(out, first, value); + } else { + DCHECK(second.IsRegisterPair()); + __ adds(out.AsRegisterPairLow<Register>(), + first.AsRegisterPairLow<Register>(), + ShifterOperand(second.AsRegisterPairLow<Register>())); + __ adc(out.AsRegisterPairHigh<Register>(), + first.AsRegisterPairHigh<Register>(), + ShifterOperand(second.AsRegisterPairHigh<Register>())); + } break; } @@ -2554,7 +2607,7 @@ void LocationsBuilderARM::VisitSub(HSub* sub) { case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, ArmEncodableConstantOrRegister(sub->InputAt(1), SUB)); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); break; } @@ -2590,13 +2643,18 @@ void InstructionCodeGeneratorARM::VisitSub(HSub* sub) { } case Primitive::kPrimLong: { - DCHECK(second.IsRegisterPair()); - __ subs(out.AsRegisterPairLow<Register>(), - first.AsRegisterPairLow<Register>(), - ShifterOperand(second.AsRegisterPairLow<Register>())); - __ sbc(out.AsRegisterPairHigh<Register>(), - first.AsRegisterPairHigh<Register>(), - ShifterOperand(second.AsRegisterPairHigh<Register>())); + if (second.IsConstant()) { + uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant())); + GenerateAddLongConst(out, first, -value); + } else { + DCHECK(second.IsRegisterPair()); + __ subs(out.AsRegisterPairLow<Register>(), + first.AsRegisterPairLow<Register>(), + ShifterOperand(second.AsRegisterPairLow<Register>())); + __ sbc(out.AsRegisterPairHigh<Register>(), + first.AsRegisterPairHigh<Register>(), + ShifterOperand(second.AsRegisterPairHigh<Register>())); + } break; } @@ -2831,13 +2889,13 @@ void LocationsBuilderARM::VisitDiv(HDiv* div) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; if (div->GetResultType() == Primitive::kPrimLong) { // pLdiv runtime call. - call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } else if (div->GetResultType() == Primitive::kPrimInt && div->InputAt(1)->IsConstant()) { // sdiv will be replaced by other instruction sequence. } else if (div->GetResultType() == Primitive::kPrimInt && !codegen_->GetInstructionSetFeatures().HasDivideInstruction()) { // pIdivmod runtime call. 
- call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -2956,7 +3014,7 @@ void LocationsBuilderARM::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); // Most remainders are implemented in the runtime. - LocationSummary::CallKind call_kind = LocationSummary::kCall; + LocationSummary::CallKind call_kind = LocationSummary::kCallOnMainOnly; if (rem->GetResultType() == Primitive::kPrimInt && rem->InputAt(1)->IsConstant()) { // sdiv will be replaced by other instruction sequence. call_kind = LocationSummary::kNoCall; @@ -3493,7 +3551,7 @@ void InstructionCodeGeneratorARM::VisitUShr(HUShr* ushr) { void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); } else { @@ -3510,7 +3568,7 @@ void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize); __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadWord, LR, temp, code_offset.Int32Value()); __ blx(LR); @@ -3526,7 +3584,7 @@ void InstructionCodeGeneratorARM::VisitNewInstance(HNewInstance* instruction) { void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(R0)); @@ -3634,7 +3692,7 @@ void LocationsBuilderARM::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, ArithmeticZeroOrFpuRegister(compare->InputAt(1))); locations->SetOut(Location::RequiresRegister()); break; } @@ -3679,12 +3737,7 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { __ LoadImmediate(out, 0); - if (type == Primitive::kPrimFloat) { - __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>()); - } else { - __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(right.AsFpuRegisterPairLow<SRegister>())); - } + GenerateVcmp(compare); __ vmstat(); // transfer FP status register to ARM APSR. 
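The GenerateVcmp comment above rests on the IEEE-754 rule that positive and negative zero compare equal, while sign-sensitive helpers such as Float.compare tell them apart. A small standalone C++ check of both facts (not ART code):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float pz = 0.0f;
  float nz = -0.0f;
  // Ordinary floating-point comparison (what VCMP implements): +0.0 == -0.0,
  // so comparing against the literal 0.0 covers both zeros.
  std::printf("equal: %d\n", pz == nz);  // equal: 1
  // The two zeros are still distinguishable via the sign bit, which is what
  // Float.compare/Math.min-style code cares about.
  std::printf("signbit(+0.0)=%d signbit(-0.0)=%d\n",
              static_cast<int>(std::signbit(pz)), static_cast<int>(std::signbit(nz)));  // 0 and 1
  return 0;
}
```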
less_cond = ARMFPCondition(kCondLT, compare->IsGtBias()); break; @@ -3978,6 +4031,17 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI } } +Location LocationsBuilderARM::ArithmeticZeroOrFpuRegister(HInstruction* input) { + DCHECK(input->GetType() == Primitive::kPrimDouble || input->GetType() == Primitive::kPrimFloat) + << input->GetType(); + if ((input->IsFloatConstant() && (input->AsFloatConstant()->IsArithmeticZero())) || + (input->IsDoubleConstant() && (input->AsDoubleConstant()->IsArithmeticZero()))) { + return Location::ConstantLocation(input->AsConstant()); + } else { + return Location::RequiresFpuRegister(); + } +} + Location LocationsBuilderARM::ArmEncodableConstantOrRegister(HInstruction* constant, Opcode opcode) { DCHECK(!Primitive::IsFloatingPointType(constant->GetType())); @@ -3992,31 +4056,51 @@ bool LocationsBuilderARM::CanEncodeConstantAsImmediate(HConstant* input_cst, Opcode opcode) { uint64_t value = static_cast<uint64_t>(Int64FromConstant(input_cst)); if (Primitive::Is64BitType(input_cst->GetType())) { - return CanEncodeConstantAsImmediate(Low32Bits(value), opcode) && - CanEncodeConstantAsImmediate(High32Bits(value), opcode); + Opcode high_opcode = opcode; + SetCc low_set_cc = kCcDontCare; + switch (opcode) { + case SUB: + // Flip the operation to an ADD. + value = -value; + opcode = ADD; + FALLTHROUGH_INTENDED; + case ADD: + if (Low32Bits(value) == 0u) { + return CanEncodeConstantAsImmediate(High32Bits(value), opcode, kCcDontCare); + } + high_opcode = ADC; + low_set_cc = kCcSet; + break; + default: + break; + } + return CanEncodeConstantAsImmediate(Low32Bits(value), opcode, low_set_cc) && + CanEncodeConstantAsImmediate(High32Bits(value), high_opcode, kCcDontCare); } else { return CanEncodeConstantAsImmediate(Low32Bits(value), opcode); } } -bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode) { +bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, + Opcode opcode, + SetCc set_cc) { ShifterOperand so; ArmAssembler* assembler = codegen_->GetAssembler(); - if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, &so)) { + if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, set_cc, &so)) { return true; } Opcode neg_opcode = kNoOperand; switch (opcode) { - case AND: - neg_opcode = BIC; - break; - case ORR: - neg_opcode = ORN; - break; + case AND: neg_opcode = BIC; value = ~value; break; + case ORR: neg_opcode = ORN; value = ~value; break; + case ADD: neg_opcode = SUB; value = -value; break; + case ADC: neg_opcode = SBC; value = ~value; break; + case SUB: neg_opcode = ADD; value = -value; break; + case SBC: neg_opcode = ADC; value = ~value; break; default: return false; } - return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, ~value, &so); + return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, value, set_cc, &so); } void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, @@ -4264,6 +4348,122 @@ void InstructionCodeGeneratorARM::VisitNullCheck(HNullCheck* instruction) { codegen_->GenerateNullCheck(instruction); } +static LoadOperandType GetLoadOperandType(Primitive::Type type) { + switch (type) { + case Primitive::kPrimNot: + return kLoadWord; + case Primitive::kPrimBoolean: + return kLoadUnsignedByte; + case Primitive::kPrimByte: + return kLoadSignedByte; + case Primitive::kPrimChar: + return kLoadUnsignedHalfword; + case Primitive::kPrimShort: + return kLoadSignedHalfword; + 
case Primitive::kPrimInt: + return kLoadWord; + case Primitive::kPrimLong: + return kLoadWordPair; + case Primitive::kPrimFloat: + return kLoadSWord; + case Primitive::kPrimDouble: + return kLoadDWord; + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +static StoreOperandType GetStoreOperandType(Primitive::Type type) { + switch (type) { + case Primitive::kPrimNot: + return kStoreWord; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + return kStoreByte; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + return kStoreHalfword; + case Primitive::kPrimInt: + return kStoreWord; + case Primitive::kPrimLong: + return kStoreWordPair; + case Primitive::kPrimFloat: + return kStoreSWord; + case Primitive::kPrimDouble: + return kStoreDWord; + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void CodeGeneratorARM::LoadFromShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond) { + uint32_t shift_count = Primitive::ComponentSizeShift(type); + Address mem_address(base, reg_offset, Shift::LSL, shift_count); + + switch (type) { + case Primitive::kPrimByte: + __ ldrsb(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimBoolean: + __ ldrb(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimShort: + __ ldrsh(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimChar: + __ ldrh(out_loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + __ ldr(out_loc.AsRegister<Register>(), mem_address, cond); + break; + // T32 doesn't support LoadFromShiftedRegOffset mem address mode for these types. + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void CodeGeneratorARM::StoreToShiftedRegOffset(Primitive::Type type, + Location loc, + Register base, + Register reg_offset, + Condition cond) { + uint32_t shift_count = Primitive::ComponentSizeShift(type); + Address mem_address(base, reg_offset, Shift::LSL, shift_count); + + switch (type) { + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + __ strb(loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimShort: + case Primitive::kPrimChar: + __ strh(loc.AsRegister<Register>(), mem_address, cond); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + __ str(loc.AsRegister<Register>(), mem_address, cond); + break; + // T32 doesn't support StoreToShiftedRegOffset mem address mode for these types. 
+ case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) { bool object_array_get_with_read_barrier = kEmitCompilerReadBarrier && (instruction->GetType() == Primitive::kPrimNot); @@ -4298,70 +4498,40 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Location index = locations->InAt(1); Location out_loc = locations->Out(); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); - Primitive::Type type = instruction->GetType(); - switch (type) { - case Primitive::kPrimBoolean: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>())); - __ LoadFromOffset(kLoadUnsignedByte, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimByte: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadSignedByte, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>())); - __ LoadFromOffset(kLoadSignedByte, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimShort: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ LoadFromOffset(kLoadSignedHalfword, out, IP, data_offset); - } - break; - } - - case Primitive::kPrimChar: { - Register out = out_loc.AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset); - } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ LoadFromOffset(kLoadUnsignedHalfword, out, IP, data_offset); - } - break; - } + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
+ DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: case Primitive::kPrimInt: { - Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ LoadFromOffset(kLoadWord, out, obj, offset); + int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue(); + uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type)); + + LoadOperandType load_type = GetLoadOperandType(type); + __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset); } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ LoadFromOffset(kLoadWord, out, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset); + } + temp = obj; + } else { + __ add(temp, obj, ShifterOperand(data_offset)); + } + codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>()); } break; } @@ -4390,8 +4560,22 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { // reference, if heap poisoning is enabled). codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); } else { - __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ LoadFromOffset(kLoadWord, out, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset); + } + temp = obj; + } else { + __ add(temp, obj, ShifterOperand(data_offset)); + } + codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>()); + codegen_->MaybeRecordImplicitNullCheck(instruction); // If read barriers are enabled, emit read barriers other than // Baker's using a slow path (and also unpoison the loaded @@ -4490,54 +4674,68 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + uint32_t data_offset = + mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value(); + Location value_loc = locations->InAt(2); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
+ DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); switch (value_type) { case Primitive::kPrimBoolean: - case Primitive::kPrimByte: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ StoreToOffset(kStoreByte, value, array, offset); - } else { - __ add(IP, array, ShifterOperand(index.AsRegister<Register>())); - __ StoreToOffset(kStoreByte, value, IP, data_offset); - } - break; - } - + case Primitive::kPrimByte: case Primitive::kPrimShort: - case Primitive::kPrimChar: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); + case Primitive::kPrimChar: + case Primitive::kPrimInt: { if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ StoreToOffset(kStoreHalfword, value, array, offset); + int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue(); + uint32_t full_offset = + data_offset + (const_index << Primitive::ComponentSizeShift(value_type)); + StoreOperandType store_type = GetStoreOperandType(value_type); + __ StoreToOffset(store_type, value_loc.AsRegister<Register>(), array, full_offset); } else { - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2)); - __ StoreToOffset(kStoreHalfword, value, IP, data_offset); + Register temp = IP; + + if (has_intermediate_address) { + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HIntermediateAddress* tmp = array_instr->AsIntermediateAddress(); + DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == data_offset); + } + temp = array; + } else { + __ add(temp, array, ShifterOperand(data_offset)); + } + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + temp, + index.AsRegister<Register>()); } break; } case Primitive::kPrimNot: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Location value_loc = locations->InAt(2); Register value = value_loc.AsRegister<Register>(); - Register source = value; + // TryExtractArrayAccessAddress optimization is never applied for non-primitive ArraySet. + // See the comment in instruction_simplifier_shared.cc. + DCHECK(!has_intermediate_address); if (instruction->InputAt(2)->IsNullConstant()) { // Just setting null. 
if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreToOffset(kStoreWord, source, array, offset); + __ StoreToOffset(kStoreWord, value, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, source, IP, data_offset); + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + IP, + index.AsRegister<Register>()); } codegen_->MaybeRecordImplicitNullCheck(instruction); DCHECK(!needs_write_barrier); @@ -4566,8 +4764,11 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ StoreToOffset(kStoreWord, value, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, value, IP, data_offset); + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + value_loc, + IP, + index.AsRegister<Register>()); } codegen_->MaybeRecordImplicitNullCheck(instruction); __ b(&done); @@ -4634,6 +4835,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } } + Register source = value; if (kPoisonHeapReferences) { // Note that in the case where `value` is a null reference, // we do not enter this block, as a null reference does not @@ -4650,8 +4852,12 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ StoreToOffset(kStoreWord, source, array, offset); } else { DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, source, IP, data_offset); + + __ add(IP, array, ShifterOperand(data_offset)); + codegen_->StoreToShiftedRegOffset(value_type, + Location::RegisterLocation(source), + IP, + index.AsRegister<Register>()); } if (!may_need_runtime_call_for_type_check) { @@ -4671,23 +4877,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { break; } - case Primitive::kPrimInt: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Register value = locations->InAt(2).AsRegister<Register>(); - if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreToOffset(kStoreWord, value, array, offset); - } else { - DCHECK(index.IsRegister()) << index; - __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ StoreToOffset(kStoreWord, value, IP, data_offset); - } - break; - } - case Primitive::kPrimLong: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value(); Location value = locations->InAt(2); if (index.IsConstant()) { size_t offset = @@ -4701,7 +4891,6 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } case Primitive::kPrimFloat: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value(); Location value = locations->InAt(2); DCHECK(value.IsFpuRegister()); if (index.IsConstant()) { @@ -4715,7 +4904,6 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { } case Primitive::kPrimDouble: { - uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value(); Location value = locations->InAt(2); DCHECK(value.IsFpuRegisterPair()); if (index.IsConstant()) { @@ -4756,6 +4944,37 @@ void 
InstructionCodeGeneratorARM::VisitArrayLength(HArrayLength* instruction) { codegen_->MaybeRecordImplicitNullCheck(instruction); } +void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->GetOffset())); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* instruction) { + LocationSummary* locations = instruction->GetLocations(); + Location out = locations->Out(); + Location first = locations->InAt(0); + Location second = locations->InAt(1); + + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + + if (second.IsRegister()) { + __ add(out.AsRegister<Register>(), + first.AsRegister<Register>(), + ShifterOperand(second.AsRegister<Register>())); + } else { + __ AddConstant(out.AsRegister<Register>(), + first.AsRegister<Register>(), + second.GetConstant()->AsIntConstant()->GetValue()); + } +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { LocationSummary::CallKind call_kind = instruction->CanThrowIntoCatchBlock() ? LocationSummary::kCallOnSlowPath @@ -4790,7 +5009,7 @@ void CodeGeneratorARM::MarkGCCard(Register temp, if (can_be_null) { __ CompareAndBranchIfZero(value, &is_null); } - __ LoadFromOffset(kLoadWord, card, TR, Thread::CardTableOffset<kArmWordSize>().Int32Value()); + __ LoadFromOffset(kLoadWord, card, TR, Thread::CardTableOffset<kArmPointerSize>().Int32Value()); __ Lsr(temp, object, gc::accounting::CardTable::kCardShift); __ strb(card, Address(card, temp)); if (can_be_null) { @@ -4841,7 +5060,7 @@ void InstructionCodeGeneratorARM::GenerateSuspendCheck(HSuspendCheck* instructio } __ LoadFromOffset( - kLoadUnsignedHalfword, IP, TR, Thread::ThreadFlagsOffset<kArmWordSize>().Int32Value()); + kLoadUnsignedHalfword, IP, TR, Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value()); if (successor == nullptr) { __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -5370,59 +5589,19 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) { __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address)); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); - // 16-bit LDR immediate has a 5-bit offset multiplied by the size and that gives - // a 128B range. To try and reduce the number of literals if we load multiple strings, - // simply split the dex cache address to a 128B aligned base loaded from a literal - // and the remaining offset embedded in the load. 
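For context on the (now removed) kDexCacheAddress path above: a 16-bit Thumb-2 LDR encodes only a 5-bit immediate scaled by the access size, i.e. 32 * 4 = 128 bytes of reach, so the address was split into a 128-byte-aligned base loaded from a literal and the remaining low bits folded into the load itself. A small sketch of that split, assuming 4-byte GC roots as in the deleted code (illustrative only, not ART code):

#include <cstdint>
#include <cstdio>

// Split a 32-bit address into a 128-byte-aligned base and a small offset
// that fits the 5-bit (scaled-by-4) immediate of a 16-bit LDR.
struct SplitAddress {
  uint32_t base;    // 128-byte aligned, loaded from a literal pool entry
  uint32_t offset;  // 0..124, multiple of 4, embedded in the load itself
};

static SplitAddress SplitDexCacheAddress(uint32_t address) {
  constexpr uint32_t kOffsetBits = 5 + 2;                    // 5 encoded bits, scale 2 (x4)
  constexpr uint32_t kOffsetMask = (1u << kOffsetBits) - 1;  // 0x7f -> 128B range
  return SplitAddress{address & ~kOffsetMask, address & kOffsetMask};
}

int main() {
  SplitAddress split = SplitDexCacheAddress(0x12345678u);
  std::printf("base=%#x offset=%u\n", split.base, split.offset);  // base=0x12345600 offset=120
  return 0;
}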
- static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes."); - DCHECK_ALIGNED(load->GetAddress(), 4u); - constexpr size_t offset_bits = /* encoded bits */ 5 + /* scale */ 2; - uint32_t base_address = address & ~MaxInt<uint32_t>(offset_bits); - uint32_t offset = address & MaxInt<uint32_t>(offset_bits); - __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address)); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) - GenerateGcRootFieldLoad(load, out_loc, out, offset); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - Register base_reg = locations->InAt(0).AsRegister<Register>(); - HArmDexCacheArraysBase* base = load->InputAt(0)->AsArmDexCacheArraysBase(); - int32_t offset = load->GetDexCacheElementOffset() - base->GetElementOffset(); - // /* GcRoot<mirror::String> */ out = *(dex_cache_arrays_base + offset) - GenerateGcRootFieldLoad(load, out_loc, base_reg, offset); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = locations->InAt(0).AsRegister<Register>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load); - codegen_->AddSlowPath(slow_path); - __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load); + codegen_->AddSlowPath(slow_path); + __ b(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kArmWordSize>().Int32Value(); + return Thread::ExceptionOffset<kArmPointerSize>().Int32Value(); } void LocationsBuilderARM::VisitLoadException(HLoadException* load) { @@ -5447,7 +5626,7 @@ void InstructionCodeGeneratorARM::VisitClearException(HClearException* clear ATT void LocationsBuilderARM::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -5848,7 +6027,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderARM::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6011,6 +6190,34 @@ void InstructionCodeGeneratorARM::GenerateEorConst(Register out, Register first, __ eor(out, first, ShifterOperand(value)); } +void InstructionCodeGeneratorARM::GenerateAddLongConst(Location out, + Location first, + uint64_t value) { + Register out_low = out.AsRegisterPairLow<Register>(); + Register out_high = out.AsRegisterPairHigh<Register>(); + Register first_low = first.AsRegisterPairLow<Register>(); + Register first_high = first.AsRegisterPairHigh<Register>(); + uint32_t value_low = Low32Bits(value); + uint32_t value_high = High32Bits(value); + if (value_low == 0u) { + if (out_low != first_low) { + __ mov(out_low, ShifterOperand(first_low)); + } + __ AddConstant(out_high, first_high, value_high); + return; + } + __ AddConstantSetFlags(out_low, first_low, value_low); + ShifterOperand so; + if (__ ShifterOperandCanHold(out_high, first_high, ADC, value_high, kCcDontCare, &so)) { + __ adc(out_high, first_high, so); + } else if (__ ShifterOperandCanHold(out_low, first_low, SBC, ~value_high, kCcDontCare, &so)) { + __ sbc(out_high, first_high, so); + } else { + LOG(FATAL) << "Unexpected constant " << value_high; + UNREACHABLE(); + } +} + void InstructionCodeGeneratorARM::HandleBitwiseOperation(HBinaryOperation* instruction) { LocationSummary* locations = instruction->GetLocations(); Location first = locations->InAt(0); @@ -6172,12 +6379,12 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // Slow path used to mark the GC root `root`. 
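One detail worth spelling out about the new GenerateAddLongConst helper above: after ADDS on the low words, the high word needs value_high plus the carry. When ADC cannot encode value_high as an immediate but SBC can encode its complement, the two are interchangeable, since rn - ~v - (1 - C) equals rn + v + C modulo 2^32. A tiny standalone check of that identity (plain C++, not the codegen itself):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// ADC semantics: rn + op + carry (modulo 2^32).
static uint32_t Adc(uint32_t rn, uint32_t op, uint32_t carry) {
  return rn + op + carry;
}

// SBC semantics on ARM: rn - op - (1 - carry) (modulo 2^32).
static uint32_t Sbc(uint32_t rn, uint32_t op, uint32_t carry) {
  return rn - op - (1u - carry);
}

int main() {
  // For any rn, v and carry, SBC with ~v computes the same result as ADC with v,
  // which is why the helper may pick whichever immediate form is encodable.
  for (uint32_t carry = 0; carry <= 1; ++carry) {
    for (uint32_t v : {0u, 1u, 0x000000ffu, 0x12345678u, 0xffffff00u, 0xffffffffu}) {
      uint32_t rn = 0xdeadbeefu;
      assert(Adc(rn, v, carry) == Sbc(rn, ~v, carry));
    }
  }
  return 0;
}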
SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root); codegen_->AddSlowPath(slow_path); // IP = Thread::Current()->GetIsGcMarking() __ LoadFromOffset( - kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmWordSize>().Int32Value()); + kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmPointerSize>().Int32Value()); __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { @@ -6275,21 +6482,13 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ Lsr(temp_reg, temp_reg, LockWord::kReadBarrierStateShift); - __ and_(temp_reg, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); - // Introduce a dependency on the high bits of rb_state, which shall - // be all zeroes, to prevent load-load reordering, and without using + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using // a memory barrier (which would be more expensive). - // IP = rb_state & ~LockWord::kReadBarrierStateMask = 0 - __ bic(IP, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); - // obj is unchanged by this operation, but its value now depends on - // IP, which depends on temp_reg. - __ add(obj, obj, ShifterOperand(IP)); + // `obj` is unchanged by this operation, but its value now depends + // on `temp_reg`. + __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32)); // The actual reference load. if (index.IsValid()) { @@ -6321,13 +6520,19 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmp(temp_reg, ShifterOperand(ReadBarrier::gray_ptr_)); - __ b(slow_path->GetEntryLabel(), EQ); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); + __ b(slow_path->GetEntryLabel(), CS); // Carry flag is the last bit shifted out by LSRS. 
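A note on the two tricks in the Baker read barrier fast path just above (an illustrative aside, not ART code): shifting the lock word right by kReadBarrierStateShift + 1 drops the gray bit into the carry flag, so a single LSRS plus B.CS replaces the old mask-and-compare; and adding the lock word shifted right by 32 (always zero for a 32-bit value) to the object register creates a data dependency that orders the two loads without a memory barrier. A small sketch of the state check, with the shift position assumed and the state values taken from the static_asserts in the patch (white=0, gray=1, black=2):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Assumed layout for illustration: the 2-bit read barrier state sits at this shift.
constexpr uint32_t kReadBarrierStateShift = 28;
constexpr uint32_t kWhite = 0, kGray = 1, kBlack = 2;

// What the old sequence did: shift + mask, then compare against gray.
static bool IsGrayMaskAndCompare(uint32_t lock_word) {
  uint32_t rb_state = (lock_word >> kReadBarrierStateShift) & 0x3u;
  return rb_state == kGray;
}

// What the new sequence does: LSRS by (shift + 1) leaves bit kReadBarrierStateShift,
// i.e. the low bit of the state, as the last bit shifted out (the carry flag).
// Given white=0/gray=1/black=2, that bit is set exactly when the state is gray.
static bool IsGrayCarryCheck(uint32_t lock_word) {
  uint32_t carry = (lock_word >> kReadBarrierStateShift) & 0x1u;
  return carry != 0;
}

int main() {
  for (uint32_t state : {kWhite, kGray, kBlack}) {
    uint32_t lock_word = (state << kReadBarrierStateShift) | 0x1234u;
    assert(IsGrayMaskAndCompare(lock_word) == IsGrayCarryCheck(lock_word));
  }
  return 0;
}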
__ Bind(slow_path->GetExitLabel()); } @@ -6539,7 +6744,7 @@ void CodeGeneratorARM::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, // LR = callee_method->entry_point_from_quick_compiled_code_ __ LoadFromOffset( kLoadWord, LR, callee_method.AsRegister<Register>(), - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmWordSize).Int32Value()); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value()); // LR() __ blx(LR); break; @@ -6573,7 +6778,7 @@ void CodeGeneratorARM::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp __ MaybeUnpoisonHeapReference(temp); // temp = temp->GetMethodAt(method_offset); uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kArmWordSize).Int32Value(); + kArmPointerSize).Int32Value(); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); // LR = temp->GetEntryPoint(); __ LoadFromOffset(kLoadWord, LR, temp, entry_point); @@ -6951,18 +7156,25 @@ void LocationsBuilderARM::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorARM::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kArmPointerSize).SizeValue(); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + method_offset); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kArmPointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kArmPointerSize)); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value()); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->Out().AsRegister<Register>(), + method_offset); } - __ LoadFromOffset(kLoadWord, - locations->Out().AsRegister<Register>(), - locations->InAt(0).AsRegister<Register>(), - method_offset); } #undef __ diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 477c4f18c1..5d9b2dce1c 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -17,13 +17,13 @@ #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_H_ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_H_ +#include "base/enums.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" +#include "string_reference.h" #include "parallel_move_resolver.h" #include "utils/arm/assembler_thumb2.h" -#include "utils/string_reference.h" #include "utils/type_reference.h" namespace art { @@ -32,7 +32,7 @@ namespace arm { class CodeGeneratorARM; // Use a local definition to prevent copying mistakes. 
-static constexpr size_t kArmWordSize = kArmPointerSize; +static constexpr size_t kArmWordSize = static_cast<size_t>(kArmPointerSize); static constexpr size_t kArmBitsPerWord = kArmWordSize * kBitsPerByte; static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 }; @@ -180,9 +180,10 @@ class LocationsBuilderARM : public HGraphVisitor { void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + Location ArithmeticZeroOrFpuRegister(HInstruction* input); Location ArmEncodableConstantOrRegister(HInstruction* constant, Opcode opcode); bool CanEncodeConstantAsImmediate(HConstant* input_cst, Opcode opcode); - bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode); + bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode, SetCc set_cc = kCcDontCare); CodeGeneratorARM* const codegen_; InvokeDexCallingConventionVisitorARM parameter_visitor_; @@ -219,6 +220,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateAndConst(Register out, Register first, uint32_t value); void GenerateOrrConst(Register out, Register first, uint32_t value); void GenerateEorConst(Register out, Register first, uint32_t value); + void GenerateAddLongConst(Location out, Location first, uint64_t value); void HandleBitwiseOperation(HBinaryOperation* operation); void HandleCondition(HCondition* condition); void HandleIntegerRotate(LocationSummary* locations); @@ -281,6 +283,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); + void GenerateVcmp(HInstruction* instruction); void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label); void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); @@ -365,6 +368,24 @@ class CodeGeneratorARM : public CodeGenerator { // Helper method to move a 64bits value between two locations. void Move64(Location destination, Location source); + void LoadOrStoreToOffset(Primitive::Type type, + Location loc, + Register base, + int32_t offset, + bool is_load, + Condition cond = AL); + + void LoadFromShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond = AL); + void StoreToShiftedRegOffset(Primitive::Type type, + Location out_loc, + Register base, + Register reg_offset, + Condition cond = AL); + // Generate code to invoke a runtime entry point. void InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, @@ -376,6 +397,12 @@ class CodeGeneratorARM : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + // Emit a write barrier. 
void MarkGCCard(Register temp, Register card, Register object, Register value, bool can_be_null); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index fc2c2c34aa..122c174eae 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -33,8 +33,7 @@ #include "utils/assembler.h" #include "utils/stack_checks.h" - -using namespace vixl; // NOLINT(build/namespaces) +using namespace vixl::aarch64; // NOLINT(build/namespaces) #ifdef __ #error "ARM64 Codegen VIXL macro-assembler macro already defined." @@ -132,9 +131,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type retur return ARM64ReturnLocation(return_type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() // Calculate memory accessing operand for save/restore live registers. static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, @@ -147,20 +146,20 @@ static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, codegen->GetNumberOfFloatingPointRegisters())); CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, - register_set->GetCoreRegisters() & (~callee_saved_core_registers.list())); + register_set->GetCoreRegisters() & (~callee_saved_core_registers.GetList())); CPURegList fp_list = CPURegList(CPURegister::kFPRegister, kDRegSize, - register_set->GetFloatingPointRegisters() & (~callee_saved_fp_registers.list())); + register_set->GetFloatingPointRegisters() & (~callee_saved_fp_registers.GetList())); MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); UseScratchRegisterScope temps(masm); Register base = masm->StackPointer(); - int64_t core_spill_size = core_list.TotalSizeInBytes(); - int64_t fp_spill_size = fp_list.TotalSizeInBytes(); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); int64_t reg_size = kXRegSizeInBytes; int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; uint32_t ls_access_size = WhichPowerOf2(reg_size); - if (((core_list.Count() > 1) || (fp_list.Count() > 1)) && + if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { // If the offset does not fit in the instruction's immediate field, use an alternate register // to compute the base address(float point registers spill base address). 
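For a sense of what the IsImmLSPair check above guards against (an illustrative aside, not ART code): the spill area is laid out as the core registers followed by the FP registers, so the highest-addressed pair sits at spill_offset + core_spill_size + fp_spill_size - 2 * reg_size. Assuming the usual A64 LDP/STP encoding of a signed 7-bit immediate scaled by the access size, 8-byte registers reach only -512..+504 bytes, beyond which a separately computed base register is needed. A rough sketch under those assumptions:

#include <cstdint>
#include <cstdio>

// Signed 7-bit immediate scaled by the access size (assumed LDP/STP encoding),
// so for 8-byte registers the reach is -512..+504.
static bool FitsLdpStpImmediate(int64_t offset, uint32_t access_size_log2) {
  int64_t scale = int64_t{1} << access_size_log2;
  if (offset % scale != 0) return false;
  int64_t scaled = offset / scale;
  return scaled >= -64 && scaled <= 63;
}

int main() {
  const int64_t reg_size = 8;                 // X / D registers
  const int64_t spill_offset = 400;           // hypothetical frame offset
  const int64_t core_spill_size = 10 * reg_size;
  const int64_t fp_spill_size = 8 * reg_size;
  // Offset of the highest-addressed pair in the save area.
  int64_t max_ls_pair_offset =
      spill_offset + core_spill_size + fp_spill_size - 2 * reg_size;
  std::printf("max pair offset %lld fits: %d\n",
              static_cast<long long>(max_ls_pair_offset),
              FitsLdpStpImmediate(max_ls_pair_offset, /*access_size_log2=*/3));
  return 0;
}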
@@ -399,11 +398,9 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); arm64_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -411,7 +408,7 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { } } - vixl::Label* GetReturnLabel() { + vixl::aarch64::Label* GetReturnLabel() { DCHECK(successor_ == nullptr); return &return_label_; } @@ -427,7 +424,7 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { HBasicBlock* const successor_; // If `successor_` is null, the label to branch to after the suspend check. - vixl::Label return_label_; + vixl::aarch64::Label return_label_; DISALLOW_COPY_AND_ASSIGN(SuspendCheckSlowPathARM64); }; @@ -463,7 +460,7 @@ class TypeCheckSlowPathARM64 : public SlowPathCodeARM64 { if (instruction_->IsInstanceOf()) { arm64_codegen->InvokeRuntime( QUICK_ENTRY_POINT(pInstanceofNonTrivial), instruction_, dex_pc, this); - CheckEntrypointTypes<kQuickInstanceofNonTrivial, uint32_t, + CheckEntrypointTypes<kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); @@ -567,9 +564,9 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { __ Bind(&table_start_); const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors(); for (uint32_t i = 0; i < num_entries; i++) { - vixl::Label* target_label = codegen->GetLabelOf(successors[i]); + vixl::aarch64::Label* target_label = codegen->GetLabelOf(successors[i]); DCHECK(target_label->IsBound()); - ptrdiff_t jump_offset = target_label->location() - table_start_.location(); + ptrdiff_t jump_offset = target_label->GetLocation() - table_start_.GetLocation(); DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min()); DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max()); Literal<int32_t> literal(jump_offset); @@ -580,8 +577,8 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // Slow path marking an object during a read barrier. 
class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { public: - ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location out, Location obj) - : SlowPathCodeARM64(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location obj) + : SlowPathCodeARM64(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -589,9 +586,8 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Primitive::Type type = Primitive::kPrimNot; DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg())); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(obj_.reg())); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -599,30 +595,45 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - arm64_codegen->MoveLocation(LocationFrom(calling_convention.GetRegisterAt(0)), obj_, type); - arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(obj_.reg(), LR); + DCHECK_NE(obj_.reg(), WSP); + DCHECK_NE(obj_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(obj_.reg(), IP0); + DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg(); + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in W0): + // + // W0 <- obj + // W0 <- ReadBarrierMark(W0) + // obj <- W0 + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(obj_.reg()); + // This runtime call does not require a stack map. 
+ arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ B(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); @@ -668,14 +679,12 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress())); + instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); @@ -744,10 +753,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)) << instruction_->AsInvoke()->GetIntrinsic(); DCHECK_EQ(offset_, 0U); - DCHECK(index_.IsRegisterPair()); - // UnsafeGet's offset location is a register pair, the low - // part contains the correct offset. - index = index_.ToLow(); + DCHECK(index_.IsRegister()); } } @@ -790,8 +796,8 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { private: Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) { - size_t ref = static_cast<int>(XRegisterFrom(ref_).code()); - size_t obj = static_cast<int>(XRegisterFrom(obj_).code()); + size_t ref = static_cast<int>(XRegisterFrom(ref_).GetCode()); + size_t obj = static_cast<int>(XRegisterFrom(obj_).GetCode()); for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) { if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) { return Register(VIXLRegCodeFromART(i), kXRegSize); @@ -909,8 +915,8 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, kNumberOfAllocatableRegisters, kNumberOfAllocatableFPRegisters, kNumberOfAllocatableRegisterPairs, - callee_saved_core_registers.list(), - callee_saved_fp_registers.list(), + callee_saved_core_registers.GetList(), + callee_saved_fp_registers.GetList(), compiler_options, stats), block_labels_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), @@ -1060,17 +1066,17 @@ void CodeGeneratorARM64::GenerateFrameExit() { GetAssembler()->cfi().DefCFAOffset(GetFrameSize()); } -vixl::CPURegList CodeGeneratorARM64::GetFramePreservedCoreRegisters() const { +CPURegList CodeGeneratorARM64::GetFramePreservedCoreRegisters() const { DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spill_mask_, GetNumberOfCoreRegisters(), 0, 0)); - return vixl::CPURegList(vixl::CPURegister::kRegister, vixl::kXRegSize, - core_spill_mask_); + return CPURegList(CPURegister::kRegister, kXRegSize, + core_spill_mask_); } -vixl::CPURegList CodeGeneratorARM64::GetFramePreservedFPRegisters() const { +CPURegList CodeGeneratorARM64::GetFramePreservedFPRegisters() const { DCHECK(ArtVixlRegCodeCoherentForRegSet(0, 0, fpu_spill_mask_, GetNumberOfFloatingPointRegisters())); - return vixl::CPURegList(vixl::CPURegister::kFPRegister, vixl::kDRegSize, - fpu_spill_mask_); + return 
CPURegList(CPURegister::kFPRegister, kDRegSize, + fpu_spill_mask_); } void CodeGeneratorARM64::Bind(HBasicBlock* block) { @@ -1094,11 +1100,11 @@ void CodeGeneratorARM64::MarkGCCard(Register object, Register value, bool value_ UseScratchRegisterScope temps(GetVIXLAssembler()); Register card = temps.AcquireX(); Register temp = temps.AcquireW(); // Index within the CardTable - 32bit. - vixl::Label done; + vixl::aarch64::Label done; if (value_can_be_null) { __ Cbz(value, &done); } - __ Ldr(card, MemOperand(tr, Thread::CardTableOffset<kArm64WordSize>().Int32Value())); + __ Ldr(card, MemOperand(tr, Thread::CardTableOffset<kArm64PointerSize>().Int32Value())); __ Lsr(temp, object, gc::accounting::CardTable::kCardShift); __ Strb(card, MemOperand(card, temp.X())); if (value_can_be_null) { @@ -1119,12 +1125,12 @@ void CodeGeneratorARM64::SetupBlockedRegisters() const { CPURegList reserved_core_registers = vixl_reserved_core_registers; reserved_core_registers.Combine(runtime_reserved_core_registers); while (!reserved_core_registers.IsEmpty()) { - blocked_core_registers_[reserved_core_registers.PopLowestIndex().code()] = true; + blocked_core_registers_[reserved_core_registers.PopLowestIndex().GetCode()] = true; } CPURegList reserved_fp_registers = vixl_reserved_fp_registers; while (!reserved_fp_registers.IsEmpty()) { - blocked_fpu_registers_[reserved_fp_registers.PopLowestIndex().code()] = true; + blocked_fpu_registers_[reserved_fp_registers.PopLowestIndex().GetCode()] = true; } if (GetGraph()->IsDebuggable()) { @@ -1133,7 +1139,7 @@ void CodeGeneratorARM64::SetupBlockedRegisters() const { // now, just block them. CPURegList reserved_fp_registers_debuggable = callee_saved_fp_registers; while (!reserved_fp_registers_debuggable.IsEmpty()) { - blocked_fpu_registers_[reserved_fp_registers_debuggable.PopLowestIndex().code()] = true; + blocked_fpu_registers_[reserved_fp_registers_debuggable.PopLowestIndex().GetCode()] = true; } } } @@ -1277,17 +1283,21 @@ void CodeGeneratorARM64::MoveLocation(Location destination, UseScratchRegisterScope temps(GetVIXLAssembler()); HConstant* src_cst = source.GetConstant(); CPURegister temp; - if (src_cst->IsIntConstant() || src_cst->IsNullConstant()) { - temp = temps.AcquireW(); - } else if (src_cst->IsLongConstant()) { - temp = temps.AcquireX(); - } else if (src_cst->IsFloatConstant()) { - temp = temps.AcquireS(); + if (src_cst->IsZeroBitPattern()) { + temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant()) ? xzr : wzr; } else { - DCHECK(src_cst->IsDoubleConstant()); - temp = temps.AcquireD(); + if (src_cst->IsIntConstant()) { + temp = temps.AcquireW(); + } else if (src_cst->IsLongConstant()) { + temp = temps.AcquireX(); + } else if (src_cst->IsFloatConstant()) { + temp = temps.AcquireS(); + } else { + DCHECK(src_cst->IsDoubleConstant()); + temp = temps.AcquireD(); + } + MoveConstant(temp, src_cst); } - MoveConstant(temp, src_cst); __ Str(temp, StackOperandFrom(destination)); } else { DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot()); @@ -1344,7 +1354,7 @@ void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, DCHECK(!src.IsPostIndex()); // TODO(vixl): Let the MacroAssembler handle MemOperand. - __ Add(temp_base, src.base(), OperandFromMemOperand(src)); + __ Add(temp_base, src.GetBaseRegister(), OperandFromMemOperand(src)); MemOperand base = MemOperand(temp_base); switch (type) { case Primitive::kPrimBoolean: @@ -1436,7 +1446,7 @@ void CodeGeneratorARM64::StoreRelease(Primitive::Type type, // TODO(vixl): Let the MacroAssembler handle this. 
Operand op = OperandFromMemOperand(dst); - __ Add(temp_base, dst.base(), op); + __ Add(temp_base, dst.GetBaseRegister(), op); MemOperand base = MemOperand(temp_base); switch (type) { case Primitive::kPrimBoolean: @@ -1472,7 +1482,7 @@ void CodeGeneratorARM64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kArm64WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kArm64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1489,8 +1499,17 @@ void CodeGeneratorARM64::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorARM64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + BlockPoolsScope block_pools(GetVIXLAssembler()); + __ Ldr(lr, MemOperand(tr, entry_point_offset)); + __ Blr(lr); +} + void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, - vixl::Register class_reg) { + Register class_reg) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register temp = temps.AcquireW(); size_t status_offset = mirror::Class::StatusOffset().SizeValue(); @@ -1546,7 +1565,7 @@ void InstructionCodeGeneratorARM64::GenerateSuspendCheck(HSuspendCheck* instruct UseScratchRegisterScope temps(codegen_->GetVIXLAssembler()); Register temp = temps.AcquireW(); - __ Ldrh(temp, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64WordSize>().SizeValue())); + __ Ldrh(temp, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64PointerSize>().SizeValue())); if (successor == nullptr) { __ Cbnz(temp, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1755,7 +1774,7 @@ void InstructionCodeGeneratorARM64::HandleBinaryOp(HBinaryOperation* instr) { __ Sub(dst, lhs, rhs); } else if (instr->IsRor()) { if (rhs.IsImmediate()) { - uint32_t shift = rhs.immediate() & (lhs.SizeInBits() - 1); + uint32_t shift = rhs.GetImmediate() & (lhs.GetSizeInBits() - 1); __ Ror(dst, lhs, shift); } else { // Ensure shift distance is in the same size register as the result. If @@ -1818,7 +1837,7 @@ void InstructionCodeGeneratorARM64::HandleShift(HBinaryOperation* instr) { Register lhs = InputRegisterAt(instr, 0); Operand rhs = InputOperandAt(instr, 1); if (rhs.IsImmediate()) { - uint32_t shift_value = rhs.immediate() & + uint32_t shift_value = rhs.GetImmediate() & (type == Primitive::kPrimInt ? kMaxIntShiftDistance : kMaxLongShiftDistance); if (instr->IsShl()) { __ Lsl(dst, lhs, shift_value); @@ -1828,7 +1847,7 @@ void InstructionCodeGeneratorARM64::HandleShift(HBinaryOperation* instr) { __ Lsr(dst, lhs, shift_value); } } else { - Register rhs_reg = dst.IsX() ? rhs.reg().X() : rhs.reg().W(); + Register rhs_reg = dst.IsX() ? rhs.GetRegister().X() : rhs.GetRegister().W(); if (instr->IsShl()) { __ Lsl(dst, lhs, rhs_reg); @@ -1965,9 +1984,8 @@ void InstructionCodeGeneratorARM64::VisitArm64DataProcWithShifterOp( } } -void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddress* instruction) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. +void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
DCHECK(!kEmitCompilerReadBarrier); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); @@ -1976,10 +1994,9 @@ void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddr locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorARM64::VisitArm64IntermediateAddress( - HArm64IntermediateAddress* instruction) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. +void InstructionCodeGeneratorARM64::VisitIntermediateAddress( + HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); __ Add(OutputRegister(instruction), InputRegisterAt(instruction, 0), @@ -2014,13 +2031,14 @@ void InstructionCodeGeneratorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* if (instr->GetType() == Primitive::kPrimLong && codegen_->GetInstructionSetFeatures().NeedFixCortexA53_835769()) { MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen_)->GetVIXLAssembler(); - vixl::Instruction* prev = masm->GetCursorAddress<vixl::Instruction*>() - vixl::kInstructionSize; + vixl::aarch64::Instruction* prev = + masm->GetCursorAddress<vixl::aarch64::Instruction*>() - kInstructionSize; if (prev->IsLoadOrStore()) { // Make sure we emit only exactly one nop. - vixl::CodeBufferCheckScope scope(masm, - vixl::kInstructionSize, - vixl::CodeBufferCheckScope::kCheck, - vixl::CodeBufferCheckScope::kExactSize); + vixl::aarch64::CodeBufferCheckScope scope(masm, + kInstructionSize, + vixl::aarch64::CodeBufferCheckScope::kCheck, + vixl::aarch64::CodeBufferCheckScope::kExactSize); __ nop(); } } @@ -2078,9 +2096,8 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. Register temp = temps.AcquireW(); - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. - DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress()); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!instruction->GetArray()->IsIntermediateAddress()); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -2093,15 +2110,15 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { source = HeapOperand(obj, offset); } else { Register temp = temps.AcquireSameSizeAs(obj); - if (instruction->GetArray()->IsArm64IntermediateAddress()) { + if (instruction->GetArray()->IsIntermediateAddress()) { // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in - // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. + // `TryExtractArrayAccessAddress()`. 
if (kIsDebugBuild) { - HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); + HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress(); DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), offset); } temp = obj; @@ -2185,15 +2202,15 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { } else { UseScratchRegisterScope temps(masm); Register temp = temps.AcquireSameSizeAs(array); - if (instruction->GetArray()->IsArm64IntermediateAddress()) { + if (instruction->GetArray()->IsIntermediateAddress()) { // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. + // HIntermediateAddress instruction yet. DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in - // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. + // `TryExtractArrayAccessAddress()`. if (kIsDebugBuild) { - HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); + HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress(); DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == offset); } temp = array; @@ -2209,8 +2226,8 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { codegen_->MaybeRecordImplicitNullCheck(instruction); } else { DCHECK(needs_write_barrier); - DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress()); - vixl::Label done; + DCHECK(!instruction->GetArray()->IsIntermediateAddress()); + vixl::aarch64::Label done; SlowPathCodeARM64* slow_path = nullptr; { // We use a block to end the scratch scope before the write barrier, thus @@ -2235,7 +2252,7 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathARM64(instruction); codegen_->AddSlowPath(slow_path); if (instruction->GetValueCanBeNull()) { - vixl::Label non_zero; + vixl::aarch64::Label non_zero; __ Cbnz(Register(value), &non_zero); if (!index.IsConstant()) { __ Add(temp, array, offset); @@ -2289,7 +2306,7 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { __ Cmp(temp, temp2); if (instruction->StaticTypeOfArrayIsObjectArray()) { - vixl::Label do_put; + vixl::aarch64::Label do_put; __ B(eq, &do_put); // If heap poisoning is enabled, the `temp` reference has // not been unpoisoned yet; unpoison it now. @@ -2822,11 +2839,11 @@ void InstructionCodeGeneratorARM64::VisitTryBoundary(HTryBoundary* try_boundary) void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, - vixl::Label* true_target, - vixl::Label* false_target) { + vixl::aarch64::Label* true_target, + vixl::aarch64::Label* false_target) { // FP branching requires both targets to be explicit. If either of the targets // is nullptr (fallthrough) use and bind `fallthrough_target` instead. 
- vixl::Label fallthrough_target; + vixl::aarch64::Label fallthrough_target; HInstruction* cond = instruction->InputAt(condition_input_index); if (true_target == nullptr && false_target == nullptr) { @@ -2884,7 +2901,7 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct Operand rhs = InputOperandAt(condition, 1); Condition arm64_cond; - vixl::Label* non_fallthrough_target; + vixl::aarch64::Label* non_fallthrough_target; if (true_target == nullptr) { arm64_cond = ARM64Condition(condition->GetOppositeCondition()); non_fallthrough_target = false_target; @@ -2894,7 +2911,7 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct } if ((arm64_cond == eq || arm64_cond == ne || arm64_cond == lt || arm64_cond == ge) && - rhs.IsImmediate() && (rhs.immediate() == 0)) { + rhs.IsImmediate() && (rhs.GetImmediate() == 0)) { switch (arm64_cond) { case eq: __ Cbz(lhs, non_fallthrough_target); @@ -2943,10 +2960,14 @@ void LocationsBuilderARM64::VisitIf(HIf* if_instr) { void InstructionCodeGeneratorARM64::VisitIf(HIf* if_instr) { HBasicBlock* true_successor = if_instr->IfTrueSuccessor(); HBasicBlock* false_successor = if_instr->IfFalseSuccessor(); - vixl::Label* true_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor) ? - nullptr : codegen_->GetLabelOf(true_successor); - vixl::Label* false_target = codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor) ? - nullptr : codegen_->GetLabelOf(false_successor); + vixl::aarch64::Label* true_target = codegen_->GetLabelOf(true_successor); + if (codegen_->GoesToNextBlock(if_instr->GetBlock(), true_successor)) { + true_target = nullptr; + } + vixl::aarch64::Label* false_target = codegen_->GetLabelOf(false_successor); + if (codegen_->GoesToNextBlock(if_instr->GetBlock(), false_successor)) { + false_target = nullptr; + } GenerateTestAndBranch(if_instr, /* condition_input_index */ 0, true_target, false_target); } @@ -3130,7 +3151,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - vixl::Label done, zero; + vixl::aarch64::Label done, zero; SlowPathCodeARM64* slow_path = nullptr; // Return 0 if `obj` is null. @@ -3155,7 +3176,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kAbstractClassCheck: { // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. - vixl::Label loop, success; + vixl::aarch64::Label loop, success; __ Bind(&loop); // /* HeapReference<Class> */ out = out->super_class_ GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc); @@ -3172,7 +3193,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // Walk over the class hierarchy to find a match. - vixl::Label loop, success; + vixl::aarch64::Label loop, success; __ Bind(&loop); __ Cmp(out, cls); __ B(eq, &success); @@ -3191,7 +3212,7 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { case TypeCheckKind::kArrayObjectCheck: { // Do an exact check. - vixl::Label exact_check; + vixl::aarch64::Label exact_check; __ Cmp(out, cls); __ B(eq, &exact_check); // Otherwise, we need to check that the object's class is a non-primitive array. 
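The loops emitted above for kAbstractClassCheck and kClassHierarchyCheck boil down to walking the super-class chain until the target class is found or the chain ends. A plain C++ sketch of that walk, using a hypothetical Class type rather than the runtime's mirror::Class:

#include <cassert>

// Hypothetical stand-in for a class object with a super-class pointer.
struct Class {
  const Class* super_class;
};

// What the generated hierarchy-walk loop computes: is `target` equal to
// `klass` or reachable somewhere up its super-class chain?
static bool IsSubclassOf(const Class* klass, const Class* target) {
  for (const Class* k = klass; k != nullptr; k = k->super_class) {
    if (k == target) return true;  // match found: instanceof succeeds
  }
  return false;  // walked off the top of the hierarchy: instanceof fails
}

int main() {
  Class object_class{nullptr};            // root of the hierarchy
  Class list_class{&object_class};        // List extends Object
  Class array_list_class{&list_class};    // ArrayList extends List
  assert(IsSubclassOf(&array_list_class, &object_class));
  assert(!IsSubclassOf(&object_class, &list_class));
  return 0;
}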
@@ -3328,7 +3349,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { is_type_check_slow_path_fatal); codegen_->AddSlowPath(type_check_slow_path); - vixl::Label done; + vixl::aarch64::Label done; // Avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { __ Cbz(obj, &done); @@ -3350,7 +3371,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kAbstractClassCheck: { // If the class is abstract, we eagerly fetch the super class of the // object to avoid doing a comparison we know will fail. - vixl::Label loop, compare_classes; + vixl::aarch64::Label loop, compare_classes; __ Bind(&loop); // /* HeapReference<Class> */ temp = temp->super_class_ GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc); @@ -3377,7 +3398,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kClassHierarchyCheck: { // Walk over the class hierarchy to find a match. - vixl::Label loop; + vixl::aarch64::Label loop; __ Bind(&loop); __ Cmp(temp, cls); __ B(eq, &done); @@ -3402,7 +3423,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { case TypeCheckKind::kArrayObjectCheck: { // Do an exact check. - vixl::Label check_non_primitive_component_type; + vixl::aarch64::Label check_non_primitive_component_type; __ Cmp(temp, cls); __ B(eq, &done); @@ -3506,11 +3527,9 @@ void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invok // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. LocationSummary* locations = invoke->GetLocations(); Register temp = XRegisterFrom(locations->GetTemp(0)); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kArm64PointerSize).Uint32Value(); Location receiver = locations->InAt(0); Offset class_offset = mirror::Object::ClassOffset(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); // The register ip1 is required to be used for the hidden argument in // art_quick_imt_conflict_trampoline, so prevent VIXL from using it. @@ -3537,6 +3556,10 @@ void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invok // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). GetAssembler()->MaybeUnpoisonHeapReference(temp.W()); + __ Ldr(temp, + MemOperand(temp, mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value())); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kArm64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ Ldr(temp, MemOperand(temp, method_offset)); // lr = temp->GetEntryPoint(); @@ -3626,17 +3649,17 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // Add ADRP with its PC-relative DexCache access patch. 
const DexFile& dex_file = *invoke->GetTargetMethod().dex_file; uint32_t element_offset = invoke->GetDexCacheArrayOffset(); - vixl::Label* adrp_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); + vixl::aarch64::Label* adrp_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(XRegisterFrom(temp), /* offset placeholder */ 0); } // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = + vixl::aarch64::Label* ldr_label = NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(ldr_label); __ ldr(XRegisterFrom(temp), MemOperand(XRegisterFrom(temp), /* offset placeholder */ 0)); } @@ -3658,7 +3681,7 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // /* ArtMethod*[] */ temp = temp.ptr_sized_fields_->dex_cache_resolved_methods_; __ Ldr(reg.X(), MemOperand(method_reg.X(), - ArtMethod::DexCacheResolvedMethodsOffset(kArm64WordSize).Int32Value())); + ArtMethod::DexCacheResolvedMethodsOffset(kArm64PointerSize).Int32Value())); // temp = temp[index_in_cache]; // Note: Don't use invoke->GetTargetMethod() as it may point to a different dex file. uint32_t index_in_cache = invoke->GetDexMethodIndex(); @@ -3673,8 +3696,8 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok break; case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative: { relative_call_patches_.emplace_back(invoke->GetTargetMethod()); - vixl::Label* label = &relative_call_patches_.back().label; - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + vixl::aarch64::Label* label = &relative_call_patches_.back().label; + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(label); __ bl(0); // Branch and link to itself. This will be overriden at link time. 
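In the ADRP/LDR sequences above, each NewPcRelative*Patch call records a label, and the label is bound inside a SingleEmissionCheckScope so that exactly one instruction sits at the bound location; the linker later rewrites that instruction word using the label's GetLocation(). A rough sketch of the record-then-bind pattern, with made-up names standing in for the real recorder and assembler:

    // Sketch: a patch label must pin down exactly one instruction word.
    #include <cstdint>
    #include <deque>
    #include <vector>

    struct PatchInfo {
      uint32_t target;        // e.g. dex cache element offset or string index
      uint32_t code_offset;   // filled in when the label is "bound"
    };

    struct PatchRecorder {
      std::deque<PatchInfo> patches;   // deque: element addresses stay stable as we append

      PatchInfo* NewPatch(uint32_t target) {
        patches.push_back(PatchInfo{target, 0u});
        return &patches.back();
      }
    };

    void EmitAdrpWithPatch(PatchRecorder* recorder, std::vector<uint32_t>* code, uint32_t target) {
      PatchInfo* info = recorder->NewPatch(target);
      info->code_offset = static_cast<uint32_t>(code->size() * 4u);  // "bind" here
      code->push_back(0x90000000u);  // placeholder ADRP; the linker patches this word later
    }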
break; @@ -3690,7 +3713,7 @@ void CodeGeneratorARM64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invok // LR = callee_method->entry_point_from_quick_compiled_code_; __ Ldr(lr, MemOperand( XRegisterFrom(callee_method), - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize).Int32Value())); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize).Int32Value())); // lr() __ Blr(lr); break; @@ -3710,7 +3733,7 @@ void CodeGeneratorARM64::GenerateVirtualCall(HInvokeVirtual* invoke, Location te size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kArm64PointerSize).SizeValue(); Offset class_offset = mirror::Object::ClassOffset(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); BlockPoolsScope block_pools(GetVIXLAssembler()); @@ -3733,58 +3756,64 @@ void CodeGeneratorARM64::GenerateVirtualCall(HInvokeVirtual* invoke, Location te __ Blr(lr); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeStringPatch(const DexFile& dex_file, - uint32_t string_index, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeStringPatch( + const DexFile& dex_file, + uint32_t string_index, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, string_index, adrp_label, &pc_relative_string_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeTypePatch(const DexFile& dex_file, - uint32_t type_index, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeTypePatch( + const DexFile& dex_file, + uint32_t type_index, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, type_index, adrp_label, &pc_relative_type_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, - uint32_t element_offset, - vixl::Label* adrp_label) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativeDexCacheArrayPatch( + const DexFile& dex_file, + uint32_t element_offset, + vixl::aarch64::Label* adrp_label) { return NewPcRelativePatch(dex_file, element_offset, adrp_label, &pc_relative_dex_cache_patches_); } -vixl::Label* CodeGeneratorARM64::NewPcRelativePatch(const DexFile& dex_file, - uint32_t offset_or_index, - vixl::Label* adrp_label, - ArenaDeque<PcRelativePatchInfo>* patches) { +vixl::aarch64::Label* CodeGeneratorARM64::NewPcRelativePatch( + const DexFile& dex_file, + uint32_t offset_or_index, + vixl::aarch64::Label* adrp_label, + ArenaDeque<PcRelativePatchInfo>* patches) { // Add a patch entry and return the label. patches->emplace_back(dex_file, offset_or_index); PcRelativePatchInfo* info = &patches->back(); - vixl::Label* label = &info->label; + vixl::aarch64::Label* label = &info->label; // If adrp_label is null, this is the ADRP patch and needs to point to its own label. info->pc_insn_label = (adrp_label != nullptr) ? 
adrp_label : label; return label; } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral( +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageStringLiteral( const DexFile& dex_file, uint32_t string_index) { return boot_image_string_patches_.GetOrCreate( StringReference(&dex_file, string_index), [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral( +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLiteral( const DexFile& dex_file, uint32_t type_index) { return boot_image_type_patches_.GetOrCreate( TypeReference(&dex_file, type_index), [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral(uint64_t address) { +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral( + uint64_t address) { bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_; return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral(uint64_t address) { +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral( + uint64_t address) { return DeduplicateUint64Literal(address); } @@ -3803,76 +3832,76 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc linker_patches->reserve(size); for (const auto& entry : method_patches_) { const MethodReference& target_method = entry.first; - vixl::Literal<uint64_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::MethodPatch(literal->offset(), + vixl::aarch64::Literal<uint64_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::MethodPatch(literal->GetOffset(), target_method.dex_file, target_method.dex_method_index)); } for (const auto& entry : call_patches_) { const MethodReference& target_method = entry.first; - vixl::Literal<uint64_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::CodePatch(literal->offset(), + vixl::aarch64::Literal<uint64_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::CodePatch(literal->GetOffset(), target_method.dex_file, target_method.dex_method_index)); } - for (const MethodPatchInfo<vixl::Label>& info : relative_call_patches_) { - linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.location(), + for (const MethodPatchInfo<vixl::aarch64::Label>& info : relative_call_patches_) { + linker_patches->push_back(LinkerPatch::RelativeCodePatch(info.label.GetLocation(), info.target_method.dex_file, info.target_method.dex_method_index)); } for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) { - linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.location(), + linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_string_patches_) { const StringReference& target_string = entry.first; - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::StringPatch(literal->offset(), + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + 
linker_patches->push_back(LinkerPatch::StringPatch(literal->GetOffset(), target_string.dex_file, target_string.string_index)); } for (const PcRelativePatchInfo& info : pc_relative_string_patches_) { - linker_patches->push_back(LinkerPatch::RelativeStringPatch(info.label.location(), + linker_patches->push_back(LinkerPatch::RelativeStringPatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_type_patches_) { const TypeReference& target_type = entry.first; - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::TypePatch(literal->offset(), + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::TypePatch(literal->GetOffset(), target_type.dex_file, target_type.type_index)); } for (const PcRelativePatchInfo& info : pc_relative_type_patches_) { - linker_patches->push_back(LinkerPatch::RelativeTypePatch(info.label.location(), + linker_patches->push_back(LinkerPatch::RelativeTypePatch(info.label.GetLocation(), &info.target_dex_file, - info.pc_insn_label->location(), + info.pc_insn_label->GetLocation(), info.offset_or_index)); } for (const auto& entry : boot_image_address_patches_) { DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - vixl::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::RecordPosition(literal->offset())); + vixl::aarch64::Literal<uint32_t>* literal = entry.second; + linker_patches->push_back(LinkerPatch::RecordPosition(literal->GetOffset())); } } -vixl::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateUint32Literal(uint32_t value, +vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map) { return map->GetOrCreate( value, [this, value]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(value); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(uint64_t value) { +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateUint64Literal(uint64_t value) { return uint64_literals_.GetOrCreate( value, [this, value]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(value); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( MethodReference target_method, MethodToLiteralMap* map) { return map->GetOrCreate( @@ -3880,12 +3909,12 @@ vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodLiteral( [this]() { return __ CreateLiteralDestroyedWithPool<uint64_t>(/* placeholder */ 0u); }); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodAddressLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodAddressLiteral( MethodReference target_method) { return DeduplicateMethodLiteral(target_method, &method_patches_); } -vixl::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodCodeLiteral( +vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateMethodCodeLiteral( MethodReference target_method) { return DeduplicateMethodLiteral(target_method, &call_patches_); } @@ -3959,7 +3988,7 @@ void LocationsBuilderARM64::VisitLoadClass(HLoadClass* cls) { CodeGenerator::CreateLoadClassLocationSummary( cls, LocationFrom(calling_convention.GetRegisterAt(0)), - LocationFrom(vixl::x0), + LocationFrom(vixl::aarch64::x0), /* code_generator_supports_read_barrier */ true); return; } @@ -4011,16 +4040,17 
@@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { // Add ADRP with its PC-relative type patch. const DexFile& dex_file = cls->GetDexFile(); uint32_t type_index = cls->GetTypeIndex(); - vixl::Label* adrp_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index); + vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add ADD with its PC-relative type patch. - vixl::Label* add_label = codegen_->NewPcRelativeTypePatch(dex_file, type_index, adrp_label); + vixl::aarch64::Label* add_label = + codegen_->NewPcRelativeTypePatch(dex_file, type_index, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(add_label); __ add(out.X(), out.X(), Operand(/* offset placeholder */ 0)); } @@ -4053,14 +4083,15 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { // Add ADRP with its PC-relative DexCache access patch. const DexFile& dex_file = cls->GetDexFile(); uint32_t element_offset = cls->GetDexCacheElementOffset(); - vixl::Label* adrp_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); + vixl::aarch64::Label* adrp_label = + codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = + vixl::aarch64::Label* ldr_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); // /* GcRoot<mirror::Class> */ out = *(base_address + offset) /* PC-relative */ GenerateGcRootFieldLoad(cls, out_loc, out.X(), /* offset placeholder */ 0, ldr_label); @@ -4099,7 +4130,7 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { } static MemOperand GetExceptionTlsAddress() { - return MemOperand(tr, Thread::ExceptionOffset<kArm64WordSize>().Int32Value()); + return MemOperand(tr, Thread::ExceptionOffset<kArm64PointerSize>().Int32Value()); } void LocationsBuilderARM64::VisitLoadException(HLoadException* load) { @@ -4166,7 +4197,6 @@ void LocationsBuilderARM64::VisitLoadString(HLoadString* load) { } void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { - Location out_loc = load->GetLocations()->Out(); Register out = OutputRegister(load); switch (load->GetLoadKind()) { @@ -4180,17 +4210,17 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { // Add ADRP with its PC-relative String patch. const DexFile& dex_file = load->GetDexFile(); uint32_t string_index = load->GetStringIndex(); - vixl::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index); + vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(adrp_label); __ adrp(out.X(), /* offset placeholder */ 0); } // Add ADD with its PC-relative String patch. 
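The load-class paths above and the load-string path continuing below share the same two-instruction shape: ADRP forms a 4KiB page base (reachable within roughly +/-4GiB of the PC), then either ADD materializes the final boot-image address in the register or LDR loads a GC root out of the dex cache array at that address. A small arithmetic sketch of how a target address splits into the page part consumed by ADRP and the low 12 bits consumed by the trailing ADD/LDR (the encoding limits are architectural; the helper name is made up):

    // Sketch: splitting an address into ADRP page base plus a 12-bit low part.
    #include <cstdint>

    struct AdrpSplit {
      uint64_t page;   // what ADRP produces: target with the low 12 bits cleared
      uint32_t lo12;   // what the trailing ADD/LDR encodes as its immediate
    };

    AdrpSplit SplitForAdrp(uint64_t target) {
      return AdrpSplit{target & ~UINT64_C(0xFFF), static_cast<uint32_t>(target & 0xFFF)};
    }
    // Example: target 0x71234567 -> page 0x71234000, lo12 0x567.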
- vixl::Label* add_label = + vixl::aarch64::Label* add_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index, adrp_label); { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(add_label); __ add(out.X(), out.X(), Operand(/* offset placeholder */ 0)); } @@ -4202,62 +4232,15 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(load->GetAddress())); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - // LDR immediate has a 12-bit offset multiplied by the size and for 32-bit loads - // that gives a 16KiB range. To try and reduce the number of literals if we load - // multiple strings, simply split the dex cache address to a 16KiB aligned base - // loaded from a literal and the remaining offset embedded in the load. - static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes."); - DCHECK_ALIGNED(load->GetAddress(), 4u); - constexpr size_t offset_bits = /* encoded bits */ 12 + /* scale */ 2; - uint64_t base_address = load->GetAddress() & ~MaxInt<uint64_t>(offset_bits); - uint32_t offset = load->GetAddress() & MaxInt<uint64_t>(offset_bits); - __ Ldr(out.X(), codegen_->DeduplicateDexCacheAddressLiteral(base_address)); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) - GenerateGcRootFieldLoad(load, out_loc, out.X(), offset); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - // Add ADRP with its PC-relative DexCache access patch. - const DexFile& dex_file = load->GetDexFile(); - uint32_t element_offset = load->GetDexCacheElementOffset(); - vixl::Label* adrp_label = codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset); - { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); - __ Bind(adrp_label); - __ adrp(out.X(), /* offset placeholder */ 0); - } - // Add LDR with its PC-relative DexCache access patch. - vixl::Label* ldr_label = - codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label); - // /* GcRoot<mirror::String> */ out = *(base_address + offset) /* PC-relative */ - GenerateGcRootFieldLoad(load, out_loc, out.X(), /* offset placeholder */ 0, ldr_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = InputRegisterAt(load, 0); - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex())); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load); - codegen_->AddSlowPath(slow_path); - __ Cbz(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
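For reference, the deleted kDexCacheAddress path above leaned on the LDR (immediate, unsigned offset) form: 12 encoded bits scaled by the 4-byte access size give a 16KiB range, so an absolute dex cache address was split into a 16KiB-aligned base (one deduplicated literal, shareable between loads) and an offset folded into the load. A worked version of that split, simplified from the removed code:

    // Sketch of the removed base/offset split for 32-bit GC-root loads.
    #include <cstdint>

    constexpr unsigned kOffsetBits = 12 + 2;  // 12 encoded bits, scaled by a 4-byte element

    struct DexCacheAddressSplit {
      uint64_t base;    // 16KiB-aligned; loaded from a literal
      uint32_t offset;  // low 14 bits; folded into the LDR immediate
    };

    DexCacheAddressSplit Split(uint64_t address) {
      const uint64_t mask = (UINT64_C(1) << kOffsetBits) - 1;  // 0x3FFF
      return DexCacheAddressSplit{address & ~mask, static_cast<uint32_t>(address & mask)};
    }
    // Example: address 0x12346008 -> base 0x12344000, offset 0x2008.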
+ SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderARM64::VisitLongConstant(HLongConstant* constant) { @@ -4271,7 +4254,7 @@ void InstructionCodeGeneratorARM64::VisitLongConstant(HLongConstant* constant AT void LocationsBuilderARM64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); } @@ -4369,7 +4352,7 @@ void InstructionCodeGeneratorARM64::VisitNeg(HNeg* neg) { void LocationsBuilderARM64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(LocationFrom(x0)); @@ -4394,7 +4377,7 @@ void InstructionCodeGeneratorARM64::VisitNewArray(HNewArray* instruction) { void LocationsBuilderARM64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(LocationFrom(kArtMethodRegister)); @@ -4411,7 +4394,7 @@ void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. Location temp = instruction->GetLocations()->GetTemp(0); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); __ Ldr(XRegisterFrom(temp), MemOperand(tr, QUICK_ENTRY_POINT(pNewEmptyString))); __ Ldr(lr, MemOperand(XRegisterFrom(temp), code_offset.Int32Value())); __ Blr(lr); @@ -4450,7 +4433,7 @@ void LocationsBuilderARM64::VisitBooleanNot(HBooleanNot* instruction) { } void InstructionCodeGeneratorARM64::VisitBooleanNot(HBooleanNot* instruction) { - __ Eor(OutputRegister(instruction), InputRegisterAt(instruction, 0), vixl::Operand(1)); + __ Eor(OutputRegister(instruction), InputRegisterAt(instruction, 0), vixl::aarch64::Operand(1)); } void LocationsBuilderARM64::VisitNullCheck(HNullCheck* instruction) { @@ -4547,7 +4530,8 @@ void InstructionCodeGeneratorARM64::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED) void LocationsBuilderARM64::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - Primitive::IsFloatingPointType(type) ? LocationSummary::kCall : LocationSummary::kNoCall; + Primitive::IsFloatingPointType(type) ? 
LocationSummary::kCallOnMainOnly + : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -4764,7 +4748,7 @@ void InstructionCodeGeneratorARM64::VisitSuspendCheck(HSuspendCheck* instruction void LocationsBuilderARM64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); } @@ -4882,7 +4866,7 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst HBasicBlock* default_block = switch_instr->GetDefaultBlock(); // Roughly set 16 as max average assemblies generated per HIR in a graph. - static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * vixl::kInstructionSize; + static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * kInstructionSize; // ADR has a limited range(+/-1MB), so we set a threshold for the number of HIRs in the graph to // make sure we don't emit it if the target may run out of range. // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR @@ -5027,9 +5011,9 @@ void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters(HInstructi void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instruction, Location root, - vixl::Register obj, + Register obj, uint32_t offset, - vixl::Label* fixup_label) { + vixl::aarch64::Label* fixup_label) { Register root_reg = RegisterFrom(root, Primitive::kPrimNot); if (kEmitCompilerReadBarrier) { if (kUseBakerReadBarrier) { @@ -5045,7 +5029,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ ldr(root_reg, MemOperand(obj, offset)); } @@ -5059,14 +5043,14 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru // Slow path used to mark the GC root `root`. 
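The root-marking code that follows only takes the slow path when the thread-local is-GC-marking flag is set: the root is loaded on the fast path, and a single conditional branch on Thread::Current()->GetIsGcMarking() decides whether ReadBarrier::Mark must run. A stand-in sketch of that shape (the types, flag, and slow-path function here are simplified placeholders, not the runtime's API):

    // Sketch: fast-path GC root load guarded by the per-thread marking flag.
    #include <atomic>

    struct ThreadStub { std::atomic<unsigned> is_gc_marking{0}; };

    void* MarkGcRootSlowPath(void* root) { return root; }  // placeholder; real code calls the runtime

    void* LoadGcRoot(ThreadStub* self, void** root_slot) {
      void* root = *root_slot;                                          // ldr root_reg, [obj, #offset]
      if (self->is_gc_marking.load(std::memory_order_relaxed) != 0u) {  // cbnz temp, slow_path
        root = MarkGcRootSlowPath(root);                                // may return the to-space reference
      }
      return root;
    }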
SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root); codegen_->AddSlowPath(slow_path); MacroAssembler* masm = GetVIXLAssembler(); UseScratchRegisterScope temps(masm); Register temp = temps.AcquireW(); // temp = Thread::Current()->GetIsGcMarking() - __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64WordSize>().Int32Value())); + __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value())); __ Cbnz(temp, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { @@ -5076,7 +5060,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Add(root_reg.X(), obj.X(), offset); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ add(root_reg.X(), obj.X(), offset); } @@ -5089,7 +5073,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); } else { - vixl::SingleEmissionCheckScope guard(GetVIXLAssembler()); + SingleEmissionCheckScope guard(GetVIXLAssembler()); __ Bind(fixup_label); __ ldr(root_reg, MemOperand(obj, offset)); } @@ -5100,7 +5084,7 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t offset, Register temp, bool needs_null_check, @@ -5124,7 +5108,7 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t data_offset, Location index, Register temp, @@ -5155,7 +5139,7 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + Register obj, uint32_t offset, Location index, size_t scale_factor, @@ -5204,23 +5188,13 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ Lsr(temp, temp, LockWord::kReadBarrierStateShift); - __ And(temp, temp, Operand(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); - // Introduce a dependency on the high bits of rb_state, which shall - // be all zeroes, to prevent load-load reordering, and without using + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using // a memory barrier (which would be more expensive). - // temp2 = rb_state & ~LockWord::kReadBarrierStateMask = 0 - Register temp2 = temps.AcquireW(); - __ Bic(temp2, temp, Operand(LockWord::kReadBarrierStateMask)); - // obj is unchanged by this operation, but its value now depends on - // temp2, which depends on temp. 
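The replacement emitted as this hunk continues below does the same job with a single temp: the lock word is loaded zero-extended into a W register, so its upper 32 bits viewed as an X register are zero, and adding lock_word >> 32 to obj leaves the address unchanged while making later loads from obj data-dependent on the lock-word load. The gray check then collapses to one TBNZ, since the static_asserts below pin white/gray to 0/1 and only the low read-barrier-state bit matters. A scalar sketch of the two tricks (the bit position is a stand-in, not necessarily the real LockWord layout):

    // Sketch: address dependency via a known-zero addend, plus a one-bit gray test.
    #include <cstdint>

    constexpr unsigned kRbStateShiftSketch = 28;  // stand-in bit position

    struct SketchResult { uintptr_t dependent_obj; bool is_gray; };

    SketchResult CheckLockWord(uintptr_t obj, uint64_t lock_word /* 32-bit value, zero-extended */) {
      // High half of the zero-extended lock word is 0, so this add is a no-op that still
      // orders the lock-word load before later loads from obj, without a memory barrier.
      uintptr_t dependent_obj = obj + static_cast<uintptr_t>(lock_word >> 32);
      // With white == 0 and gray == 1, testing a single bit is enough.
      bool is_gray = ((lock_word >> kRbStateShiftSketch) & 1u) != 0u;
      return SketchResult{dependent_obj, is_gray};
    }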
- __ Add(obj, obj, Operand(temp2)); - temps.Release(temp2); + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)); // The actual reference load. if (index.IsValid()) { @@ -5246,7 +5220,7 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor); Load(type, ref_reg, HeapOperand(obj, computed_offset)); } else { - temp2 = temps.AcquireW(); + Register temp2 = temps.AcquireW(); __ Add(temp2, obj, offset); Load(type, ref_reg, HeapOperand(temp2, XRegisterFrom(index), LSL, scale_factor)); temps.Release(temp2); @@ -5267,13 +5241,16 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // Slow path used to mark the object `ref` when it is gray. SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ Cmp(temp, ReadBarrier::gray_ptr_); - __ B(eq, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } @@ -5348,16 +5325,19 @@ void LocationsBuilderARM64::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorARM64::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kArm64PointerSize).SizeValue(); + __ Ldr(XRegisterFrom(locations->Out()), + MemOperand(XRegisterFrom(locations->InAt(0)), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kArm64PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kArm64PointerSize)); + __ Ldr(XRegisterFrom(locations->Out()), MemOperand(XRegisterFrom(locations->InAt(0)), + mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value())); + __ Ldr(XRegisterFrom(locations->Out()), + MemOperand(XRegisterFrom(locations->Out()), method_offset)); } - __ Ldr(XRegisterFrom(locations->Out()), - MemOperand(XRegisterFrom(locations->InAt(0)), method_offset)); } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index d4bf695602..921ce10aaa 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -20,15 +20,19 @@ #include "arch/arm64/quick_method_frame_info_arm64.h" #include "code_generator.h" #include "common_arm64.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "string_reference.h" #include "utils/arm64/assembler_arm64.h" -#include 
"utils/string_reference.h" #include "utils/type_reference.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" + +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#pragma GCC diagnostic pop namespace art { namespace arm64 { @@ -36,34 +40,49 @@ namespace arm64 { class CodeGeneratorARM64; // Use a local definition to prevent copying mistakes. -static constexpr size_t kArm64WordSize = kArm64PointerSize; - -static const vixl::Register kParameterCoreRegisters[] = { - vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 +static constexpr size_t kArm64WordSize = static_cast<size_t>(kArm64PointerSize); + +static const vixl::aarch64::Register kParameterCoreRegisters[] = { + vixl::aarch64::x1, + vixl::aarch64::x2, + vixl::aarch64::x3, + vixl::aarch64::x4, + vixl::aarch64::x5, + vixl::aarch64::x6, + vixl::aarch64::x7 }; static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters); -static const vixl::FPRegister kParameterFPRegisters[] = { - vixl::d0, vixl::d1, vixl::d2, vixl::d3, vixl::d4, vixl::d5, vixl::d6, vixl::d7 +static const vixl::aarch64::FPRegister kParameterFPRegisters[] = { + vixl::aarch64::d0, + vixl::aarch64::d1, + vixl::aarch64::d2, + vixl::aarch64::d3, + vixl::aarch64::d4, + vixl::aarch64::d5, + vixl::aarch64::d6, + vixl::aarch64::d7 }; static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters); -const vixl::Register tr = vixl::x19; // Thread Register -static const vixl::Register kArtMethodRegister = vixl::x0; // Method register on invoke. - -const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1); -const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31); +// Thread Register +const vixl::aarch64::Register tr = vixl::aarch64::x19; +// Method register on invoke. 
+static const vixl::aarch64::Register kArtMethodRegister = vixl::aarch64::x0; +const vixl::aarch64::CPURegList vixl_reserved_core_registers(vixl::aarch64::ip0, + vixl::aarch64::ip1); +const vixl::aarch64::CPURegList vixl_reserved_fp_registers(vixl::aarch64::d31); -const vixl::CPURegList runtime_reserved_core_registers(tr, vixl::lr); +const vixl::aarch64::CPURegList runtime_reserved_core_registers(tr, vixl::aarch64::lr); // Callee-saved registers AAPCS64 (without x19 - Thread Register) -const vixl::CPURegList callee_saved_core_registers(vixl::CPURegister::kRegister, - vixl::kXRegSize, - vixl::x20.code(), - vixl::x30.code()); -const vixl::CPURegList callee_saved_fp_registers(vixl::CPURegister::kFPRegister, - vixl::kDRegSize, - vixl::d8.code(), - vixl::d15.code()); +const vixl::aarch64::CPURegList callee_saved_core_registers(vixl::aarch64::CPURegister::kRegister, + vixl::aarch64::kXRegSize, + vixl::aarch64::x20.GetCode(), + vixl::aarch64::x30.GetCode()); +const vixl::aarch64::CPURegList callee_saved_fp_registers(vixl::aarch64::CPURegister::kFPRegister, + vixl::aarch64::kDRegSize, + vixl::aarch64::d8.GetCode(), + vixl::aarch64::d15.GetCode()); Location ARM64ReturnLocation(Primitive::Type return_type); class SlowPathCodeARM64 : public SlowPathCode { @@ -71,15 +90,15 @@ class SlowPathCodeARM64 : public SlowPathCode { explicit SlowPathCodeARM64(HInstruction* instruction) : SlowPathCode(instruction), entry_label_(), exit_label_() {} - vixl::Label* GetEntryLabel() { return &entry_label_; } - vixl::Label* GetExitLabel() { return &exit_label_; } + vixl::aarch64::Label* GetEntryLabel() { return &entry_label_; } + vixl::aarch64::Label* GetExitLabel() { return &exit_label_; } void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) OVERRIDE; void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) OVERRIDE; private: - vixl::Label entry_label_; - vixl::Label exit_label_; + vixl::aarch64::Label entry_label_; + vixl::aarch64::Label exit_label_; DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64); }; @@ -89,27 +108,42 @@ class JumpTableARM64 : public DeletableArenaObject<kArenaAllocSwitchTable> { explicit JumpTableARM64(HPackedSwitch* switch_instr) : switch_instr_(switch_instr), table_start_() {} - vixl::Label* GetTableStartLabel() { return &table_start_; } + vixl::aarch64::Label* GetTableStartLabel() { return &table_start_; } void EmitTable(CodeGeneratorARM64* codegen); private: HPackedSwitch* const switch_instr_; - vixl::Label table_start_; + vixl::aarch64::Label table_start_; DISALLOW_COPY_AND_ASSIGN(JumpTableARM64); }; -static const vixl::Register kRuntimeParameterCoreRegisters[] = - { vixl::x0, vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 }; +static const vixl::aarch64::Register kRuntimeParameterCoreRegisters[] = + { vixl::aarch64::x0, + vixl::aarch64::x1, + vixl::aarch64::x2, + vixl::aarch64::x3, + vixl::aarch64::x4, + vixl::aarch64::x5, + vixl::aarch64::x6, + vixl::aarch64::x7 }; static constexpr size_t kRuntimeParameterCoreRegistersLength = arraysize(kRuntimeParameterCoreRegisters); -static const vixl::FPRegister kRuntimeParameterFpuRegisters[] = - { vixl::d0, vixl::d1, vixl::d2, vixl::d3, vixl::d4, vixl::d5, vixl::d6, vixl::d7 }; +static const vixl::aarch64::FPRegister kRuntimeParameterFpuRegisters[] = + { vixl::aarch64::d0, + vixl::aarch64::d1, + vixl::aarch64::d2, + vixl::aarch64::d3, + vixl::aarch64::d4, + vixl::aarch64::d5, + vixl::aarch64::d6, + vixl::aarch64::d7 }; static constexpr size_t kRuntimeParameterFpuRegistersLength = 
arraysize(kRuntimeParameterCoreRegisters); -class InvokeRuntimeCallingConvention : public CallingConvention<vixl::Register, vixl::FPRegister> { +class InvokeRuntimeCallingConvention : public CallingConvention<vixl::aarch64::Register, + vixl::aarch64::FPRegister> { public: static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters); @@ -126,7 +160,8 @@ class InvokeRuntimeCallingConvention : public CallingConvention<vixl::Register, DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention); }; -class InvokeDexCallingConvention : public CallingConvention<vixl::Register, vixl::FPRegister> { +class InvokeDexCallingConvention : public CallingConvention<vixl::aarch64::Register, + vixl::aarch64::FPRegister> { public: InvokeDexCallingConvention() : CallingConvention(kParameterCoreRegisters, @@ -166,23 +201,23 @@ class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention { FieldAccessCallingConventionARM64() {} Location GetObjectLocation() const OVERRIDE { - return helpers::LocationFrom(vixl::x1); + return helpers::LocationFrom(vixl::aarch64::x1); } Location GetFieldIndexLocation() const OVERRIDE { - return helpers::LocationFrom(vixl::x0); + return helpers::LocationFrom(vixl::aarch64::x0); } Location GetReturnLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { - return helpers::LocationFrom(vixl::x0); + return helpers::LocationFrom(vixl::aarch64::x0); } Location GetSetValueLocation(Primitive::Type type, bool is_instance) const OVERRIDE { return Primitive::Is64BitType(type) - ? helpers::LocationFrom(vixl::x2) + ? helpers::LocationFrom(vixl::aarch64::x2) : (is_instance - ? helpers::LocationFrom(vixl::x2) - : helpers::LocationFrom(vixl::x1)); + ? helpers::LocationFrom(vixl::aarch64::x2) + : helpers::LocationFrom(vixl::aarch64::x1)); } Location GetFpuLocation(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { - return helpers::LocationFrom(vixl::d0); + return helpers::LocationFrom(vixl::aarch64::d0); } private: @@ -208,10 +243,11 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { } Arm64Assembler* GetAssembler() const { return assembler_; } - vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; } + vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } private: - void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, vixl::Register class_reg); + void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, + vixl::aarch64::Register class_reg); void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor); void HandleBinaryOp(HBinaryOperation* instr); @@ -256,9 +292,9 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { // while honoring read barriers (if any). void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, - vixl::Label* fixup_label = nullptr); + vixl::aarch64::Label* fixup_label = nullptr); // Generate a floating-point comparison. 
void GenerateFcmp(HInstruction* instruction); @@ -266,8 +302,8 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void HandleShift(HBinaryOperation* instr); void GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, - vixl::Label* true_target, - vixl::Label* false_target); + vixl::aarch64::Label* true_target, + vixl::aarch64::Label* false_target); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -327,12 +363,12 @@ class ParallelMoveResolverARM64 : public ParallelMoveResolverNoSwap { private: Arm64Assembler* GetAssembler() const; - vixl::MacroAssembler* GetVIXLAssembler() const { - return GetAssembler()->vixl_masm_; + vixl::aarch64::MacroAssembler* GetVIXLAssembler() const { + return GetAssembler()->GetVIXLAssembler(); } CodeGeneratorARM64* const codegen_; - vixl::UseScratchRegisterScope vixl_temps_; + vixl::aarch64::UseScratchRegisterScope vixl_temps_; DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM64); }; @@ -348,12 +384,12 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateFrameEntry() OVERRIDE; void GenerateFrameExit() OVERRIDE; - vixl::CPURegList GetFramePreservedCoreRegisters() const; - vixl::CPURegList GetFramePreservedFPRegisters() const; + vixl::aarch64::CPURegList GetFramePreservedCoreRegisters() const; + vixl::aarch64::CPURegList GetFramePreservedFPRegisters() const; void Bind(HBasicBlock* block) OVERRIDE; - vixl::Label* GetLabelOf(HBasicBlock* block) { + vixl::aarch64::Label* GetLabelOf(HBasicBlock* block) { block = FirstNonEmptyBlock(block); return &(block_labels_[block->GetBlockId()]); } @@ -368,19 +404,21 @@ class CodeGeneratorARM64 : public CodeGenerator { } uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE { - vixl::Label* block_entry_label = GetLabelOf(block); + vixl::aarch64::Label* block_entry_label = GetLabelOf(block); DCHECK(block_entry_label->IsBound()); - return block_entry_label->location(); + return block_entry_label->GetLocation(); } HGraphVisitor* GetLocationBuilder() OVERRIDE { return &location_builder_; } HGraphVisitor* GetInstructionVisitor() OVERRIDE { return &instruction_visitor_; } Arm64Assembler* GetAssembler() OVERRIDE { return &assembler_; } const Arm64Assembler& GetAssembler() const OVERRIDE { return assembler_; } - vixl::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; } + vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } // Emit a write barrier. - void MarkGCCard(vixl::Register object, vixl::Register value, bool value_can_be_null); + void MarkGCCard(vixl::aarch64::Register object, + vixl::aarch64::Register value, + bool value_can_be_null); void GenerateMemoryBarrier(MemBarrierKind kind); @@ -399,8 +437,8 @@ class CodeGeneratorARM64 : public CodeGenerator { // (xzr, wzr), or make for poor allocatable registers (sp alignment // requirements, etc.). This also facilitates our task as all other registers // can easily be mapped via to or from their type and index or code. 
- static const int kNumberOfAllocatableRegisters = vixl::kNumberOfRegisters - 1; - static const int kNumberOfAllocatableFPRegisters = vixl::kNumberOfFPRegisters; + static const int kNumberOfAllocatableRegisters = vixl::aarch64::kNumberOfRegisters - 1; + static const int kNumberOfAllocatableFPRegisters = vixl::aarch64::kNumberOfFPRegisters; static constexpr int kNumberOfAllocatableRegisterPairs = 0; void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; @@ -418,6 +456,10 @@ class CodeGeneratorARM64 : public CodeGenerator { block_labels_.resize(GetGraph()->GetBlocks().size()); } + // We want to use the STP and LDP instructions to spill and restore registers for slow paths. + // These instructions can only encode offsets that are multiples of the register size accessed. + uint32_t GetPreferredSlotsAlignment() const OVERRIDE { return vixl::aarch64::kXRegSizeInBytes; } + JumpTableARM64* CreateJumpTable(HPackedSwitch* switch_instr) { jump_tables_.emplace_back(new (GetGraph()->GetArena()) JumpTableARM64(switch_instr)); return jump_tables_.back().get(); @@ -426,18 +468,24 @@ class CodeGeneratorARM64 : public CodeGenerator { void Finalize(CodeAllocator* allocator) OVERRIDE; // Code generation helpers. - void MoveConstant(vixl::CPURegister destination, HConstant* constant); + void MoveConstant(vixl::aarch64::CPURegister destination, HConstant* constant); void MoveConstant(Location destination, int32_t value) OVERRIDE; void MoveLocation(Location dst, Location src, Primitive::Type dst_type) OVERRIDE; void AddLocationAsTemp(Location location, LocationSummary* locations) OVERRIDE; - void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src); - void Store(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); + void Load(Primitive::Type type, + vixl::aarch64::CPURegister dst, + const vixl::aarch64::MemOperand& src); + void Store(Primitive::Type type, + vixl::aarch64::CPURegister src, + const vixl::aarch64::MemOperand& dst); void LoadAcquire(HInstruction* instruction, - vixl::CPURegister dst, - const vixl::MemOperand& src, + vixl::aarch64::CPURegister dst, + const vixl::aarch64::MemOperand& src, bool needs_null_check); - void StoreRelease(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); + void StoreRelease(Primitive::Type type, + vixl::aarch64::CPURegister src, + const vixl::aarch64::MemOperand& dst); // Generate code to invoke a runtime entry point. void InvokeRuntime(QuickEntrypointEnum entrypoint, @@ -450,6 +498,12 @@ class CodeGeneratorARM64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + ParallelMoveResolverARM64* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { @@ -484,32 +538,33 @@ class CodeGeneratorARM64 : public CodeGenerator { // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). 
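The GetPreferredSlotsAlignment override above asks for 8-byte-aligned spill slots because the LDP/STP immediate is a signed 7-bit value scaled by the access size: only offsets that are multiples of the register size (and within the imm7 range) are encodable, so unaligned slots would force the slow single-register save/restore path. A quick check of which offsets a 64-bit STP can reach (the encoding limits are architectural facts; the helper is illustrative):

    // Sketch: encodable offsets for STP/LDP of two X registers (signed 7-bit, scaled by 8).
    #include <cstdint>

    bool IsEncodableStpOffsetX(int64_t byte_offset) {
      if (byte_offset % 8 != 0) return false;   // must be a multiple of the register size
      int64_t scaled = byte_offset / 8;
      return scaled >= -64 && scaled <= 63;     // imm7 range: -512..504 bytes
    }
    // e.g. IsEncodableStpOffsetX(16) == true, IsEncodableStpOffsetX(12) == false.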
- vixl::Label* NewPcRelativeStringPatch(const DexFile& dex_file, - uint32_t string_index, - vixl::Label* adrp_label = nullptr); + vixl::aarch64::Label* NewPcRelativeStringPatch(const DexFile& dex_file, + uint32_t string_index, + vixl::aarch64::Label* adrp_label = nullptr); // Add a new PC-relative type patch for an instruction and return the label // to be bound before the instruction. The instruction will be either the // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing // to the associated ADRP patch label). - vixl::Label* NewPcRelativeTypePatch(const DexFile& dex_file, - uint32_t type_index, - vixl::Label* adrp_label = nullptr); + vixl::aarch64::Label* NewPcRelativeTypePatch(const DexFile& dex_file, + uint32_t type_index, + vixl::aarch64::Label* adrp_label = nullptr); // Add a new PC-relative dex cache array patch for an instruction and return // the label to be bound before the instruction. The instruction will be // either the ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label` // pointing to the associated ADRP patch label). - vixl::Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, - uint32_t element_offset, - vixl::Label* adrp_label = nullptr); - - vixl::Literal<uint32_t>* DeduplicateBootImageStringLiteral(const DexFile& dex_file, - uint32_t string_index); - vixl::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, - uint32_t type_index); - vixl::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address); - vixl::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address); + vixl::aarch64::Label* NewPcRelativeDexCacheArrayPatch( + const DexFile& dex_file, + uint32_t element_offset, + vixl::aarch64::Label* adrp_label = nullptr); + + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageStringLiteral(const DexFile& dex_file, + uint32_t string_index); + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, + uint32_t type_index); + vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address); + vixl::aarch64::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address); void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; @@ -517,29 +572,29 @@ class CodeGeneratorARM64 : public CodeGenerator { // reference field load when Baker's read barriers are used. void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check, bool use_load_acquire); // Fast path implementation of ReadBarrier::Barrier for a heap // reference array load when Baker's read barriers are used. void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t data_offset, Location index, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check); // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier // and GenerateArrayLoadWithBakerReadBarrier. 
void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, Location ref, - vixl::Register obj, + vixl::aarch64::Register obj, uint32_t offset, Location index, size_t scale_factor, - vixl::Register temp, + vixl::aarch64::Register temp, bool needs_null_check, bool use_load_acquire); @@ -597,24 +652,25 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateExplicitNullCheck(HNullCheck* instruction); private: - using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>; - using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::Literal<uint32_t>*>; + using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::aarch64::Literal<uint64_t>*>; + using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::aarch64::Literal<uint32_t>*>; using MethodToLiteralMap = ArenaSafeMap<MethodReference, - vixl::Literal<uint64_t>*, + vixl::aarch64::Literal<uint64_t>*, MethodReferenceComparator>; using BootStringToLiteralMap = ArenaSafeMap<StringReference, - vixl::Literal<uint32_t>*, + vixl::aarch64::Literal<uint32_t>*, StringReferenceValueComparator>; using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, - vixl::Literal<uint32_t>*, + vixl::aarch64::Literal<uint32_t>*, TypeReferenceValueComparator>; - vixl::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); - vixl::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value); - vixl::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method, - MethodToLiteralMap* map); - vixl::Literal<uint64_t>* DeduplicateMethodAddressLiteral(MethodReference target_method); - vixl::Literal<uint64_t>* DeduplicateMethodCodeLiteral(MethodReference target_method); + vixl::aarch64::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value, + Uint32ToLiteralMap* map); + vixl::aarch64::Literal<uint64_t>* DeduplicateUint64Literal(uint64_t value); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodLiteral(MethodReference target_method, + MethodToLiteralMap* map); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodAddressLiteral(MethodReference target_method); + vixl::aarch64::Literal<uint64_t>* DeduplicateMethodCodeLiteral(MethodReference target_method); // The PcRelativePatchInfo is used for PC-relative addressing of dex cache arrays // and boot image strings/types. The only difference is the interpretation of the @@ -626,21 +682,21 @@ class CodeGeneratorARM64 : public CodeGenerator { const DexFile& target_dex_file; // Either the dex cache array element offset or the string/type index. uint32_t offset_or_index; - vixl::Label label; - vixl::Label* pc_insn_label; + vixl::aarch64::Label label; + vixl::aarch64::Label* pc_insn_label; }; - vixl::Label* NewPcRelativePatch(const DexFile& dex_file, - uint32_t offset_or_index, - vixl::Label* adrp_label, - ArenaDeque<PcRelativePatchInfo>* patches); + vixl::aarch64::Label* NewPcRelativePatch(const DexFile& dex_file, + uint32_t offset_or_index, + vixl::aarch64::Label* adrp_label, + ArenaDeque<PcRelativePatchInfo>* patches); void EmitJumpTables(); // Labels for each block that will be compiled. - // We use a deque so that the `vixl::Label` objects do not move in memory. - ArenaDeque<vixl::Label> block_labels_; // Indexed by block id. - vixl::Label frame_entry_label_; + // We use a deque so that the `vixl::aarch64::Label` objects do not move in memory. + ArenaDeque<vixl::aarch64::Label> block_labels_; // Indexed by block id. 
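The block_labels_ comment above is about pointer stability: labels (and the patch-info entries) are referenced by address once handed to the assembler, so the container must never relocate its elements. A deque grows in chunks and keeps existing elements in place, whereas a vector may reallocate its whole buffer. A tiny illustration using the standard containers (standard-library behavior, not ART code):

    // Sketch: why a deque is safe to hand out element pointers from while it grows.
    #include <cassert>
    #include <deque>
    #include <vector>

    int main() {
      std::deque<int> labels;
      labels.push_back(1);
      int* stable = &labels.front();
      for (int i = 0; i < 10000; ++i) labels.push_back(i);  // existing elements never move
      assert(stable == &labels.front());                    // pointer still valid

      std::vector<int> vec;
      vec.push_back(1);
      // A pointer to vec.front() taken here could dangle once later push_backs reallocate.
      return 0;
    }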
+ vixl::aarch64::Label frame_entry_label_; ArenaVector<std::unique_ptr<JumpTableARM64>> jump_tables_; LocationsBuilderARM64 location_builder_; @@ -659,7 +715,7 @@ class CodeGeneratorARM64 : public CodeGenerator { MethodToLiteralMap call_patches_; // Relative call patch info. // Using ArenaDeque<> which retains element addresses on push/emplace_back(). - ArenaDeque<MethodPatchInfo<vixl::Label>> relative_call_patches_; + ArenaDeque<MethodPatchInfo<vixl::aarch64::Label>> relative_call_patches_; // PC-relative DexCache access info. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; // Deduplication map for boot string literals for kBootImageLinkTimeAddress. diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 37f1c35c50..a7fbc84340 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -145,9 +145,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return MipsReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS : public SlowPathCodeMIPS { public: @@ -351,14 +351,12 @@ class SuspendCheckSlowPathMIPS : public SlowPathCodeMIPS { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this, IsDirectEntrypoint(kQuickTestSuspend)); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -415,7 +413,7 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { this, IsDirectEntrypoint(kQuickInstanceofNonTrivial)); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); mips_codegen->MoveLocation(locations->Out(), ret_loc, ret_type); @@ -482,19 +480,30 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, move_resolver_(graph->GetArena(), this), assembler_(graph->GetArena(), &isa_features), isa_features_(isa_features), + uint32_literals_(std::less<uint32_t>(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), method_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), call_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) { + pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_string_patches_(StringReferenceValueComparator(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + 
pc_relative_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_type_patches_(TypeReferenceValueComparator(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + boot_image_address_patches_(std::less<uint32_t>(), + graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), + clobbered_ra_(false) { // Save RA (containing the return address) to mimic Quick. AddAllocatedRegister(Location::RegisterLocation(RA)); } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() void CodeGeneratorMIPS::Finalize(CodeAllocator* allocator) { // Ensure that we fix up branches. @@ -688,6 +697,16 @@ void CodeGeneratorMIPS::ComputeSpillMask() { if ((fpu_spill_mask_ != 0) && (POPCOUNT(core_spill_mask_) % 2 != 0)) { core_spill_mask_ |= (1 << ZERO); } + // If RA is clobbered by PC-relative operations on R2 and it's the only spilled register + // (this can happen in leaf methods), artificially spill the ZERO register in order to + // force explicit saving and restoring of RA. RA isn't saved/restored when it's the only + // spilled register. + // TODO: Can this be improved? It causes creation of a stack frame (while RA might be + // saved in an unused temporary register) and saving of RA and the current method pointer + // in the frame. + if (clobbered_ra_ && core_spill_mask_ == (1u << RA) && fpu_spill_mask_ == 0) { + core_spill_mask_ |= (1 << ZERO); + } } static dwarf::Reg DWARFReg(Register reg) { @@ -962,7 +981,12 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch size_t size = method_patches_.size() + call_patches_.size() + - pc_relative_dex_cache_patches_.size(); + pc_relative_dex_cache_patches_.size() + + pc_relative_string_patches_.size() + + pc_relative_type_patches_.size() + + boot_image_string_patches_.size() + + boot_image_type_patches_.size() + + boot_image_address_patches_.size(); linker_patches->reserve(size); for (const auto& entry : method_patches_) { const MethodReference& target_method = entry.first; @@ -994,6 +1018,71 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch pc_rel_offset, base_element_offset)); } + for (const PcRelativePatchInfo& info : pc_relative_string_patches_) { + const DexFile& dex_file = info.target_dex_file; + size_t string_index = info.offset_or_index; + DCHECK(info.high_label.IsBound()); + uint32_t high_offset = __ GetLabelLocation(&info.high_label); + // On R2 we use HMipsComputeBaseMethodAddress and patch relative to + // the assembler's base label used for PC-relative literals. + uint32_t pc_rel_offset = info.pc_rel_label.IsBound() + ? 
__ GetLabelLocation(&info.pc_rel_label) + : __ GetPcRelBaseLabelLocation(); + linker_patches->push_back(LinkerPatch::RelativeStringPatch(high_offset, + &dex_file, + pc_rel_offset, + string_index)); + } + for (const PcRelativePatchInfo& info : pc_relative_type_patches_) { + const DexFile& dex_file = info.target_dex_file; + size_t type_index = info.offset_or_index; + DCHECK(info.high_label.IsBound()); + uint32_t high_offset = __ GetLabelLocation(&info.high_label); + // On R2 we use HMipsComputeBaseMethodAddress and patch relative to + // the assembler's base label used for PC-relative literals. + uint32_t pc_rel_offset = info.pc_rel_label.IsBound() + ? __ GetLabelLocation(&info.pc_rel_label) + : __ GetPcRelBaseLabelLocation(); + linker_patches->push_back(LinkerPatch::RelativeTypePatch(high_offset, + &dex_file, + pc_rel_offset, + type_index)); + } + for (const auto& entry : boot_image_string_patches_) { + const StringReference& target_string = entry.first; + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::StringPatch(literal_offset, + target_string.dex_file, + target_string.string_index)); + } + for (const auto& entry : boot_image_type_patches_) { + const TypeReference& target_type = entry.first; + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::TypePatch(literal_offset, + target_type.dex_file, + target_type.type_index)); + } + for (const auto& entry : boot_image_address_patches_) { + DCHECK(GetCompilerOptions().GetIncludePatchInformation()); + Literal* literal = entry.second; + DCHECK(literal->GetLabel()->IsBound()); + uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); + linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); + } +} + +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeStringPatch( + const DexFile& dex_file, uint32_t string_index) { + return NewPcRelativePatch(dex_file, string_index, &pc_relative_string_patches_); +} + +CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeTypePatch( + const DexFile& dex_file, uint32_t type_index) { + return NewPcRelativePatch(dex_file, type_index, &pc_relative_type_patches_); } CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativeDexCacheArrayPatch( @@ -1007,6 +1096,12 @@ CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewPcRelativePatch( return &patches->back(); } +Literal* CodeGeneratorMIPS::DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map) { + return map->GetOrCreate( + value, + [this, value]() { return __ NewLiteral<uint32_t>(value); }); +} + Literal* CodeGeneratorMIPS::DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map) { return map->GetOrCreate( @@ -1022,6 +1117,26 @@ Literal* CodeGeneratorMIPS::DeduplicateMethodCodeLiteral(MethodReference target_ return DeduplicateMethodLiteral(target_method, &call_patches_); } +Literal* CodeGeneratorMIPS::DeduplicateBootImageStringLiteral(const DexFile& dex_file, + uint32_t string_index) { + return boot_image_string_patches_.GetOrCreate( + StringReference(&dex_file, string_index), + [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); +} + +Literal* CodeGeneratorMIPS::DeduplicateBootImageTypeLiteral(const DexFile& dex_file, + uint32_t type_index) { + return 
boot_image_type_patches_.GetOrCreate( + TypeReference(&dex_file, type_index), + [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); }); +} + +Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) { + bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); + Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_; + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); +} + void CodeGeneratorMIPS::MarkGCCard(Register object, Register value) { MipsLabel done; Register card = AT; @@ -1030,7 +1145,7 @@ void CodeGeneratorMIPS::MarkGCCard(Register object, Register value) { __ LoadFromOffset(kLoadWord, card, TR, - Thread::CardTableOffset<kMipsWordSize>().Int32Value()); + Thread::CardTableOffset<kMipsPointerSize>().Int32Value()); __ Srl(temp, object, gc::accounting::CardTable::kCardShift); __ Addu(temp, card, temp); __ Sb(card, temp, 0); @@ -1067,6 +1182,15 @@ void CodeGeneratorMIPS::SetupBlockedRegisters() const { blocked_fpu_registers_[i] = true; } + if (GetGraph()->IsDebuggable()) { + // Stubs do not save callee-save floating point registers. If the graph + // is debuggable, we need to deal with these registers differently. For + // now, just block them. + for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { + blocked_fpu_registers_[kFpuCalleeSaves[i]] = true; + } + } + UpdateBlockedPairRegisters(); } @@ -1113,7 +1237,7 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kMipsWordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path, @@ -1164,7 +1288,7 @@ void InstructionCodeGeneratorMIPS::GenerateSuspendCheck(HSuspendCheck* instructi __ LoadFromOffset(kLoadUnsignedHalfword, TMP, TR, - Thread::ThreadFlagsOffset<kMipsWordSize>().Int32Value()); + Thread::ThreadFlagsOffset<kMipsPointerSize>().Int32Value()); if (successor == nullptr) { __ Bnez(TMP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1855,7 +1979,7 @@ void LocationsBuilderMIPS::VisitArraySet(HArraySet* instruction) { bool needs_runtime_call = instruction->NeedsTypeCheck(); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCall : LocationSummary::kNoCall); + needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); if (needs_runtime_call) { InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2467,7 +2591,7 @@ void InstructionCodeGeneratorMIPS::GenerateDivRemIntegral(HBinaryOperation* inst void LocationsBuilderMIPS::VisitDiv(HDiv* div) { Primitive::Type type = div->GetResultType(); LocationSummary::CallKind call_kind = (type == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -3430,7 +3554,7 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? 
LocationSummary::kCall : LocationSummary::kNoCall); + instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -3440,7 +3564,8 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field if (field_type == Primitive::kPrimLong) { locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimLong)); } else { - locations->SetOut(Location::RequiresFpuRegister()); + // Use Location::Any() to prevent situations when running out of available fp registers. + locations->SetOut(Location::Any()); // Need some temp core regs since FP results are returned in core registers Location reg = calling_convention.GetReturnLocation(Primitive::kPrimLong); locations->AddTemp(Location::RegisterLocation(reg.AsRegisterPairLow<Register>())); @@ -3505,11 +3630,23 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, IsDirectEntrypoint(kQuickA64Load)); CheckEntrypointTypes<kQuickA64Load, int64_t, volatile const int64_t*>(); if (type == Primitive::kPrimDouble) { - // Need to move to FP regs since FP results are returned in core registers. - __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), - locations->Out().AsFpuRegister<FRegister>()); - __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - locations->Out().AsFpuRegister<FRegister>()); + // FP results are returned in core registers. Need to move them. + Location out = locations->Out(); + if (out.IsFpuRegister()) { + __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), out.AsFpuRegister<FRegister>()); + __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + out.AsFpuRegister<FRegister>()); + } else { + DCHECK(out.IsDoubleStackSlot()); + __ StoreToOffset(kStoreWord, + locations->GetTemp(1).AsRegister<Register>(), + SP, + out.GetStackIndex()); + __ StoreToOffset(kStoreWord, + locations->GetTemp(2).AsRegister<Register>(), + SP, + out.GetStackIndex() + 4); + } } } else { if (!Primitive::IsFloatingPointType(type)) { @@ -3557,7 +3694,7 @@ void LocationsBuilderMIPS::HandleFieldSet(HInstruction* instruction, const Field bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? LocationSummary::kCall : LocationSummary::kNoCall); + instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -3568,7 +3705,8 @@ void LocationsBuilderMIPS::HandleFieldSet(HInstruction* instruction, const Field locations->SetInAt(1, Location::RegisterPairLocation( calling_convention.GetRegisterAt(2), calling_convention.GetRegisterAt(3))); } else { - locations->SetInAt(1, Location::RequiresFpuRegister()); + // Use Location::Any() to prevent situations when running out of available fp registers. + locations->SetInAt(1, Location::Any()); // Pass FP parameters in core registers. locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(3))); @@ -3627,10 +3765,28 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); if (type == Primitive::kPrimDouble) { // Pass FP parameters in core registers. 
- __ Mfc1(locations->GetTemp(1).AsRegister<Register>(), - locations->InAt(1).AsFpuRegister<FRegister>()); - __ MoveFromFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - locations->InAt(1).AsFpuRegister<FRegister>()); + Location in = locations->InAt(1); + if (in.IsFpuRegister()) { + __ Mfc1(locations->GetTemp(1).AsRegister<Register>(), in.AsFpuRegister<FRegister>()); + __ MoveFromFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + in.AsFpuRegister<FRegister>()); + } else if (in.IsDoubleStackSlot()) { + __ LoadFromOffset(kLoadWord, + locations->GetTemp(1).AsRegister<Register>(), + SP, + in.GetStackIndex()); + __ LoadFromOffset(kLoadWord, + locations->GetTemp(2).AsRegister<Register>(), + SP, + in.GetStackIndex() + 4); + } else { + DCHECK(in.IsConstant()); + DCHECK(in.GetConstant()->IsDoubleConstant()); + int64_t value = bit_cast<int64_t, double>(in.GetConstant()->AsDoubleConstant()->GetValue()); + __ LoadConst64(locations->GetTemp(2).AsRegister<Register>(), + locations->GetTemp(1).AsRegister<Register>(), + value); + } } codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pA64Store), instruction, @@ -3696,6 +3852,23 @@ void InstructionCodeGeneratorMIPS::VisitInstanceFieldSet(HInstanceFieldSet* inst HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetDexPc()); } +void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( + HInstruction* instruction ATTRIBUTE_UNUSED, + Location root, + Register obj, + uint32_t offset) { + Register root_reg = root.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } else { + // Plain GC root load with no read barrier. + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + // Note that GC roots are not affected by heap poisoning, thus we + // do not have to unpoison `root_reg` here. + } +} + void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath; @@ -3772,11 +3945,9 @@ void LocationsBuilderMIPS::VisitInvokeInterface(HInvokeInterface* invoke) { void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke) { // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kMipsPointerSize).Uint32Value(); Location receiver = invoke->GetLocations()->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); // Set the hidden argument. 
__ LoadConst32(invoke->GetLocations()->GetTemp(1).AsRegister<Register>(), @@ -3790,6 +3961,10 @@ void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + __ LoadFromOffset(kLoadWord, temp, temp, + mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kMipsPointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -3859,16 +4034,80 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS* codegen } HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( - HLoadString::LoadKind desired_string_load_kind ATTRIBUTE_UNUSED) { - // TODO: Implement other kinds. - return HLoadString::LoadKind::kDexCacheViaMethod; + HLoadString::LoadKind desired_string_load_kind) { + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } + // We disable PC-relative load when there is an irreducible loop, as the optimization + // is incompatible with it. + bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); + bool fallback_load = has_irreducible_loops; + switch (desired_string_load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!GetCompilerOptions().GetCompilePic()); + break; + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + DCHECK(GetCompilerOptions().GetCompilePic()); + break; + case HLoadString::LoadKind::kBootImageAddress: + break; + case HLoadString::LoadKind::kDexCacheAddress: + DCHECK(Runtime::Current()->UseJitCompilation()); + fallback_load = false; + break; + case HLoadString::LoadKind::kDexCachePcRelative: + DCHECK(!Runtime::Current()->UseJitCompilation()); + // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods + // with irreducible loops. + break; + case HLoadString::LoadKind::kDexCacheViaMethod: + fallback_load = false; + break; + } + if (fallback_load) { + desired_string_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; + } + return desired_string_load_kind; } HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - DCHECK_NE(desired_class_load_kind, HLoadClass::LoadKind::kReferrersClass); - // TODO: Implement other kinds. - return HLoadClass::LoadKind::kDexCacheViaMethod; + if (kEmitCompilerReadBarrier) { + UNIMPLEMENTED(FATAL) << "for read barrier"; + } + // We disable pc-relative load when there is an irreducible loop, as the optimization + // is incompatible with it. 
+ bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); + bool fallback_load = has_irreducible_loops; + switch (desired_class_load_kind) { + case HLoadClass::LoadKind::kReferrersClass: + fallback_load = false; + break; + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!GetCompilerOptions().GetCompilePic()); + break; + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + DCHECK(GetCompilerOptions().GetCompilePic()); + break; + case HLoadClass::LoadKind::kBootImageAddress: + break; + case HLoadClass::LoadKind::kDexCacheAddress: + DCHECK(Runtime::Current()->UseJitCompilation()); + fallback_load = false; + break; + case HLoadClass::LoadKind::kDexCachePcRelative: + DCHECK(!Runtime::Current()->UseJitCompilation()); + // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods + // with irreducible loops. + break; + case HLoadClass::LoadKind::kDexCacheViaMethod: + fallback_load = false; + break; + } + if (fallback_load) { + desired_class_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod; + } + return desired_class_load_kind; } Register CodeGeneratorMIPS::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, @@ -4046,7 +4285,7 @@ void CodeGeneratorMIPS::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke T9, callee_method.AsRegister<Register>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kMipsWordSize).Int32Value()); + kMipsPointerSize).Int32Value()); // T9() __ Jalr(T9); __ Nop(); @@ -4079,7 +4318,7 @@ void CodeGeneratorMIPS::GenerateVirtualCall(HInvokeVirtual* invoke, Location tem size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kMipsPointerSize).SizeValue(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); // temp = object->GetClass(); DCHECK(receiver.IsRegister()); @@ -4105,11 +4344,40 @@ void InstructionCodeGeneratorMIPS::VisitInvokeVirtual(HInvokeVirtual* invoke) { } void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { - InvokeRuntimeCallingConvention calling_convention; - CodeGenerator::CreateLoadClassLocationSummary( - cls, - Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - Location::RegisterLocation(V0)); + if (cls->NeedsAccessCheck()) { + InvokeRuntimeCallingConvention calling_convention; + CodeGenerator::CreateLoadClassLocationSummary( + cls, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Location::RegisterLocation(V0), + /* code_generator_supports_read_barrier */ false); // TODO: revisit this bool. + return; + } + + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall; + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); + HLoadClass::LoadKind load_kind = cls->GetLoadKind(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + if (codegen_->GetInstructionSetFeatures().IsR6()) { + break; + } + FALLTHROUGH_INTENDED; + // We need an extra register for PC-relative dex cache accesses. 
+ case HLoadClass::LoadKind::kDexCachePcRelative: + case HLoadClass::LoadKind::kReferrersClass: + case HLoadClass::LoadKind::kDexCacheViaMethod: + locations->SetInAt(0, Location::RequiresRegister()); + break; + default: + break; + } + locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) { @@ -4125,40 +4393,132 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) { return; } - Register out = locations->Out().AsRegister<Register>(); - Register current_method = locations->InAt(0).AsRegister<Register>(); - if (cls->IsReferrersClass()) { - DCHECK(!cls->CanCallRuntime()); - DCHECK(!cls->MustGenerateClinitCheck()); - __ LoadFromOffset(kLoadWord, out, current_method, - ArtMethod::DeclaringClassOffset().Int32Value()); - } else { - __ LoadFromOffset(kLoadWord, out, current_method, - ArtMethod::DexCacheResolvedTypesOffset(kMipsPointerSize).Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())); - - if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) { - DCHECK(cls->CanCallRuntime()); - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS( - cls, - cls, - cls->GetDexPc(), - cls->MustGenerateClinitCheck()); - codegen_->AddSlowPath(slow_path); - if (!cls->IsInDexCache()) { - __ Beqz(out, slow_path->GetEntryLabel()); - } - if (cls->MustGenerateClinitCheck()) { - GenerateClassInitializationCheck(slow_path, out); + HLoadClass::LoadKind load_kind = cls->GetLoadKind(); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + Register base_or_current_method_reg; + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); + break; + // We need an extra register for PC-relative dex cache accesses. + case HLoadClass::LoadKind::kDexCachePcRelative: + case HLoadClass::LoadKind::kReferrersClass: + case HLoadClass::LoadKind::kDexCacheViaMethod: + base_or_current_method_reg = locations->InAt(0).AsRegister<Register>(); + break; + default: + base_or_current_method_reg = ZERO; + break; + } + + bool generate_null_check = false; + switch (load_kind) { + case HLoadClass::LoadKind::kReferrersClass: { + DCHECK(!cls->CanCallRuntime()); + DCHECK(!cls->MustGenerateClinitCheck()); + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad(cls, + out_loc, + base_or_current_method_reg, + ArtMethod::DeclaringClassOffset().Int32Value()); + break; + } + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!kEmitCompilerReadBarrier); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), + cls->GetTypeIndex())); + break; + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { + DCHECK(!kEmitCompilerReadBarrier); + CodeGeneratorMIPS::PcRelativePatchInfo* info = + codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); + if (isR6) { + __ Bind(&info->high_label); + __ Bind(&info->pc_rel_label); + // Add a 32-bit offset to PC. 
+ __ Auipc(out, /* placeholder */ 0x1234); + __ Addiu(out, out, /* placeholder */ 0x5678); } else { - __ Bind(slow_path->GetExitLabel()); + __ Bind(&info->high_label); + __ Lui(out, /* placeholder */ 0x1234); + // We do not bind info->pc_rel_label here, we'll use the assembler's label + // for PC-relative literals and the base from HMipsComputeBaseMethodAddress. + __ Ori(out, out, /* placeholder */ 0x5678); + // Add a 32-bit offset to PC. + __ Addu(out, out, base_or_current_method_reg); } + break; + } + case HLoadClass::LoadKind::kBootImageAddress: { + DCHECK(!kEmitCompilerReadBarrier); + DCHECK_NE(cls->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress()); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageAddressLiteral(address)); + break; + } + case HLoadClass::LoadKind::kDexCacheAddress: { + DCHECK_NE(cls->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress()); + static_assert(sizeof(GcRoot<mirror::Class>) == 4u, "Expected GC root to be 4 bytes."); + DCHECK_ALIGNED(cls->GetAddress(), 4u); + int16_t offset = Low16Bits(address); + uint32_t base_address = address - offset; // This accounts for offset sign extension. + __ Lui(out, High16Bits(base_address)); + // /* GcRoot<mirror::Class> */ out = *(base_address + offset) + GenerateGcRootFieldLoad(cls, out_loc, out, offset); + generate_null_check = !cls->IsInDexCache(); + break; + } + case HLoadClass::LoadKind::kDexCachePcRelative: { + HMipsDexCacheArraysBase* base = cls->InputAt(0)->AsMipsDexCacheArraysBase(); + int32_t offset = + cls->GetDexCacheElementOffset() - base->GetElementOffset() - kDexCacheArrayLwOffset; + // /* GcRoot<mirror::Class> */ out = *(dex_cache_arrays_base + offset) + GenerateGcRootFieldLoad(cls, out_loc, base_or_current_method_reg, offset); + generate_null_check = !cls->IsInDexCache(); + break; + } + case HLoadClass::LoadKind::kDexCacheViaMethod: { + // /* GcRoot<mirror::Class>[] */ out = + // current_method.ptr_sized_fields_->dex_cache_resolved_types_ + __ LoadFromOffset(kLoadWord, + out, + base_or_current_method_reg, + ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value()); + // /* GcRoot<mirror::Class> */ out = out[type_index] + size_t offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex()); + GenerateGcRootFieldLoad(cls, out_loc, out, offset); + generate_null_check = !cls->IsInDexCache(); + } + } + + if (generate_null_check || cls->MustGenerateClinitCheck()) { + DCHECK(cls->CanCallRuntime()); + SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathMIPS( + cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck()); + codegen_->AddSlowPath(slow_path); + if (generate_null_check) { + __ Beqz(out, slow_path->GetEntryLabel()); + } + if (cls->MustGenerateClinitCheck()) { + GenerateClassInitializationCheck(slow_path, out); + } else { + __ Bind(slow_path->GetExitLabel()); } } } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kMipsWordSize>().Int32Value(); + return Thread::ExceptionOffset<kMipsPointerSize>().Int32Value(); } void LocationsBuilderMIPS::VisitLoadException(HLoadException* load) { @@ -4181,28 +4541,97 @@ void InstructionCodeGeneratorMIPS::VisitClearException(HClearException* clear AT } void LocationsBuilderMIPS::VisitLoadString(HLoadString* load) { - LocationSummary::CallKind call_kind = load->NeedsEnvironment() + LocationSummary::CallKind call_kind = (load->NeedsEnvironment() || kEmitCompilerReadBarrier) ? 
LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind); - locations->SetInAt(0, Location::RequiresRegister()); + HLoadString::LoadKind load_kind = load->GetLoadKind(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + if (codegen_->GetInstructionSetFeatures().IsR6()) { + break; + } + FALLTHROUGH_INTENDED; + // We need an extra register for PC-relative dex cache accesses. + case HLoadString::LoadKind::kDexCachePcRelative: + case HLoadString::LoadKind::kDexCacheViaMethod: + locations->SetInAt(0, Location::RequiresRegister()); + break; + default: + break; + } locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) { + HLoadString::LoadKind load_kind = load->GetLoadKind(); LocationSummary* locations = load->GetLocations(); - Register out = locations->Out().AsRegister<Register>(); - Register current_method = locations->InAt(0).AsRegister<Register>(); - __ LoadFromOffset(kLoadWord, out, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + Register base_or_current_method_reg; + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + switch (load_kind) { + // We need an extra register for PC-relative literals on R2. + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>(); + break; + default: + base_or_current_method_reg = ZERO; + break; + } - if (!load->IsInDexCache()) { - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); - codegen_->AddSlowPath(slow_path); - __ Beqz(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + switch (load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + DCHECK(!kEmitCompilerReadBarrier); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageStringLiteral(load->GetDexFile(), + load->GetStringIndex())); + return; // No dex cache slow path. + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: { + DCHECK(!kEmitCompilerReadBarrier); + CodeGeneratorMIPS::PcRelativePatchInfo* info = + codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); + if (isR6) { + __ Bind(&info->high_label); + __ Bind(&info->pc_rel_label); + // Add a 32-bit offset to PC. + __ Auipc(out, /* placeholder */ 0x1234); + __ Addiu(out, out, /* placeholder */ 0x5678); + } else { + __ Bind(&info->high_label); + __ Lui(out, /* placeholder */ 0x1234); + // We do not bind info->pc_rel_label here, we'll use the assembler's label + // for PC-relative literals and the base from HMipsComputeBaseMethodAddress. + __ Ori(out, out, /* placeholder */ 0x5678); + // Add a 32-bit offset to PC. + __ Addu(out, out, base_or_current_method_reg); + } + return; // No dex cache slow path. 
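Aside on the hi/lo placeholder scheme used in the R2 and R6 sequences above: Lui+Ori simply splits a 32-bit value into unsigned halves, while the kDexCacheAddress path in VisitLoadClass pairs Lui with a sign-extending 16-bit load offset, which is why it computes base_address = address - offset first. The following is a minimal standalone sketch, not ART code (the helper names are invented), of why that compensation round-trips; it assumes two's-complement int16_t conversion, which holds on all relevant targets.

// Hypothetical sketch, not ART code: models the hi/lo split used by the
// lui+ori pair (logical OR, no sign extension) and the lui + sign-extended
// 16-bit offset pair used for kDexCacheAddress-style accesses.
#include <cassert>
#include <cstdint>

namespace {

uint16_t HighHalf(uint32_t value) { return static_cast<uint16_t>(value >> 16); }
uint16_t LowHalf(uint32_t value) { return static_cast<uint16_t>(value & 0xffffu); }

// lui reg, hi; ori reg, reg, lo  -- ori zero-extends, so no compensation is needed.
uint32_t LuiOri(uint16_t hi, uint16_t lo) {
  return (static_cast<uint32_t>(hi) << 16) | lo;
}

// lui reg, hi; lw out, offset(reg)  -- the 16-bit offset is sign-extended by
// the hardware, so the effective address is (hi << 16) + sext(offset).
uint32_t LuiPlusSignExtendedOffset(uint16_t hi, int16_t offset) {
  return (static_cast<uint32_t>(hi) << 16) + static_cast<uint32_t>(static_cast<int32_t>(offset));
}

}  // namespace

int main() {
  const uint32_t address = 0x1234abcdu;  // low half has bit 15 set -> negative as int16_t

  // Plain lui+ori materialization.
  assert(LuiOri(HighHalf(address), LowHalf(address)) == address);

  // lui + sign-extended offset: subtract the sign-extended low half from the
  // address before taking the high half, exactly as the kDexCacheAddress case
  // does with base_address = address - offset.
  const int16_t offset = static_cast<int16_t>(LowHalf(address));
  const uint32_t base_address = address - static_cast<uint32_t>(static_cast<int32_t>(offset));
  assert(LuiPlusSignExtendedOffset(HighHalf(base_address), offset) == address);
  return 0;
}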
+ } + case HLoadString::LoadKind::kBootImageAddress: { + DCHECK(!kEmitCompilerReadBarrier); + DCHECK_NE(load->GetAddress(), 0u); + uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); + __ LoadLiteral(out, + base_or_current_method_reg, + codegen_->DeduplicateBootImageAddressLiteral(address)); + return; // No dex cache slow path. + } + default: + break; } + + // TODO: Re-add the compiler code to do string dex cache lookup again. + SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderMIPS::VisitLongConstant(HLongConstant* constant) { @@ -4216,7 +4645,7 @@ void InstructionCodeGeneratorMIPS::VisitLongConstant(HLongConstant* constant ATT void LocationsBuilderMIPS::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -4395,7 +4824,7 @@ void InstructionCodeGeneratorMIPS::VisitNeg(HNeg* instruction) { void LocationsBuilderMIPS::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); @@ -4410,7 +4839,7 @@ void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) { // Move an uint16_t value to a register. __ LoadConst32(calling_convention.GetRegisterAt(0), instruction->GetTypeIndex()); codegen_->InvokeRuntime( - GetThreadOffset<kMipsWordSize>(instruction->GetEntrypoint()).Int32Value(), + GetThreadOffset<kMipsPointerSize>(instruction->GetEntrypoint()).Int32Value(), instruction, instruction->GetDexPc(), nullptr, @@ -4421,7 +4850,7 @@ void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) { void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -4436,7 +4865,7 @@ void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsWordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value()); __ Jalr(T9); @@ -4444,7 +4873,7 @@ void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); } else { codegen_->InvokeRuntime( - GetThreadOffset<kMipsWordSize>(instruction->GetEntrypoint()).Int32Value(), + GetThreadOffset<kMipsPointerSize>(instruction->GetEntrypoint()).Int32Value(), instruction, instruction->GetDexPc(), nullptr, @@ -4591,7 +5020,7 @@ void InstructionCodeGeneratorMIPS::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED) void LocationsBuilderMIPS::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - (type == Primitive::kPrimInt) ? LocationSummary::kNoCall : LocationSummary::kCall; + (type == Primitive::kPrimInt) ? LocationSummary::kNoCall : LocationSummary::kCallOnMainOnly; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -4828,7 +5257,7 @@ void InstructionCodeGeneratorMIPS::VisitSuspendCheck(HSuspendCheck* instruction) void LocationsBuilderMIPS::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -4857,7 +5286,7 @@ void LocationsBuilderMIPS::VisitTypeConversion(HTypeConversion* conversion) { if (!isR6 && ((Primitive::IsFloatingPointType(result_type) && input_type == Primitive::kPrimLong) || (result_type == Primitive::kPrimLong && Primitive::IsFloatingPointType(input_type)))) { - call_kind = LocationSummary::kCall; + call_kind = LocationSummary::kCallOnMainOnly; } LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -5325,6 +5754,7 @@ void InstructionCodeGeneratorMIPS::VisitMipsComputeBaseMethodAddress( __ Nal(); // Grab the return address off RA. __ Move(reg, RA); + // TODO: Can we share this code with that of VisitMipsDexCacheArraysBase()? // Remember this offset (the obtained PC value) for later use with constant area. __ BindPcRelBaseLabel(); @@ -5355,6 +5785,7 @@ void InstructionCodeGeneratorMIPS::VisitMipsDexCacheArraysBase(HMipsDexCacheArra __ Ori(reg, reg, /* placeholder */ 0x5678); // Add a 32-bit offset to PC. __ Addu(reg, reg, RA); + // TODO: Can we share this code with that of VisitMipsComputeBaseMethodAddress()? 
} } @@ -5378,18 +5809,25 @@ void LocationsBuilderMIPS::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorMIPS::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kMipsPointerSize).SizeValue(); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + method_offset); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kMipsPointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kMipsPointerSize)); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value()); + __ LoadFromOffset(kLoadWord, + locations->Out().AsRegister<Register>(), + locations->Out().AsRegister<Register>(), + method_offset); } - __ LoadFromOffset(kLoadWord, - locations->Out().AsRegister<Register>(), - locations->InAt(0).AsRegister<Register>(), - method_offset); } #undef __ diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 08f74c04d1..63a0345c1c 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -18,11 +18,12 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_MIPS_H_ #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "string_reference.h" #include "utils/mips/assembler_mips.h" +#include "utils/type_reference.h" namespace art { namespace mips { @@ -226,6 +227,15 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void HandleShift(HBinaryOperation* operation); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); + // Generate a GC root reference load: + // + // root <- *(obj + offset) + // + // while honoring read barriers (if any). + void GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset); void GenerateIntCompare(IfCondition cond, LocationSummary* locations); void GenerateIntCompareAndBranch(IfCondition cond, LocationSummary* locations, @@ -298,6 +308,9 @@ class CodeGeneratorMIPS : public CodeGenerator { size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id); size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id); size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id); + void ClobberRA() { + clobbered_ra_ = true; + } void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE; @@ -383,7 +396,7 @@ class CodeGeneratorMIPS : public CodeGenerator { PcRelativePatchInfo(PcRelativePatchInfo&& other) = default; const DexFile& target_dex_file; - // Either the dex cache array element offset or the string index. + // Either the dex cache array element offset or the string/type index. 
uint32_t offset_or_index; // Label for the instruction loading the most significant half of the offset that's added to PC // to form the base address (the least significant half is loaded with the instruction that @@ -393,14 +406,27 @@ class CodeGeneratorMIPS : public CodeGenerator { MipsLabel pc_rel_label; }; + PcRelativePatchInfo* NewPcRelativeStringPatch(const DexFile& dex_file, uint32_t string_index); + PcRelativePatchInfo* NewPcRelativeTypePatch(const DexFile& dex_file, uint32_t type_index); PcRelativePatchInfo* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset); + Literal* DeduplicateBootImageStringLiteral(const DexFile& dex_file, uint32_t string_index); + Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, uint32_t type_index); + Literal* DeduplicateBootImageAddressLiteral(uint32_t address); private: Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); + using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>; using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; - + using BootStringToLiteralMap = ArenaSafeMap<StringReference, + Literal*, + StringReferenceValueComparator>; + using BootTypeToLiteralMap = ArenaSafeMap<TypeReference, + Literal*, + TypeReferenceValueComparator>; + + Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map); Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map); Literal* DeduplicateMethodAddressLiteral(MethodReference target_method); Literal* DeduplicateMethodCodeLiteral(MethodReference target_method); @@ -417,11 +443,27 @@ class CodeGeneratorMIPS : public CodeGenerator { MipsAssembler assembler_; const MipsInstructionSetFeatures& isa_features_; + // Deduplication map for 32-bit literals, used for non-patchable boot image addresses. + Uint32ToLiteralMap uint32_literals_; // Method patch info, map MethodReference to a literal for method address and method code. MethodToLiteralMap method_patches_; MethodToLiteralMap call_patches_; // PC-relative patch info for each HMipsDexCacheArraysBase. ArenaDeque<PcRelativePatchInfo> pc_relative_dex_cache_patches_; + // Deduplication map for boot string literals for kBootImageLinkTimeAddress. + BootStringToLiteralMap boot_image_string_patches_; + // PC-relative String patch info. + ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_; + // Deduplication map for boot type literals for kBootImageLinkTimeAddress. + BootTypeToLiteralMap boot_image_type_patches_; + // PC-relative type patch info. + ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; + // Deduplication map for patchable boot image addresses. + Uint32ToLiteralMap boot_image_address_patches_; + + // PC-relative loads on R2 clobber RA, which may need to be preserved explicitly in leaf methods. + // This is a flag set by pc_relative_fixups_mips and dex_cache_array_fixups_mips optimizations. + bool clobbered_ra_; DISALLOW_COPY_AND_ASSIGN(CodeGeneratorMIPS); }; diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 2e78884daf..4a5755c925 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -102,9 +102,9 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return Mips64ReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. 
-#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { public: @@ -300,13 +300,11 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ Bc(GetReturnLabel()); } else { @@ -362,7 +360,7 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { dex_pc, this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); Primitive::Type ret_type = instruction_->GetType(); Location ret_loc = calling_convention.GetReturnLocation(ret_type); mips64_codegen->MoveLocation(locations->Out(), ret_loc, ret_type); @@ -429,9 +427,9 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() void CodeGeneratorMIPS64::Finalize(CodeAllocator* allocator) { // Ensure that we fix up branches. 
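For readers unfamiliar with the "__" shorthand these NOLINT comments keep being attached to: the macro deliberately expands to an expression ending in "->", so clang-tidy's misc-macro-parentheses fix-it (wrapping the replacement text in parentheses) would produce code that does not compile, hence the suppression. Below is a toy, self-contained illustration of the pattern; the class and function names are invented and this is not ART code.

// Toy illustration of the codegen "__" macro idiom; names are invented.
#include <iostream>

class ToyAssembler {
 public:
  void Addu(const char* rd, const char* rs, const char* rt) {
    std::cout << "addu " << rd << ", " << rs << ", " << rt << "\n";
  }
};

class ToyCodeGenerator {
 public:
  ToyAssembler* GetAssembler() { return &assembler_; }

 private:
  ToyAssembler assembler_;
};

// Ends in "->" on purpose: "__ Addu(...)" expands to
// "codegen->GetAssembler()->Addu(...)". Parenthesizing the body, as the
// misc-macro-parentheses fix-it would do, cannot work here.
#define __ codegen->GetAssembler()->  // NOLINT

void EmitExample(ToyCodeGenerator* codegen) {
  __ Addu("v0", "a0", "a1");  // emits via the shared assembler
}

#undef __

int main() {
  ToyCodeGenerator codegen;
  EmitExample(&codegen);
  return 0;
}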
@@ -888,7 +886,7 @@ void CodeGeneratorMIPS64::MarkGCCard(GpuRegister object, __ LoadFromOffset(kLoadDoubleword, card, TR, - Thread::CardTableOffset<kMips64DoublewordSize>().Int32Value()); + Thread::CardTableOffset<kMips64PointerSize>().Int32Value()); __ Dsrl(temp, object, gc::accounting::CardTable::kCardShift); __ Daddu(temp, card, temp); __ Sb(card, temp, 0); @@ -964,7 +962,7 @@ void CodeGeneratorMIPS64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kMips64DoublewordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1004,7 +1002,7 @@ void InstructionCodeGeneratorMIPS64::GenerateSuspendCheck(HSuspendCheck* instruc __ LoadFromOffset(kLoadUnsignedHalfword, TMP, TR, - Thread::ThreadFlagsOffset<kMips64DoublewordSize>().Int32Value()); + Thread::ThreadFlagsOffset<kMips64PointerSize>().Int32Value()); if (successor == nullptr) { __ Bnezc(TMP, slow_path->GetEntryLabel()); __ Bind(slow_path->GetReturnLabel()); @@ -1436,7 +1434,7 @@ void LocationsBuilderMIPS64::VisitArraySet(HArraySet* instruction) { bool needs_runtime_call = instruction->NeedsTypeCheck(); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCall : LocationSummary::kNoCall); + needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); if (needs_runtime_call) { InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2932,11 +2930,9 @@ void LocationsBuilderMIPS64::VisitInvokeInterface(HInvokeInterface* invoke) { void InstructionCodeGeneratorMIPS64::VisitInvokeInterface(HInvokeInterface* invoke) { // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError. GpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kMips64PointerSize).Uint32Value(); Location receiver = invoke->GetLocations()->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); // Set the hidden argument. 
__ LoadConst32(invoke->GetLocations()->GetTemp(1).AsRegister<GpuRegister>(), @@ -2950,6 +2946,10 @@ void InstructionCodeGeneratorMIPS64::VisitInvokeInterface(HInvokeInterface* invo __ LoadFromOffset(kLoadUnsignedWord, temp, receiver.AsRegister<GpuRegister>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + __ LoadFromOffset(kLoadDoubleword, temp, temp, + mirror::Class::ImtPtrOffset(kMips64PointerSize).Uint32Value()); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kMips64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ LoadFromOffset(kLoadDoubleword, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -3113,7 +3113,7 @@ void CodeGeneratorMIPS64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo T9, callee_method.AsRegister<GpuRegister>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kMips64DoublewordSize).Int32Value()); + kMips64PointerSize).Int32Value()); // T9() __ Jalr(T9); __ Nop(); @@ -3151,7 +3151,7 @@ void CodeGeneratorMIPS64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t size_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( invoke->GetVTableIndex(), kMips64PointerSize).SizeValue(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); - Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); // temp = object->GetClass(); __ LoadFromOffset(kLoadUnsignedWord, temp, receiver, class_offset); @@ -3229,7 +3229,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) { } static int32_t GetExceptionTlsOffset() { - return Thread::ExceptionOffset<kMips64DoublewordSize>().Int32Value(); + return Thread::ExceptionOffset<kMips64PointerSize>().Int32Value(); } void LocationsBuilderMIPS64::VisitLoadException(HLoadException* load) { @@ -3261,22 +3261,11 @@ void LocationsBuilderMIPS64::VisitLoadString(HLoadString* load) { } void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) { - LocationSummary* locations = load->GetLocations(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - GpuRegister current_method = locations->InAt(0).AsRegister<GpuRegister>(); - __ LoadFromOffset(kLoadUnsignedWord, out, current_method, - ArtMethod::DeclaringClassOffset().Int32Value()); - __ LoadFromOffset(kLoadDoubleword, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - __ LoadFromOffset( - kLoadUnsignedWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); - // TODO: We will need a read barrier here. - - if (!load->IsInDexCache()) { - SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); - codegen_->AddSlowPath(slow_path); - __ Beqzc(out, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); + codegen_->AddSlowPath(slow_path); + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } void LocationsBuilderMIPS64::VisitLongConstant(HLongConstant* constant) { @@ -3290,7 +3279,7 @@ void InstructionCodeGeneratorMIPS64::VisitLongConstant(HLongConstant* constant A void LocationsBuilderMIPS64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -3417,7 +3406,7 @@ void InstructionCodeGeneratorMIPS64::VisitNeg(HNeg* instruction) { void LocationsBuilderMIPS64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimNot)); @@ -3438,7 +3427,7 @@ void InstructionCodeGeneratorMIPS64::VisitNewArray(HNewArray* instruction) { void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3454,7 +3443,7 @@ void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) // String is allocated through StringFactory. Call NewEmptyString entry point. GpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); MemberOffset code_offset = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64DoublewordSize); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); __ LoadFromOffset(kLoadDoubleword, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); __ LoadFromOffset(kLoadDoubleword, T9, temp, code_offset.Int32Value()); __ Jalr(T9); @@ -3598,7 +3587,8 @@ void InstructionCodeGeneratorMIPS64::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED void LocationsBuilderMIPS64::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = - Primitive::IsFloatingPointType(type) ? LocationSummary::kCall : LocationSummary::kNoCall; + Primitive::IsFloatingPointType(type) ? 
LocationSummary::kCallOnMainOnly + : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); switch (type) { @@ -3811,7 +3801,7 @@ void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instructio void LocationsBuilderMIPS64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 4b462cc800..197f86b22b 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -18,7 +18,6 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_MIPS64_H_ #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 1261619536..7aca16f867 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -47,9 +47,9 @@ static constexpr int kC2ConditionMask = 0x400; static constexpr int kFakeReturnRegister = Register(8); -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, x).Int32Value() class NullCheckSlowPathX86 : public SlowPathCode { public: @@ -140,12 +140,29 @@ class BoundsCheckSlowPathX86 : public SlowPathCode { // Live registers will be restored in the catch block if caught. SaveLiveRegisters(codegen, instruction_->GetLocations()); } + + // Are we using an array length from memory? + HInstruction* array_length = instruction_->InputAt(1); + Location length_loc = locations->InAt(1); InvokeRuntimeCallingConvention calling_convention; + if (array_length->IsArrayLength() && array_length->IsEmittedAtUseSite()) { + // Load the array length into our temporary. + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<Register>(), len_offset); + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(1)); + // Check for conflicts with index. + if (length_loc.Equals(locations->InAt(0))) { + // We know we aren't using parameter 2. 
+ length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(2)); + } + __ movl(length_loc.AsRegister<Register>(), array_len); + } x86_codegen->EmitParallelMoves( locations->InAt(0), Location::RegisterLocation(calling_convention.GetRegisterAt(0)), Primitive::kPrimInt, - locations->InAt(1), + length_loc, Location::RegisterLocation(calling_convention.GetRegisterAt(1)), Primitive::kPrimInt); uint32_t entry_point_offset = instruction_->AsBoundsCheck()->IsStringCharAt() @@ -175,13 +192,11 @@ class SuspendCheckSlowPathX86 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -332,7 +347,7 @@ class TypeCheckSlowPathX86 : public SlowPathCode { instruction_->GetDexPc(), this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); } else { DCHECK(instruction_->IsCheckCast()); x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), @@ -430,8 +445,8 @@ class ArraySetSlowPathX86 : public SlowPathCode { // Slow path marking an object during a read barrier. class ReadBarrierMarkSlowPathX86 : public SlowPathCode { public: - ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -439,9 +454,9 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -449,30 +464,40 @@ class ReadBarrierMarkSlowPathX86 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. 
CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); - x86_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - x86_codegen->Move32(out_, Location::RegisterLocation(EAX)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, ESP); + DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in EAX): + // + // EAX <- obj + // EAX <- ReadBarrierMark(EAX) + // obj <- EAX + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(reg); + // This runtime call does not require a stack map. + x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86); @@ -518,8 +543,7 @@ class ReadBarrierForHeapReferenceSlowPathX86 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -706,8 +730,8 @@ class ReadBarrierForRootSlowPathX86 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(GetAssembler())-> /* NOLINT */ +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
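[Aside: the ReadBarrierMarkSlowPathX86 hunk above replaces the generic pReadBarrierMark call with a per-register marking entrypoint. A minimal sketch of that idea follows, with invented names (ThreadModel, read_barrier_mark_reg); in the real code the offset comes from CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(reg).]

    #include <cstddef>

    // Illustration only, not ART code. The thread is assumed to expose one marking
    // entrypoint per core register, laid out consecutively, so the register holding
    // `obj` is both input and output and no moves through EAX are required.
    using ReadBarrierMarkFn = void* (*)(void* obj);

    struct ThreadModel {                           // invented stand-in for art::Thread
      ReadBarrierMarkFn read_barrier_mark_reg[8];  // one entry per x86 core register
    };

    // The generated call goes through base_offset + reg * pointer_size, without
    // saving live registers and without recording a stack map.
    constexpr size_t MarkEntryPointOffset(size_t base_offset, size_t reg, size_t pointer_size) {
      return base_offset + reg * pointer_size;
    }
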
+#define __ down_cast<X86Assembler*>(GetAssembler())-> // NOLINT inline Condition X86Condition(IfCondition cond) { switch (cond) { @@ -778,7 +802,7 @@ void CodeGeneratorX86::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kX86WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kX86PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -793,6 +817,13 @@ void CodeGeneratorX86::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorX86::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ fs()->call(Address::Absolute(entry_point_offset)); +} + CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const X86InstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -1548,15 +1579,15 @@ void LocationsBuilderX86::VisitSelect(HSelect* select) { locations->SetOut(Location::SameAsFirstInput()); } -void InstructionCodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { +void CodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) { Register lhs_reg = lhs.AsRegister<Register>(); if (rhs.IsConstant()) { int32_t value = CodeGenerator::GetInt32ValueOf(rhs.GetConstant()); - codegen_->Compare32BitValue(lhs_reg, value); + Compare32BitValue(lhs_reg, value); } else if (rhs.IsStackSlot()) { - __ cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); + assembler_.cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex())); } else { - __ cmpl(lhs_reg, rhs.AsRegister<Register>()); + assembler_.cmpl(lhs_reg, rhs.AsRegister<Register>()); } } @@ -1589,7 +1620,7 @@ void InstructionCodeGeneratorX86::VisitSelect(HSelect* select) { DCHECK_NE(condition->InputAt(0)->GetType(), Primitive::kPrimLong); DCHECK(!Primitive::IsFloatingPointType(condition->InputAt(0)->GetType())); LocationSummary* cond_locations = condition->GetLocations(); - GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); + codegen_->GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1)); cond = X86Condition(condition->GetCondition()); } } else { @@ -1698,7 +1729,7 @@ void InstructionCodeGeneratorX86::HandleCondition(HCondition* cond) { // Clear output register: setb only sets the low byte. __ xorl(reg, reg); - GenerateIntCompare(lhs, rhs); + codegen_->GenerateIntCompare(lhs, rhs); __ setb(X86Condition(cond->GetCondition()), reg); return; } @@ -2027,8 +2058,6 @@ void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) LocationSummary* locations = invoke->GetLocations(); Register temp = locations->GetTemp(0).AsRegister<Register>(); XmmRegister hidden_reg = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kX86PointerSize).Uint32Value(); Location receiver = locations->InAt(0); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); @@ -2055,11 +2084,16 @@ void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). 
__ MaybeUnpoisonHeapReference(temp); + // temp = temp->GetAddressOfIMT() + __ movl(temp, + Address(temp, mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value())); // temp = temp->GetImtEntryAt(method_offset); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kX86PointerSize)); __ movl(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address(temp, - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); + ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); DCHECK(!codegen_->IsLeafMethod()); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2182,7 +2216,7 @@ void LocationsBuilderX86::VisitTypeConversion(HTypeConversion* conversion) { LocationSummary::CallKind call_kind = ((input_type == Primitive::kPrimFloat || input_type == Primitive::kPrimDouble) && result_type == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); @@ -3437,7 +3471,7 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr void LocationsBuilderX86::VisitDiv(HDiv* div) { LocationSummary::CallKind call_kind = (div->GetResultType() == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(div, call_kind); @@ -3540,7 +3574,7 @@ void LocationsBuilderX86::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); LocationSummary::CallKind call_kind = (rem->GetResultType() == Primitive::kPrimLong) - ? LocationSummary::kCall + ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind); @@ -3982,7 +4016,7 @@ void InstructionCodeGeneratorX86::VisitUShr(HUShr* ushr) { void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3999,7 +4033,7 @@ void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
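[Aside: the VisitInvokeInterface hunk above swaps the embedded IMT entry load (and the `% kImtSize` masking) for an extra indirection through the class's ImTable pointer. A hedged sketch of the resulting two-load lookup, using invented stub types:]

    #include <cstddef>

    // Illustration only, not ART code; all names here are invented stand-ins.
    struct ArtMethodStub {};
    struct ImTableStub {
      ArtMethodStub* entries_[64];               // table size is an assumption
    };
    struct ClassStub {
      // The IMT entries used to be embedded in the class object itself; after this
      // change the class only holds a pointer to a (possibly shared) ImTable.
      ImTableStub* imt_;
    };

    ArtMethodStub* LookupImtEntry(ClassStub* klass, size_t imt_index) {
      ImTableStub* imt = klass->imt_;            // temp = temp->GetAddressOfIMT()
      return imt->entries_[imt_index];           // temp = temp->GetImtEntryAt(method_offset)
    }
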
Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize); __ fs()->movl(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString))); __ call(Address(temp, code_offset.Int32Value())); codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); @@ -4015,7 +4049,7 @@ void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -4070,16 +4104,21 @@ void LocationsBuilderX86::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorX86::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kX86PointerSize).SizeValue(); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->InAt(0).AsRegister<Register>(), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kX86PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kX86PointerSize)); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->InAt(0).AsRegister<Register>(), + mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value())); + // temp = temp->GetImtEntryAt(method_offset); + __ movl(locations->Out().AsRegister<Register>(), + Address(locations->Out().AsRegister<Register>(), method_offset)); } - __ movl(locations->Out().AsRegister<Register>(), - Address(locations->InAt(0).AsRegister<Register>(), method_offset)); } void LocationsBuilderX86::VisitNot(HNot* not_) { @@ -4172,7 +4211,7 @@ void InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) { case Primitive::kPrimShort: case Primitive::kPrimChar: case Primitive::kPrimInt: { - GenerateIntCompare(left, right); + codegen_->GenerateIntCompare(left, right); break; } case Primitive::kPrimLong: { @@ -4411,7 +4450,7 @@ void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, // (callee_method + offset_of_quick_compiled_code)() __ call(Address(callee_method.AsRegister<Register>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86WordSize).Int32Value())); + kX86PointerSize).Int32Value())); break; } @@ -4445,7 +4484,7 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp __ movl(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address( - temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); + temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); } void CodeGeneratorX86::RecordSimplePatch() { @@ -4549,7 +4588,7 @@ void 
CodeGeneratorX86::MarkGCCard(Register temp, __ testl(value, value); __ j(kEqual, &is_null); } - __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value())); + __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86PointerSize>().Int32Value())); __ movl(temp, object); __ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift)); __ movb(Address(temp, card, TIMES_1, 0), @@ -5510,10 +5549,16 @@ void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) { void LocationsBuilderX86::VisitArrayLength(HArrayLength* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + if (!instruction->IsEmittedAtUseSite()) { + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + } } void InstructionCodeGeneratorX86::VisitArrayLength(HArrayLength* instruction) { + if (instruction->IsEmittedAtUseSite()) { + return; + } + LocationSummary* locations = instruction->GetLocations(); uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction); Register obj = locations->InAt(0).AsRegister<Register>(); @@ -5528,7 +5573,10 @@ void LocationsBuilderX86::VisitBoundsCheck(HBoundsCheck* instruction) { : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RegisterOrConstant(instruction->InputAt(0))); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + HInstruction* length = instruction->InputAt(1); + if (!length->IsEmittedAtUseSite()) { + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + } if (instruction->HasUses()) { locations->SetOut(Location::SameAsFirstInput()); } @@ -5562,12 +5610,28 @@ void InstructionCodeGeneratorX86::VisitBoundsCheck(HBoundsCheck* instruction) { codegen_->AddSlowPath(slow_path); __ j(kAboveEqual, slow_path->GetEntryLabel()); } else { - Register length = length_loc.AsRegister<Register>(); - if (index_loc.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); - __ cmpl(length, Immediate(value)); + HInstruction* array_length = instruction->InputAt(1); + if (array_length->IsEmittedAtUseSite()) { + // Address the length field in the array. 
+ DCHECK(array_length->IsArrayLength()); + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<Register>(), len_offset); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(array_len, Immediate(value)); + } else { + __ cmpl(array_len, index_loc.AsRegister<Register>()); + } + codegen_->MaybeRecordImplicitNullCheck(array_length); } else { - __ cmpl(length, index_loc.AsRegister<Register>()); + Register length = length_loc.AsRegister<Register>(); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(length, Immediate(value)); + } else { + __ cmpl(length, index_loc.AsRegister<Register>()); + } } codegen_->AddSlowPath(slow_path); __ j(kBelowEqual, slow_path->GetEntryLabel()); @@ -5616,7 +5680,7 @@ void InstructionCodeGeneratorX86::GenerateSuspendCheck(HSuspendCheck* instructio DCHECK_EQ(slow_path->GetSuccessor(), successor); } - __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86WordSize>().Int32Value()), + __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>().Int32Value()), Immediate(0)); if (successor == nullptr) { __ j(kNotEqual, slow_path->GetEntryLabel()); @@ -6167,52 +6231,19 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) { codegen_->RecordSimplePatch(); return; // No dex cache slow path. } - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress()); - // /* GcRoot<mirror::String> */ out = *address - GenerateGcRootFieldLoad(load, out_loc, Address::Absolute(address)); - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - Register base_reg = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = load->GetDexCacheElementOffset(); - Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset); - // /* GcRoot<mirror::String> */ out = *(base + offset) /* PC-relative */ - GenerateGcRootFieldLoad( - load, out_loc, Address(base_reg, CodeGeneratorX86::kDummy32BitOffset), fixup_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - Register current_method = locations->InAt(0).AsRegister<Register>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value())); - - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex()))); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load); - codegen_->AddSlowPath(slow_path); - __ testl(out, out); - __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
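[Aside on the VisitBoundsCheck changes above: when the HArrayLength is emitted at its use site, the length is compared straight from memory against the index, and that memory access also serves as the implicit null check. A small sketch of the predicate the kBelowEqual branch implements, in plain C++ rather than ART code:]

    #include <cstdint>

    // cmpl(array_len, index) followed by j(kBelowEqual) branches to the slow path
    // when length <= index as an unsigned comparison, i.e. exactly when the index is
    // out of bounds; a negative index becomes a huge unsigned value and is caught too.
    bool IndexOutOfBounds(int32_t index, int32_t length) {
      return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
    }
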
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load); + codegen_->AddSlowPath(slow_path); + __ jmp(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static Address GetExceptionTlsAddress() { - return Address::Absolute(Thread::ExceptionOffset<kX86WordSize>().Int32Value()); + return Address::Absolute(Thread::ExceptionOffset<kX86PointerSize>().Int32Value()); } void LocationsBuilderX86::VisitLoadException(HLoadException* load) { @@ -6235,7 +6266,7 @@ void InstructionCodeGeneratorX86::VisitClearException(HClearException* clear ATT void LocationsBuilderX86::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6687,7 +6718,7 @@ void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderX86::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6926,10 +6957,10 @@ void InstructionCodeGeneratorX86::GenerateGcRootFieldLoad(HInstruction* instruct // Slow path used to mark the GC root `root`. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root); codegen_->AddSlowPath(slow_path); - __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86WordSize>().Int32Value()), + __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>().Int32Value()), Immediate(0)); __ j(kNotEqual, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); @@ -7036,12 +7067,6 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. // Note that this is a no-op, thanks to the x86 memory model. @@ -7056,13 +7081,18 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. 
We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 2a9fb80995..894f2e8f40 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -18,8 +18,8 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_ #include "arch/x86/instruction_set_features_x86.h" +#include "base/enums.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" @@ -29,7 +29,7 @@ namespace art { namespace x86 { // Use a local definition to prevent copying mistakes. -static constexpr size_t kX86WordSize = kX86PointerSize; +static constexpr size_t kX86WordSize = static_cast<size_t>(kX86PointerSize); class CodeGeneratorX86; @@ -295,7 +295,6 @@ class InstructionCodeGeneratorX86 : public InstructionCodeGenerator { HBasicBlock* default_block); void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double); - void GenerateIntCompare(Location lhs, Location rhs); X86Assembler* const assembler_; CodeGeneratorX86* const codegen_; @@ -336,6 +335,12 @@ class CodeGeneratorX86 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + size_t GetWordSize() const OVERRIDE { return kX86WordSize; } @@ -425,6 +430,8 @@ class CodeGeneratorX86 : public CodeGenerator { Register value, bool value_can_be_null); + void GenerateIntCompare(Location lhs, Location rhs); + void GenerateMemoryBarrier(MemBarrierKind kind); Label* GetLabelOf(HBasicBlock* block) const { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 5e30203b38..0c55ae44de 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -51,9 +51,9 @@ static constexpr FloatRegister kFpuCalleeSaves[] = { XMM12, XMM13, XMM14, XMM15 static constexpr int kC2ConditionMask = 0x400; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT -#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, x).Int32Value() +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
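[Aside on the Baker read barrier fast path above: with white/gray/black pinned to 0/1/2 by the static_asserts, "is gray" is just the low bit of the read barrier state in the lock word, which is what the single SHR plus j(kCarrySet) tests. A portable sketch of the same predicate; the shift amount below is an assumed value for illustration:]

    #include <cstdint>

    constexpr uint32_t kAssumedReadBarrierStateShift = 28;  // illustration only

    // SHR by (shift + 1) pushes bit `shift` out last, so it ends up in the x86 carry
    // flag and j(kCarrySet) branches to the marking slow path. In C++ the equivalent
    // predicate is a plain bit test on the lock word:
    bool IsGray(uint32_t lock_word) {
      return ((lock_word >> kAssumedReadBarrierStateShift) & 1u) != 0;
    }
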
+#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT +#define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, x).Int32Value() class NullCheckSlowPathX86_64 : public SlowPathCode { public: @@ -149,13 +149,11 @@ class SuspendCheckSlowPathX86_64 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, instruction_->GetLocations()); x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, instruction_->GetLocations()); if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -194,14 +192,31 @@ class BoundsCheckSlowPathX86_64 : public SlowPathCode { // Live registers will be restored in the catch block if caught. SaveLiveRegisters(codegen, instruction_->GetLocations()); } + // Are we using an array length from memory? + HInstruction* array_length = instruction_->InputAt(1); + Location length_loc = locations->InAt(1); + InvokeRuntimeCallingConvention calling_convention; + if (array_length->IsArrayLength() && array_length->IsEmittedAtUseSite()) { + // Load the array length into our temporary. + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<CpuRegister>(), len_offset); + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(1)); + // Check for conflicts with index. + if (length_loc.Equals(locations->InAt(0))) { + // We know we aren't using parameter 2. + length_loc = Location::RegisterLocation(calling_convention.GetRegisterAt(2)); + } + __ movl(length_loc.AsRegister<CpuRegister>(), array_len); + } + // We're moving two locations to locations that could overlap, so we need a parallel // move resolver. - InvokeRuntimeCallingConvention calling_convention; codegen->EmitParallelMoves( locations->InAt(0), Location::RegisterLocation(calling_convention.GetRegisterAt(0)), Primitive::kPrimInt, - locations->InAt(1), + length_loc, Location::RegisterLocation(calling_convention.GetRegisterAt(1)), Primitive::kPrimInt); uint32_t entry_point_offset = instruction_->AsBoundsCheck()->IsStringCharAt() @@ -352,7 +367,7 @@ class TypeCheckSlowPathX86_64 : public SlowPathCode { dex_pc, this); CheckEntrypointTypes< - kQuickInstanceofNonTrivial, uint32_t, const mirror::Class*, const mirror::Class*>(); + kQuickInstanceofNonTrivial, size_t, const mirror::Class*, const mirror::Class*>(); } else { DCHECK(instruction_->IsCheckCast()); x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pCheckCast), @@ -451,8 +466,8 @@ class ArraySetSlowPathX86_64 : public SlowPathCode { // Slow path marking an object during a read barrier. 
class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { public: - ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location out, Location obj) - : SlowPathCode(instruction), out_(out), obj_(obj) { + ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj) + : SlowPathCode(instruction), obj_(obj) { DCHECK(kEmitCompilerReadBarrier); } @@ -460,9 +475,9 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); - Register reg_out = out_.AsRegister<Register>(); + Register reg = obj_.AsRegister<Register>(); DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg)); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || @@ -470,30 +485,40 @@ class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode { instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); - - InvokeRuntimeCallingConvention calling_convention; + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); - x86_64_codegen->Move(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); - x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), - instruction_, - instruction_->GetDexPc(), - this); - CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); - x86_64_codegen->Move(out_, Location::RegisterLocation(RAX)); - - RestoreLiveRegisters(codegen, locations); + DCHECK_NE(reg, RSP); + DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // RDI <- obj + // RAX <- ReadBarrierMark(RDI) + // obj <- RAX + // + // we just use rX (the register holding `obj`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(reg); + // This runtime call does not require a stack map. 
+ x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); __ jmp(GetExitLabel()); } private: - const Location out_; const Location obj_; DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86_64); @@ -539,8 +564,7 @@ class ReadBarrierForHeapReferenceSlowPathX86_64 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) && - instruction_->GetLocations()->Intrinsified())) + (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -725,8 +749,8 @@ class ReadBarrierForRootSlowPathX86_64 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT inline Condition X86_64IntegerCondition(IfCondition cond) { switch (cond) { @@ -858,7 +882,7 @@ void CodeGeneratorX86_64::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invo // (callee_method + offset_of_quick_compiled_code)() __ call(Address(callee_method.AsRegister<CpuRegister>(), ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86_64WordSize).SizeValue())); + kX86_64PointerSize).SizeValue())); break; } @@ -893,7 +917,7 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t __ movq(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); __ call(Address(temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset( - kX86_64WordSize).SizeValue())); + kX86_64PointerSize).SizeValue())); } void CodeGeneratorX86_64::RecordSimplePatch() { @@ -1006,7 +1030,7 @@ void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint, HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path) { - InvokeRuntime(GetThreadOffset<kX86_64WordSize>(entrypoint).Int32Value(), + InvokeRuntime(GetThreadOffset<kX86_64PointerSize>(entrypoint).Int32Value(), instruction, dex_pc, slow_path); @@ -1021,6 +1045,13 @@ void CodeGeneratorX86_64::InvokeRuntime(int32_t entry_point_offset, RecordPcInfo(instruction, dex_pc, slow_path); } +void CodeGeneratorX86_64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + __ gs()->call(Address::Absolute(entry_point_offset, /* no_rip */ true)); +} + static constexpr int kNumberOfCpuRegisterPairs = 0; // Use a fake return address register to mimic Quick. 
static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1); @@ -2257,8 +2288,6 @@ void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invo LocationSummary* locations = invoke->GetLocations(); CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); CpuRegister hidden_reg = locations->GetTemp(1).AsRegister<CpuRegister>(); - uint32_t method_offset = mirror::Class::EmbeddedImTableEntryOffset( - invoke->GetImtIndex() % mirror::Class::kImtSize, kX86_64PointerSize).Uint32Value(); Location receiver = locations->InAt(0); size_t class_offset = mirror::Object::ClassOffset().SizeValue(); @@ -2284,11 +2313,17 @@ void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invo // intact/accessible until the end of the marking phase (the // concurrent copying collector may not in the future). __ MaybeUnpoisonHeapReference(temp); + // temp = temp->GetAddressOfIMT() + __ movq(temp, + Address(temp, mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value())); + // temp = temp->GetImtEntryAt(method_offset); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + invoke->GetImtIndex(), kX86_64PointerSize)); // temp = temp->GetImtEntryAt(method_offset); __ movq(temp, Address(temp, method_offset)); // call temp->GetEntryPoint(); - __ call(Address(temp, - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64WordSize).SizeValue())); + __ call(Address( + temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize).SizeValue())); DCHECK(!codegen_->IsLeafMethod()); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -3909,7 +3944,7 @@ void InstructionCodeGeneratorX86_64::VisitUShr(HUShr* ushr) { void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; if (instruction->IsStringAlloc()) { locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); @@ -3926,7 +3961,7 @@ void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. 
CpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<CpuRegister>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64WordSize); + MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize); __ gs()->movq(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString), /* no_rip */ true)); __ call(Address(temp, code_offset.SizeValue())); codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); @@ -3942,7 +3977,7 @@ void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) void LocationsBuilderX86_64::VisitNewArray(HNewArray* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); @@ -4002,16 +4037,20 @@ void LocationsBuilderX86_64::VisitClassTableGet(HClassTableGet* instruction) { void InstructionCodeGeneratorX86_64::VisitClassTableGet(HClassTableGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - uint32_t method_offset = 0; if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { - method_offset = mirror::Class::EmbeddedVTableEntryOffset( + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( instruction->GetIndex(), kX86_64PointerSize).SizeValue(); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->InAt(0).AsRegister<CpuRegister>(), method_offset)); } else { - method_offset = mirror::Class::EmbeddedImTableEntryOffset( - instruction->GetIndex() % mirror::Class::kImtSize, kX86_64PointerSize).Uint32Value(); + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kX86_64PointerSize)); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->InAt(0).AsRegister<CpuRegister>(), + mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value())); + __ movq(locations->Out().AsRegister<CpuRegister>(), + Address(locations->Out().AsRegister<CpuRegister>(), method_offset)); } - __ movq(locations->Out().AsRegister<CpuRegister>(), - Address(locations->InAt(0).AsRegister<CpuRegister>(), method_offset)); } void LocationsBuilderX86_64::VisitNot(HNot* not_) { @@ -4980,10 +5019,16 @@ void LocationsBuilderX86_64::VisitArrayLength(HArrayLength* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + if (!instruction->IsEmittedAtUseSite()) { + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + } } void InstructionCodeGeneratorX86_64::VisitArrayLength(HArrayLength* instruction) { + if (instruction->IsEmittedAtUseSite()) { + return; + } + LocationSummary* locations = instruction->GetLocations(); uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction); CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); @@ -4998,7 +5043,10 @@ void LocationsBuilderX86_64::VisitBoundsCheck(HBoundsCheck* instruction) { : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, 
Location::RegisterOrConstant(instruction->InputAt(0))); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + HInstruction* length = instruction->InputAt(1); + if (!length->IsEmittedAtUseSite()) { + locations->SetInAt(1, Location::RegisterOrConstant(length)); + } if (instruction->HasUses()) { locations->SetOut(Location::SameAsFirstInput()); } @@ -5008,8 +5056,7 @@ void InstructionCodeGeneratorX86_64::VisitBoundsCheck(HBoundsCheck* instruction) LocationSummary* locations = instruction->GetLocations(); Location index_loc = locations->InAt(0); Location length_loc = locations->InAt(1); - SlowPathCode* slow_path = - new (GetGraph()->GetArena()) BoundsCheckSlowPathX86_64(instruction); + SlowPathCode* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathX86_64(instruction); if (length_loc.IsConstant()) { int32_t length = CodeGenerator::GetInt32ValueOf(length_loc.GetConstant()); @@ -5032,12 +5079,28 @@ void InstructionCodeGeneratorX86_64::VisitBoundsCheck(HBoundsCheck* instruction) codegen_->AddSlowPath(slow_path); __ j(kAboveEqual, slow_path->GetEntryLabel()); } else { - CpuRegister length = length_loc.AsRegister<CpuRegister>(); - if (index_loc.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); - __ cmpl(length, Immediate(value)); + HInstruction* array_length = instruction->InputAt(1); + if (array_length->IsEmittedAtUseSite()) { + // Address the length field in the array. + DCHECK(array_length->IsArrayLength()); + uint32_t len_offset = CodeGenerator::GetArrayLengthOffset(array_length->AsArrayLength()); + Location array_loc = array_length->GetLocations()->InAt(0); + Address array_len(array_loc.AsRegister<CpuRegister>(), len_offset); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(array_len, Immediate(value)); + } else { + __ cmpl(array_len, index_loc.AsRegister<CpuRegister>()); + } + codegen_->MaybeRecordImplicitNullCheck(array_length); } else { - __ cmpl(length, index_loc.AsRegister<CpuRegister>()); + CpuRegister length = length_loc.AsRegister<CpuRegister>(); + if (index_loc.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(index_loc.GetConstant()); + __ cmpl(length, Immediate(value)); + } else { + __ cmpl(length, index_loc.AsRegister<CpuRegister>()); + } } codegen_->AddSlowPath(slow_path); __ j(kBelowEqual, slow_path->GetEntryLabel()); @@ -5054,7 +5117,7 @@ void CodeGeneratorX86_64::MarkGCCard(CpuRegister temp, __ testl(value, value); __ j(kEqual, &is_null); } - __ gs()->movq(card, Address::Absolute(Thread::CardTableOffset<kX86_64WordSize>().Int32Value(), + __ gs()->movq(card, Address::Absolute(Thread::CardTableOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true)); __ movq(temp, object); __ shrq(temp, Immediate(gc::accounting::CardTable::kCardShift)); @@ -5106,7 +5169,7 @@ void InstructionCodeGeneratorX86_64::GenerateSuspendCheck(HSuspendCheck* instruc DCHECK_EQ(slow_path->GetSuccessor(), successor); } - __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64WordSize>().Int32Value(), + __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true), Immediate(0)); if (successor == nullptr) { @@ -5573,57 +5636,19 @@ void InstructionCodeGeneratorX86_64::VisitLoadString(HLoadString* load) { codegen_->RecordSimplePatch(); return; // No dex cache slow path. 
} - case HLoadString::LoadKind::kDexCacheAddress: { - DCHECK_NE(load->GetAddress(), 0u); - // /* GcRoot<mirror::String> */ out = *address - if (IsUint<32>(load->GetAddress())) { - Address address = Address::Absolute(load->GetAddress(), /* no_rip */ true); - GenerateGcRootFieldLoad(load, out_loc, address); - } else { - // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address). - __ movq(out, Immediate(load->GetAddress())); - GenerateGcRootFieldLoad(load, out_loc, Address(out, 0)); - } - break; - } - case HLoadString::LoadKind::kDexCachePcRelative: { - uint32_t offset = load->GetDexCacheElementOffset(); - Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset); - Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset, - /* no_rip */ false); - // /* GcRoot<mirror::String> */ out = *address /* PC-relative */ - GenerateGcRootFieldLoad(load, out_loc, address, fixup_label); - break; - } - case HLoadString::LoadKind::kDexCacheViaMethod: { - CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>(); - - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - GenerateGcRootFieldLoad( - load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value())); - // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ - __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value())); - // /* GcRoot<mirror::String> */ out = out[string_index] - GenerateGcRootFieldLoad( - load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex()))); - break; - } default: - LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind(); - UNREACHABLE(); + break; } - if (!load->IsInDexCache()) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load); - codegen_->AddSlowPath(slow_path); - __ testl(out, out); - __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); - } + // TODO: Re-add the compiler code to do string dex cache lookup again. 
+ SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load); + codegen_->AddSlowPath(slow_path); + __ jmp(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); } static Address GetExceptionTlsAddress() { - return Address::Absolute(Thread::ExceptionOffset<kX86_64WordSize>().Int32Value(), + return Address::Absolute(Thread::ExceptionOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true); } @@ -5647,7 +5672,7 @@ void InstructionCodeGeneratorX86_64::VisitClearException(HClearException* clear void LocationsBuilderX86_64::VisitThrow(HThrow* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6157,7 +6182,7 @@ void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) { void LocationsBuilderX86_64::VisitMonitorOperation(HMonitorOperation* instruction) { LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } @@ -6378,10 +6403,10 @@ void InstructionCodeGeneratorX86_64::GenerateGcRootFieldLoad(HInstruction* instr // Slow path used to mark the GC root `root`. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root); codegen_->AddSlowPath(slow_path); - __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64WordSize>().Int32Value(), + __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true), Immediate(0)); __ j(kNotEqual, slow_path->GetEntryLabel()); @@ -6489,12 +6514,6 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. // Note that this is a no-op, thanks to the x86-64 memory model. @@ -6509,13 +6528,18 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // Slow path used to mark the object `ref` when it is gray. SlowPathCode* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref, ref); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref); AddSlowPath(slow_path); // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. 
We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index d7cfd37c33..4e0e34ce38 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -19,7 +19,6 @@ #include "arch/x86_64/instruction_set_features_x86_64.h" #include "code_generator.h" -#include "dex/compiler_enums.h" #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" @@ -29,7 +28,7 @@ namespace art { namespace x86_64 { // Use a local definition to prevent copying mistakes. -static constexpr size_t kX86_64WordSize = kX86_64PointerSize; +static constexpr size_t kX86_64WordSize = static_cast<size_t>(kX86_64PointerSize); // Some x86_64 instructions require a register to be available as temp. static constexpr Register TMP = R11; @@ -318,6 +317,12 @@ class CodeGeneratorX86_64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path); + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + size_t GetWordSize() const OVERRIDE { return kX86_64WordSize; } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 6be79fa75c..fe6c0a305e 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -29,12 +29,6 @@ #include "arch/x86_64/instruction_set_features_x86_64.h" #include "base/macros.h" #include "builder.h" -#include "code_generator_arm.h" -#include "code_generator_arm64.h" -#include "code_generator_mips.h" -#include "code_generator_mips64.h" -#include "code_generator_x86.h" -#include "code_generator_x86_64.h" #include "code_simulator_container.h" #include "common_compiler_test.h" #include "dex_file.h" @@ -44,7 +38,7 @@ #include "nodes.h" #include "optimizing_unit_test.h" #include "prepare_for_register_allocation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "utils.h" #include "utils/arm/managed_register_arm.h" @@ -52,10 +46,35 @@ #include "utils/mips64/managed_register_mips64.h" #include "utils/x86/managed_register_x86.h" +#ifdef ART_ENABLE_CODEGEN_arm +#include "code_generator_arm.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_arm64 +#include "code_generator_arm64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86 +#include "code_generator_x86.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_x86_64 +#include "code_generator_x86_64.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips +#include "code_generator_mips.h" +#endif + +#ifdef ART_ENABLE_CODEGEN_mips64 +#include "code_generator_mips64.h" +#endif + #include "gtest/gtest.h" namespace art { +#ifdef ART_ENABLE_CODEGEN_arm // Provide our own codegen, that ensures the C calling conventions // are preserved. Currently, ART and C do not match as R4 is caller-save // in ART, and callee-save in C. 
Alternatively, we could use or write @@ -80,7 +99,9 @@ class TestCodeGeneratorARM : public arm::CodeGeneratorARM { blocked_register_pairs_[arm::R6_R7] = false; } }; +#endif +#ifdef ART_ENABLE_CODEGEN_x86 class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { public: TestCodeGeneratorX86(HGraph* graph, @@ -105,6 +126,7 @@ class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 { blocked_register_pairs_[x86::ECX_EDI] = false; } }; +#endif class InternalCodeAllocator : public CodeAllocator { public: @@ -219,7 +241,7 @@ static void RunCode(CodeGenerator* codegen, PrepareForRegisterAllocation(graph).Run(); liveness.Analyze(); - RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters(); + RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters(); hook_before_codegen(graph); InternalCodeAllocator allocator; @@ -234,37 +256,54 @@ static void RunCode(InstructionSet target_isa, bool has_result, Expected expected) { CompilerOptions compiler_options; +#ifdef ART_ENABLE_CODEGEN_arm if (target_isa == kArm || target_isa == kThumb2) { std::unique_ptr<const ArmInstructionSetFeatures> features_arm( ArmInstructionSetFeatures::FromCppDefines()); TestCodeGeneratorARM codegenARM(graph, *features_arm.get(), compiler_options); RunCode(&codegenARM, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kArm64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 + if (target_isa == kArm64) { std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64( Arm64InstructionSetFeatures::FromCppDefines()); arm64::CodeGeneratorARM64 codegenARM64(graph, *features_arm64.get(), compiler_options); RunCode(&codegenARM64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86 + if (target_isa == kX86) { std::unique_ptr<const X86InstructionSetFeatures> features_x86( X86InstructionSetFeatures::FromCppDefines()); - x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); + TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); RunCode(&codegenX86, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kX86_64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 + if (target_isa == kX86_64) { std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( X86_64InstructionSetFeatures::FromCppDefines()); x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options); RunCode(&codegenX86_64, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips + if (target_isa == kMips) { std::unique_ptr<const MipsInstructionSetFeatures> features_mips( MipsInstructionSetFeatures::FromCppDefines()); mips::CodeGeneratorMIPS codegenMIPS(graph, *features_mips.get(), compiler_options); RunCode(&codegenMIPS, graph, hook_before_codegen, has_result, expected); - } else if (target_isa == kMips64) { + } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 + if (target_isa == kMips64) { std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64( Mips64InstructionSetFeatures::FromCppDefines()); mips64::CodeGeneratorMIPS64 codegenMIPS64(graph, *features_mips64.get(), compiler_options); RunCode(&codegenMIPS64, graph, hook_before_codegen, has_result, expected); } +#endif } static ::std::vector<InstructionSet> GetTargetISAs() { diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index a849448cf9..cc949c5275 100644 
--- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -21,8 +21,14 @@ #include "locations.h" #include "nodes.h" #include "utils/arm64/assembler_arm64.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" + +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#include "aarch64/simulator-aarch64.h" +#pragma GCC diagnostic pop namespace art { namespace arm64 { @@ -34,87 +40,88 @@ static_assert((SP == 31) && (WSP == 31) && (XZR == 32) && (WZR == 32), static inline int VIXLRegCodeFromART(int code) { if (code == SP) { - return vixl::kSPRegInternalCode; + return vixl::aarch64::kSPRegInternalCode; } if (code == XZR) { - return vixl::kZeroRegCode; + return vixl::aarch64::kZeroRegCode; } return code; } static inline int ARTRegCodeFromVIXL(int code) { - if (code == vixl::kSPRegInternalCode) { + if (code == vixl::aarch64::kSPRegInternalCode) { return SP; } - if (code == vixl::kZeroRegCode) { + if (code == vixl::aarch64::kZeroRegCode) { return XZR; } return code; } -static inline vixl::Register XRegisterFrom(Location location) { +static inline vixl::aarch64::Register XRegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; - return vixl::Register::XRegFromCode(VIXLRegCodeFromART(location.reg())); + return vixl::aarch64::Register::GetXRegFromCode(VIXLRegCodeFromART(location.reg())); } -static inline vixl::Register WRegisterFrom(Location location) { +static inline vixl::aarch64::Register WRegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; - return vixl::Register::WRegFromCode(VIXLRegCodeFromART(location.reg())); + return vixl::aarch64::Register::GetWRegFromCode(VIXLRegCodeFromART(location.reg())); } -static inline vixl::Register RegisterFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::Register RegisterFrom(Location location, Primitive::Type type) { DCHECK(type != Primitive::kPrimVoid && !Primitive::IsFloatingPointType(type)) << type; return type == Primitive::kPrimLong ? 
XRegisterFrom(location) : WRegisterFrom(location); } -static inline vixl::Register OutputRegister(HInstruction* instr) { +static inline vixl::aarch64::Register OutputRegister(HInstruction* instr) { return RegisterFrom(instr->GetLocations()->Out(), instr->GetType()); } -static inline vixl::Register InputRegisterAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::Register InputRegisterAt(HInstruction* instr, int input_index) { return RegisterFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::FPRegister DRegisterFrom(Location location) { +static inline vixl::aarch64::FPRegister DRegisterFrom(Location location) { DCHECK(location.IsFpuRegister()) << location; - return vixl::FPRegister::DRegFromCode(location.reg()); + return vixl::aarch64::FPRegister::GetDRegFromCode(location.reg()); } -static inline vixl::FPRegister SRegisterFrom(Location location) { +static inline vixl::aarch64::FPRegister SRegisterFrom(Location location) { DCHECK(location.IsFpuRegister()) << location; - return vixl::FPRegister::SRegFromCode(location.reg()); + return vixl::aarch64::FPRegister::GetSRegFromCode(location.reg()); } -static inline vixl::FPRegister FPRegisterFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::FPRegister FPRegisterFrom(Location location, Primitive::Type type) { DCHECK(Primitive::IsFloatingPointType(type)) << type; return type == Primitive::kPrimDouble ? DRegisterFrom(location) : SRegisterFrom(location); } -static inline vixl::FPRegister OutputFPRegister(HInstruction* instr) { +static inline vixl::aarch64::FPRegister OutputFPRegister(HInstruction* instr) { return FPRegisterFrom(instr->GetLocations()->Out(), instr->GetType()); } -static inline vixl::FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::FPRegister InputFPRegisterAt(HInstruction* instr, int input_index) { return FPRegisterFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::CPURegister CPURegisterFrom(Location location, Primitive::Type type) { - return Primitive::IsFloatingPointType(type) ? vixl::CPURegister(FPRegisterFrom(location, type)) - : vixl::CPURegister(RegisterFrom(location, type)); +static inline vixl::aarch64::CPURegister CPURegisterFrom(Location location, Primitive::Type type) { + return Primitive::IsFloatingPointType(type) + ? vixl::aarch64::CPURegister(FPRegisterFrom(location, type)) + : vixl::aarch64::CPURegister(RegisterFrom(location, type)); } -static inline vixl::CPURegister OutputCPURegister(HInstruction* instr) { +static inline vixl::aarch64::CPURegister OutputCPURegister(HInstruction* instr) { return Primitive::IsFloatingPointType(instr->GetType()) - ? static_cast<vixl::CPURegister>(OutputFPRegister(instr)) - : static_cast<vixl::CPURegister>(OutputRegister(instr)); + ? static_cast<vixl::aarch64::CPURegister>(OutputFPRegister(instr)) + : static_cast<vixl::aarch64::CPURegister>(OutputRegister(instr)); } -static inline vixl::CPURegister InputCPURegisterAt(HInstruction* instr, int index) { +static inline vixl::aarch64::CPURegister InputCPURegisterAt(HInstruction* instr, int index) { return Primitive::IsFloatingPointType(instr->InputAt(index)->GetType()) - ? static_cast<vixl::CPURegister>(InputFPRegisterAt(instr, index)) - : static_cast<vixl::CPURegister>(InputRegisterAt(instr, index)); + ? 
static_cast<vixl::aarch64::CPURegister>(InputFPRegisterAt(instr, index)) + : static_cast<vixl::aarch64::CPURegister>(InputRegisterAt(instr, index)); } static inline int64_t Int64ConstantFrom(Location location) { @@ -129,63 +136,70 @@ static inline int64_t Int64ConstantFrom(Location location) { } } -static inline vixl::Operand OperandFrom(Location location, Primitive::Type type) { +static inline vixl::aarch64::Operand OperandFrom(Location location, Primitive::Type type) { if (location.IsRegister()) { - return vixl::Operand(RegisterFrom(location, type)); + return vixl::aarch64::Operand(RegisterFrom(location, type)); } else { - return vixl::Operand(Int64ConstantFrom(location)); + return vixl::aarch64::Operand(Int64ConstantFrom(location)); } } -static inline vixl::Operand InputOperandAt(HInstruction* instr, int input_index) { +static inline vixl::aarch64::Operand InputOperandAt(HInstruction* instr, int input_index) { return OperandFrom(instr->GetLocations()->InAt(input_index), instr->InputAt(input_index)->GetType()); } -static inline vixl::MemOperand StackOperandFrom(Location location) { - return vixl::MemOperand(vixl::sp, location.GetStackIndex()); +static inline vixl::aarch64::MemOperand StackOperandFrom(Location location) { + return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex()); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, size_t offset = 0) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + size_t offset = 0) { // A heap reference must be 32bit, so fit in a W register. DCHECK(base.IsW()); - return vixl::MemOperand(base.X(), offset); + return vixl::aarch64::MemOperand(base.X(), offset); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, - const vixl::Register& regoffset, - vixl::Shift shift = vixl::LSL, - unsigned shift_amount = 0) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + const vixl::aarch64::Register& regoffset, + vixl::aarch64::Shift shift = vixl::aarch64::LSL, + unsigned shift_amount = 0) { // A heap reference must be 32bit, so fit in a W register. 
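Editorial note on the HeapOperand helpers above (illustrative usage, not part of this patch): heap references are 32-bit, so callers pass a W register and the helper widens it to X for the actual address computation.

// Hypothetical caller: `w0` holds a heap reference.
vixl::aarch64::MemOperand field = HeapOperand(vixl::aarch64::w0, /* offset= */ 8);
// Addresses [x0, #8], since the helper returns MemOperand(base.X(), offset).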
DCHECK(base.IsW()); - return vixl::MemOperand(base.X(), regoffset, shift, shift_amount); + return vixl::aarch64::MemOperand(base.X(), regoffset, shift, shift_amount); } -static inline vixl::MemOperand HeapOperand(const vixl::Register& base, Offset offset) { +static inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, + Offset offset) { return HeapOperand(base, offset.SizeValue()); } -static inline vixl::MemOperand HeapOperandFrom(Location location, Offset offset) { +static inline vixl::aarch64::MemOperand HeapOperandFrom(Location location, Offset offset) { return HeapOperand(RegisterFrom(location, Primitive::kPrimNot), offset); } -static inline Location LocationFrom(const vixl::Register& reg) { - return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.code())); +static inline Location LocationFrom(const vixl::aarch64::Register& reg) { + return Location::RegisterLocation(ARTRegCodeFromVIXL(reg.GetCode())); } -static inline Location LocationFrom(const vixl::FPRegister& fpreg) { - return Location::FpuRegisterLocation(fpreg.code()); +static inline Location LocationFrom(const vixl::aarch64::FPRegister& fpreg) { + return Location::FpuRegisterLocation(fpreg.GetCode()); } -static inline vixl::Operand OperandFromMemOperand(const vixl::MemOperand& mem_op) { +static inline vixl::aarch64::Operand OperandFromMemOperand( + const vixl::aarch64::MemOperand& mem_op) { if (mem_op.IsImmediateOffset()) { - return vixl::Operand(mem_op.offset()); + return vixl::aarch64::Operand(mem_op.GetOffset()); } else { DCHECK(mem_op.IsRegisterOffset()); - if (mem_op.extend() != vixl::NO_EXTEND) { - return vixl::Operand(mem_op.regoffset(), mem_op.extend(), mem_op.shift_amount()); - } else if (mem_op.shift() != vixl::NO_SHIFT) { - return vixl::Operand(mem_op.regoffset(), mem_op.shift(), mem_op.shift_amount()); + if (mem_op.GetExtend() != vixl::aarch64::NO_EXTEND) { + return vixl::aarch64::Operand(mem_op.GetRegisterOffset(), + mem_op.GetExtend(), + mem_op.GetShiftAmount()); + } else if (mem_op.GetShift() != vixl::aarch64::NO_SHIFT) { + return vixl::aarch64::Operand(mem_op.GetRegisterOffset(), + mem_op.GetShift(), + mem_op.GetShiftAmount()); } else { LOG(FATAL) << "Should not reach here"; UNREACHABLE(); @@ -212,13 +226,13 @@ static bool CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* inst if (instr->IsAnd() || instr->IsOr() || instr->IsXor()) { // Uses logical operations. - return vixl::Assembler::IsImmLogical(value, vixl::kXRegSize); + return vixl::aarch64::Assembler::IsImmLogical(value, vixl::aarch64::kXRegSize); } else if (instr->IsNeg()) { // Uses mov -immediate. - return vixl::Assembler::IsImmMovn(value, vixl::kXRegSize); + return vixl::aarch64::Assembler::IsImmMovn(value, vixl::aarch64::kXRegSize); } else { DCHECK(instr->IsAdd() || - instr->IsArm64IntermediateAddress() || + instr->IsIntermediateAddress() || instr->IsBoundsCheck() || instr->IsCompare() || instr->IsCondition() || @@ -227,7 +241,8 @@ static bool CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* inst // Uses aliases of ADD/SUB instructions. // If `value` does not fit but `-value` does, VIXL will automatically use // the 'opposite' instruction. 
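As an editorial sketch of why both `value` and `-value` are tested below (the real check is VIXL's IsImmAddSub): an AArch64 add/sub immediate is a 12-bit unsigned field, optionally shifted left by 12, and an ADD whose negative constant does not fit can instead be emitted as a SUB of the negated constant.

#include <cstdint>

// Standalone model of the add/sub immediate range: imm12, optionally LSL #12.
static bool IsAddSubImmediateSketch(uint64_t imm) {
  return (imm >> 12) == 0 || ((imm & 0xfffu) == 0 && (imm >> 24) == 0);
}
// For example `add x0, x1, #-4` can be materialized as `sub x0, x1, #4`, so
// accepting either sign of the constant is enough.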
- return vixl::Assembler::IsImmAddSub(value) || vixl::Assembler::IsImmAddSub(-value); + return vixl::aarch64::Assembler::IsImmAddSub(value) + || vixl::aarch64::Assembler::IsImmAddSub(-value); } } @@ -263,30 +278,30 @@ static inline bool ArtVixlRegCodeCoherentForRegSet(uint32_t art_core_registers, return true; } -static inline vixl::Shift ShiftFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { +static inline vixl::aarch64::Shift ShiftFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { switch (op_kind) { - case HArm64DataProcWithShifterOp::kASR: return vixl::ASR; - case HArm64DataProcWithShifterOp::kLSL: return vixl::LSL; - case HArm64DataProcWithShifterOp::kLSR: return vixl::LSR; + case HArm64DataProcWithShifterOp::kASR: return vixl::aarch64::ASR; + case HArm64DataProcWithShifterOp::kLSL: return vixl::aarch64::LSL; + case HArm64DataProcWithShifterOp::kLSR: return vixl::aarch64::LSR; default: LOG(FATAL) << "Unexpected op kind " << op_kind; UNREACHABLE(); - return vixl::NO_SHIFT; + return vixl::aarch64::NO_SHIFT; } } -static inline vixl::Extend ExtendFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { +static inline vixl::aarch64::Extend ExtendFromOpKind(HArm64DataProcWithShifterOp::OpKind op_kind) { switch (op_kind) { - case HArm64DataProcWithShifterOp::kUXTB: return vixl::UXTB; - case HArm64DataProcWithShifterOp::kUXTH: return vixl::UXTH; - case HArm64DataProcWithShifterOp::kUXTW: return vixl::UXTW; - case HArm64DataProcWithShifterOp::kSXTB: return vixl::SXTB; - case HArm64DataProcWithShifterOp::kSXTH: return vixl::SXTH; - case HArm64DataProcWithShifterOp::kSXTW: return vixl::SXTW; + case HArm64DataProcWithShifterOp::kUXTB: return vixl::aarch64::UXTB; + case HArm64DataProcWithShifterOp::kUXTH: return vixl::aarch64::UXTH; + case HArm64DataProcWithShifterOp::kUXTW: return vixl::aarch64::UXTW; + case HArm64DataProcWithShifterOp::kSXTB: return vixl::aarch64::SXTB; + case HArm64DataProcWithShifterOp::kSXTH: return vixl::aarch64::SXTH; + case HArm64DataProcWithShifterOp::kSXTW: return vixl::aarch64::SXTW; default: LOG(FATAL) << "Unexpected op kind " << op_kind; UNREACHABLE(); - return vixl::NO_EXTEND; + return vixl::aarch64::NO_EXTEND; } } diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index 49cfff46d8..e1bde7c737 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ b/compiler/optimizing/dead_code_elimination.cc @@ -88,13 +88,207 @@ void HDeadCodeElimination::MaybeRecordDeadBlock(HBasicBlock* block) { } } -void HDeadCodeElimination::RemoveDeadBlocks() { - if (graph_->HasIrreducibleLoops()) { - // Do not eliminate dead blocks if the graph has irreducible loops. We could - // support it, but that would require changes in our loop representation to handle - // multiple entry points. We decided it was not worth the complexity. 
- return; +void HDeadCodeElimination::MaybeRecordSimplifyIf() { + if (stats_ != nullptr) { + stats_->RecordStat(MethodCompilationStat::kSimplifyIf); + } +} + +static bool HasInput(HCondition* instruction, HInstruction* input) { + return (instruction->InputAt(0) == input) || + (instruction->InputAt(1) == input); +} + +static bool HasEquality(IfCondition condition) { + switch (condition) { + case kCondEQ: + case kCondLE: + case kCondGE: + case kCondBE: + case kCondAE: + return true; + case kCondNE: + case kCondLT: + case kCondGT: + case kCondB: + case kCondA: + return false; + } +} + +static HConstant* Evaluate(HCondition* condition, HInstruction* left, HInstruction* right) { + if (left == right && !Primitive::IsFloatingPointType(left->GetType())) { + return condition->GetBlock()->GetGraph()->GetIntConstant( + HasEquality(condition->GetCondition()) ? 1 : 0); + } + + if (!left->IsConstant() || !right->IsConstant()) { + return nullptr; + } + + if (left->IsIntConstant()) { + return condition->Evaluate(left->AsIntConstant(), right->AsIntConstant()); + } else if (left->IsNullConstant()) { + return condition->Evaluate(left->AsNullConstant(), right->AsNullConstant()); + } else if (left->IsLongConstant()) { + return condition->Evaluate(left->AsLongConstant(), right->AsLongConstant()); + } else if (left->IsFloatConstant()) { + return condition->Evaluate(left->AsFloatConstant(), right->AsFloatConstant()); + } else { + DCHECK(left->IsDoubleConstant()); + return condition->Evaluate(left->AsDoubleConstant(), right->AsDoubleConstant()); + } +} + +// Simplify the pattern: +// +// B1 B2 ... +// goto goto goto +// \ | / +// \ | / +// B3 +// i1 = phi(input, input) +// (i2 = condition on i1) +// if i1 (or i2) +// / \ +// / \ +// B4 B5 +// +// Into: +// +// B1 B2 ... +// | | | +// B4 B5 B? +// +// This simplification cannot be applied for loop headers, as they +// contain a suspend check. +// +// Note that we rely on the dead code elimination to get rid of B3. +bool HDeadCodeElimination::SimplifyIfs() { + bool simplified_one_or_more_ifs = false; + bool rerun_dominance_and_loop_analysis = false; + + for (HReversePostOrderIterator it(*graph_); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + HInstruction* last = block->GetLastInstruction(); + HInstruction* first = block->GetFirstInstruction(); + if (last->IsIf() && + block->HasSinglePhi() && + block->GetFirstPhi()->HasOnlyOneNonEnvironmentUse()) { + bool has_only_phi_and_if = (last == first) && (last->InputAt(0) == block->GetFirstPhi()); + bool has_only_phi_condition_and_if = + !has_only_phi_and_if && + first->IsCondition() && + HasInput(first->AsCondition(), block->GetFirstPhi()) && + (first->GetNext() == last) && + (last->InputAt(0) == first) && + first->HasOnlyOneNonEnvironmentUse(); + + if (has_only_phi_and_if || has_only_phi_condition_and_if) { + DCHECK(!block->IsLoopHeader()); + HPhi* phi = block->GetFirstPhi()->AsPhi(); + bool phi_input_is_left = (first->InputAt(0) == phi); + + // Walk over all inputs of the phis and update the control flow of + // predecessors feeding constants to the phi. + // Note that phi->InputCount() may change inside the loop. 
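The loop below folds each phi input through the `Evaluate` helper above. A quick standalone reminder (editorial, not part of the patch) of why that helper refuses to fold `x OP x` for floating-point inputs: NaN compares unequal to itself, so the identity used for integral values does not hold.

#include <cassert>
#include <cmath>

int main() {
  double nan = std::nan("");
  assert(!(nan == nan));  // x == x must not fold to true for floating point
  assert(nan != nan);     // and x != x is actually true here
  return 0;
}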
+ for (size_t i = 0; i < phi->InputCount();) { + HInstruction* input = phi->InputAt(i); + HInstruction* value_to_check = nullptr; + if (has_only_phi_and_if) { + if (input->IsIntConstant()) { + value_to_check = input; + } + } else { + DCHECK(has_only_phi_condition_and_if); + if (phi_input_is_left) { + value_to_check = Evaluate(first->AsCondition(), input, first->InputAt(1)); + } else { + value_to_check = Evaluate(first->AsCondition(), first->InputAt(0), input); + } + } + if (value_to_check == nullptr) { + // Could not evaluate to a constant, continue iterating over the inputs. + ++i; + } else { + HBasicBlock* predecessor_to_update = block->GetPredecessors()[i]; + HBasicBlock* successor_to_update = nullptr; + if (value_to_check->AsIntConstant()->IsTrue()) { + successor_to_update = last->AsIf()->IfTrueSuccessor(); + } else { + DCHECK(value_to_check->AsIntConstant()->IsFalse()) + << value_to_check->AsIntConstant()->GetValue(); + successor_to_update = last->AsIf()->IfFalseSuccessor(); + } + predecessor_to_update->ReplaceSuccessor(block, successor_to_update); + phi->RemoveInputAt(i); + simplified_one_or_more_ifs = true; + if (block->IsInLoop()) { + rerun_dominance_and_loop_analysis = true; + } + // For simplicity, don't create a dead block, let the dead code elimination + // pass deal with it. + if (phi->InputCount() == 1) { + break; + } + } + } + if (block->GetPredecessors().size() == 1) { + phi->ReplaceWith(phi->InputAt(0)); + block->RemovePhi(phi); + if (has_only_phi_condition_and_if) { + // Evaluate here (and not wait for a constant folding pass) to open + // more opportunities for DCE. + HInstruction* result = first->AsCondition()->TryStaticEvaluation(); + if (result != nullptr) { + first->ReplaceWith(result); + block->RemoveInstruction(first); + } + } + } + if (simplified_one_or_more_ifs) { + MaybeRecordSimplifyIf(); + } + } + } + } + // We need to re-analyze the graph in order to run DCE afterwards. + if (simplified_one_or_more_ifs) { + if (rerun_dominance_and_loop_analysis) { + graph_->ClearLoopInformation(); + graph_->ClearDominanceInformation(); + graph_->BuildDominatorTree(); + } else { + graph_->ClearDominanceInformation(); + // We have introduced critical edges, remove them. + graph_->SimplifyCFG(); + graph_->ComputeDominanceInformation(); + graph_->ComputeTryBlockInformation(); + } + } + + return simplified_one_or_more_ifs; +} + +void HDeadCodeElimination::ConnectSuccessiveBlocks() { + // Order does not matter. + for (HReversePostOrderIterator it(*graph_); !it.Done();) { + HBasicBlock* block = it.Current(); + if (block->IsEntryBlock() || !block->GetLastInstruction()->IsGoto()) { + it.Advance(); + continue; + } + HBasicBlock* successor = block->GetSingleSuccessor(); + if (successor->IsExitBlock() || successor->GetPredecessors().size() != 1u) { + it.Advance(); + continue; + } + block->MergeWith(successor); + // Reiterate on this block in case it can be merged with its new successor. } +} + +bool HDeadCodeElimination::RemoveDeadBlocks() { // Classify blocks as reachable/unreachable. ArenaAllocator* allocator = graph_->GetArena(); ArenaBitVector live_blocks(allocator, graph_->GetBlocks().size(), false, kArenaAllocDCE); @@ -132,23 +326,7 @@ void HDeadCodeElimination::RemoveDeadBlocks() { graph_->ComputeTryBlockInformation(); } } - - // Connect successive blocks created by dead branches. Order does not matter. 
- for (HReversePostOrderIterator it(*graph_); !it.Done();) { - HBasicBlock* block = it.Current(); - if (block->IsEntryBlock() || !block->GetLastInstruction()->IsGoto()) { - it.Advance(); - continue; - } - HBasicBlock* successor = block->GetSingleSuccessor(); - if (successor->IsExitBlock() || successor->GetPredecessors().size() != 1u) { - it.Advance(); - continue; - } - block->MergeWith(successor); - - // Reiterate on this block in case it can be merged with its new successor. - } + return removed_one_or_more_blocks; } void HDeadCodeElimination::RemoveDeadInstructions() { @@ -181,7 +359,20 @@ void HDeadCodeElimination::RemoveDeadInstructions() { } void HDeadCodeElimination::Run() { - RemoveDeadBlocks(); + // Do not eliminate dead blocks if the graph has irreducible loops. We could + // support it, but that would require changes in our loop representation to handle + // multiple entry points. We decided it was not worth the complexity. + if (!graph_->HasIrreducibleLoops()) { + // Simplify graph to generate more dead block patterns. + ConnectSuccessiveBlocks(); + bool did_any_simplification = false; + did_any_simplification |= SimplifyIfs(); + did_any_simplification |= RemoveDeadBlocks(); + if (did_any_simplification) { + // Connect successive blocks created by dead branches. + ConnectSuccessiveBlocks(); + } + } SsaRedundantPhiElimination(graph_).Run(); RemoveDeadInstructions(); } diff --git a/compiler/optimizing/dead_code_elimination.h b/compiler/optimizing/dead_code_elimination.h index 8d6008b845..58e700deba 100644 --- a/compiler/optimizing/dead_code_elimination.h +++ b/compiler/optimizing/dead_code_elimination.h @@ -31,18 +31,19 @@ class HDeadCodeElimination : public HOptimization { public: HDeadCodeElimination(HGraph* graph, OptimizingCompilerStats* stats = nullptr, - const char* name = kInitialDeadCodeEliminationPassName) + const char* name = kDeadCodeEliminationPassName) : HOptimization(graph, name, stats) {} void Run() OVERRIDE; - - static constexpr const char* kInitialDeadCodeEliminationPassName = "dead_code_elimination"; - static constexpr const char* kFinalDeadCodeEliminationPassName = "dead_code_elimination_final"; + static constexpr const char* kDeadCodeEliminationPassName = "dead_code_elimination"; private: void MaybeRecordDeadBlock(HBasicBlock* block); - void RemoveDeadBlocks(); + void MaybeRecordSimplifyIf(); + bool RemoveDeadBlocks(); void RemoveDeadInstructions(); + bool SimplifyIfs(); + void ConnectSuccessiveBlocks(); DISALLOW_COPY_AND_ASSIGN(HDeadCodeElimination); }; diff --git a/compiler/optimizing/dex_cache_array_fixups_arm.h b/compiler/optimizing/dex_cache_array_fixups_arm.h index 015f910328..9142e29eff 100644 --- a/compiler/optimizing/dex_cache_array_fixups_arm.h +++ b/compiler/optimizing/dex_cache_array_fixups_arm.h @@ -26,7 +26,9 @@ namespace arm { class DexCacheArrayFixups : public HOptimization { public: DexCacheArrayFixups(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "dex_cache_array_fixups_arm", stats) {} + : HOptimization(graph, kDexCacheArrayFixupsArmPassName, stats) {} + + static constexpr const char* kDexCacheArrayFixupsArmPassName = "dex_cache_array_fixups_arm"; void Run() OVERRIDE; }; diff --git a/compiler/optimizing/dex_cache_array_fixups_mips.cc b/compiler/optimizing/dex_cache_array_fixups_mips.cc index 0f42d9ce0f..19bab08eb4 100644 --- a/compiler/optimizing/dex_cache_array_fixups_mips.cc +++ b/compiler/optimizing/dex_cache_array_fixups_mips.cc @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "code_generator_mips.h" #include "dex_cache_array_fixups_mips.h" #include "base/arena_containers.h" @@ -27,8 +28,9 @@ namespace mips { */ class DexCacheArrayFixupsVisitor : public HGraphVisitor { public: - explicit DexCacheArrayFixupsVisitor(HGraph* graph) + explicit DexCacheArrayFixupsVisitor(HGraph* graph, CodeGenerator* codegen) : HGraphVisitor(graph), + codegen_(down_cast<CodeGeneratorMIPS*>(codegen)), dex_cache_array_bases_(std::less<const DexFile*>(), // Attribute memory use to code generator. graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {} @@ -41,9 +43,45 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor { HMipsDexCacheArraysBase* base = entry.second; base->MoveBeforeFirstUserAndOutOfLoops(); } + // Computing the dex cache base for PC-relative accesses will clobber RA with + // the NAL instruction on R2. Take a note of this before generating the method + // entry. + if (!dex_cache_array_bases_.empty() && !codegen_->GetInstructionSetFeatures().IsR6()) { + codegen_->ClobberRA(); + } } private: + void VisitLoadClass(HLoadClass* load_class) OVERRIDE { + // If this is a load with PC-relative access to the dex cache types array, + // we need to add the dex cache arrays base as the special input. + if (load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCachePcRelative) { + // Initialize base for target dex file if needed. + const DexFile& dex_file = load_class->GetDexFile(); + HMipsDexCacheArraysBase* base = GetOrCreateDexCacheArrayBase(dex_file); + // Update the element offset in base. + DexCacheArraysLayout layout(kMipsPointerSize, &dex_file); + base->UpdateElementOffset(layout.TypeOffset(load_class->GetTypeIndex())); + // Add the special argument base to the load. + load_class->AddSpecialInput(base); + } + } + + void VisitLoadString(HLoadString* load_string) OVERRIDE { + // If this is a load with PC-relative access to the dex cache strings array, + // we need to add the dex cache arrays base as the special input. + if (load_string->GetLoadKind() == HLoadString::LoadKind::kDexCachePcRelative) { + // Initialize base for target dex file if needed. + const DexFile& dex_file = load_string->GetDexFile(); + HMipsDexCacheArraysBase* base = GetOrCreateDexCacheArrayBase(dex_file); + // Update the element offset in base. + DexCacheArraysLayout layout(kMipsPointerSize, &dex_file); + base->UpdateElementOffset(layout.StringOffset(load_string->GetStringIndex())); + // Add the special argument base to the load. + load_string->AddSpecialInput(base); + } + } + void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE { // If this is an invoke with PC-relative access to the dex cache methods array, // we need to add the dex cache arrays base as the special input. @@ -74,6 +112,8 @@ class DexCacheArrayFixupsVisitor : public HGraphVisitor { }); } + CodeGeneratorMIPS* codegen_; + using DexCacheArraysBaseMap = ArenaSafeMap<const DexFile*, HMipsDexCacheArraysBase*, std::less<const DexFile*>>; DexCacheArraysBaseMap dex_cache_array_bases_; @@ -85,7 +125,7 @@ void DexCacheArrayFixups::Run() { // that can be live-in at the irreducible loop header. 
return; } - DexCacheArrayFixupsVisitor visitor(graph_); + DexCacheArrayFixupsVisitor visitor(graph_, codegen_); visitor.VisitInsertionOrder(); visitor.MoveBasesIfNeeded(); } diff --git a/compiler/optimizing/dex_cache_array_fixups_mips.h b/compiler/optimizing/dex_cache_array_fixups_mips.h index c8def2842e..861a199d6c 100644 --- a/compiler/optimizing/dex_cache_array_fixups_mips.h +++ b/compiler/optimizing/dex_cache_array_fixups_mips.h @@ -21,14 +21,23 @@ #include "optimization.h" namespace art { + +class CodeGenerator; + namespace mips { class DexCacheArrayFixups : public HOptimization { public: - DexCacheArrayFixups(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "dex_cache_array_fixups_mips", stats) {} + DexCacheArrayFixups(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kDexCacheArrayFixupsMipsPassName, stats), + codegen_(codegen) {} + + static constexpr const char* kDexCacheArrayFixupsMipsPassName = "dex_cache_array_fixups_mips"; void Run() OVERRIDE; + + private: + CodeGenerator* codegen_; }; } // namespace mips diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 9d67373321..b3d5341de0 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -31,7 +31,7 @@ #include "nodes.h" #include "optimization.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "utils/assembler.h" @@ -122,7 +122,10 @@ class HGraphVisualizerDisassembler { new DisassemblerOptions(/* absolute_addresses */ false, base_address, end_address, - /* can_read_literals */ true))); + /* can_read_literals */ true, + Is64BitInstructionSet(instruction_set) + ? 
&Thread::DumpThreadOffset<PointerSize::k64> + : &Thread::DumpThreadOffset<PointerSize::k32>))); } ~HGraphVisualizerDisassembler() { @@ -298,6 +301,12 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { stream << constant->AsIntConstant()->GetValue(); } else if (constant->IsLongConstant()) { stream << constant->AsLongConstant()->GetValue(); + } else if (constant->IsFloatConstant()) { + stream << constant->AsFloatConstant()->GetValue(); + } else if (constant->IsDoubleConstant()) { + stream << constant->AsDoubleConstant()->GetValue(); + } else if (constant->IsNullConstant()) { + stream << "null"; } } else if (location.IsInvalid()) { stream << "invalid"; @@ -401,6 +410,9 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { void VisitArrayLength(HArrayLength* array_length) OVERRIDE { StartAttributeStream("is_string_length") << std::boolalpha << array_length->IsStringLength() << std::noboolalpha; + if (array_length->IsEmittedAtUseSite()) { + StartAttributeStream("emitted_at_use") << "true"; + } } void VisitBoundsCheck(HBoundsCheck* bounds_check) OVERRIDE { diff --git a/compiler/optimizing/induction_var_analysis.h b/compiler/optimizing/induction_var_analysis.h index 7c74816c26..cd4c830645 100644 --- a/compiler/optimizing/induction_var_analysis.h +++ b/compiler/optimizing/induction_var_analysis.h @@ -39,9 +39,9 @@ class HInductionVarAnalysis : public HOptimization { void Run() OVERRIDE; - private: static constexpr const char* kInductionPassName = "induction_var_analysis"; + private: struct NodeInfo { explicit NodeInfo(uint32_t d) : depth(d), done(false) {} uint32_t depth; diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index f9e78b0a8f..451aa38033 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -17,6 +17,7 @@ #include "inliner.h" #include "art_method-inl.h" +#include "base/enums.h" #include "builder.h" #include "class_linker.h" #include "constant_folding.h" @@ -35,7 +36,7 @@ #include "nodes.h" #include "optimizing_compiler.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "quick/inline_method_analyser.h" #include "sharpening.h" #include "ssa_builder.h" @@ -151,7 +152,7 @@ static ArtMethod* FindVirtualOrInterfaceTarget(HInvoke* invoke, ArtMethod* resol } ClassLinker* cl = Runtime::Current()->GetClassLinker(); - size_t pointer_size = cl->GetImagePointerSize(); + PointerSize pointer_size = cl->GetImagePointerSize(); if (invoke->IsInvokeInterface()) { resolved_method = info.GetTypeHandle()->FindVirtualMethodForInterface( resolved_method, pointer_size); @@ -208,12 +209,8 @@ static uint32_t FindClassIndexIn(mirror::Class* cls, DCHECK(cls->IsProxyClass()) << PrettyClass(cls); // TODO: deal with proxy classes. } else if (IsSameDexFile(cls->GetDexFile(), dex_file)) { + DCHECK_EQ(cls->GetDexCache(), dex_cache.Get()); index = cls->GetDexTypeIndex(); - } else { - index = cls->FindTypeIndexInOtherDexFile(dex_file); - } - - if (index != DexFile::kDexNoIndex) { // Update the dex cache to ensure the class is in. The generated code will // consider it is. 
We make it safe by updating the dex cache, as other // dex files might also load the class, and there is no guarantee the dex @@ -221,6 +218,14 @@ static uint32_t FindClassIndexIn(mirror::Class* cls, if (dex_cache->GetResolvedType(index) == nullptr) { dex_cache->SetResolvedType(index, cls); } + } else { + index = cls->FindTypeIndexInOtherDexFile(dex_file); + // We cannot guarantee the entry in the dex cache will resolve to the same class, + // as there may be different class loaders. So only return the index if it's + // the right class in the dex cache already. + if (index != DexFile::kDexNoIndex && dex_cache->GetResolvedType(index) != cls) { + index = DexFile::kDexNoIndex; + } } return index; @@ -239,7 +244,7 @@ class ScopedProfilingInfoInlineUse { ~ScopedProfilingInfoInlineUse() { if (profiling_info_ != nullptr) { - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); DCHECK_EQ(profiling_info_, method_->GetProfilingInfo(pointer_size)); Runtime::Current()->GetJit()->GetCodeCache()->DoneCompilerUse(method_, self_); } @@ -273,7 +278,7 @@ bool HInliner::TryInline(HInvoke* invoke_instruction) { return false; } MethodReference ref = invoke_instruction->AsInvokeStaticOrDirect()->GetTargetMethod(); - mirror::DexCache* const dex_cache = (&caller_dex_file == ref.dex_file) + mirror::DexCache* const dex_cache = IsSameDexFile(caller_dex_file, *ref.dex_file) ? caller_compilation_unit_.GetDexCache().Get() : class_linker->FindDexCache(soa.Self(), *ref.dex_file); resolved_method = dex_cache->GetResolvedMethod( @@ -386,7 +391,7 @@ bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); if (invoke_instruction->IsInvokeInterface()) { resolved_method = ic.GetMonomorphicType()->FindVirtualMethodForInterface( resolved_method, pointer_size); @@ -478,7 +483,7 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); bool all_targets_inlined = true; @@ -640,7 +645,7 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget(HInvoke* invoke_instruction, return false; } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - size_t pointer_size = class_linker->GetImagePointerSize(); + PointerSize pointer_size = class_linker->GetImagePointerSize(); DCHECK(resolved_method != nullptr); ArtMethod* actual_method = nullptr; @@ -656,8 +661,8 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget(HInvoke* invoke_instruction, } ArtMethod* new_method = nullptr; if (invoke_instruction->IsInvokeInterface()) { - new_method = ic.GetTypeAt(i)->GetEmbeddedImTableEntry( - method_index % mirror::Class::kImtSize, pointer_size); + new_method = ic.GetTypeAt(i)->GetImt(pointer_size)->Get( + method_index, pointer_size); if (new_method->IsRuntimeMethod()) { // Bail out as soon as we see a conflict trampoline in one of the target's // interface table. 
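As an editorial sketch of the IMT lookup performed by the new `GetImt(...)->Get(...)` call above (the types and the modulo below are illustrative, not ART's real classes): a table slot shared by several interface methods holds a conflict-resolution stub, and the inliner must bail out when it sees one.

#include <array>
#include <cstdint>

struct Method {
  bool is_runtime_conflict_stub;  // true when the slot holds the conflict trampoline
};

template <size_t kImtSize>
const Method* LookupForInlining(const std::array<const Method*, kImtSize>& imt,
                                uint32_t imt_index) {
  const Method* m = imt[imt_index % kImtSize];
  return m->is_runtime_conflict_stub ? nullptr : m;  // nullptr: cannot devirtualize
}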
@@ -804,8 +809,6 @@ bool HInliner::TryInlineAndReplace(HInvoke* invoke_instruction, ArtMethod* metho bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, ArtMethod* method, HInstruction** return_replacement) { - const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); - if (method->IsProxyMethod()) { VLOG(compiler) << "Method " << PrettyMethod(method) << " is not inlined because of unimplemented inline support for proxy methods."; @@ -828,15 +831,6 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, return false; } - uint32_t method_index = FindMethodIndexIn( - method, caller_dex_file, invoke_instruction->GetDexMethodIndex()); - if (method_index == DexFile::kDexNoIndex) { - VLOG(compiler) << "Call to " - << PrettyMethod(method) - << " cannot be inlined because unaccessible to caller"; - return false; - } - bool same_dex_file = IsSameDexFile(*outer_compilation_unit_.GetDexFile(), *method->GetDexFile()); const DexFile::CodeItem* code_item = method->GetCodeItem(); @@ -873,7 +867,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, if (Runtime::Current()->UseJitCompilation() || !compiler_driver_->IsMethodVerifiedWithoutFailures( method->GetDexMethodIndex(), class_def_idx, *method->GetDexFile())) { - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method) << " couldn't be verified, so it cannot be inlined"; return false; } @@ -883,7 +877,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, invoke_instruction->AsInvokeStaticOrDirect()->IsStaticWithImplicitClinitCheck()) { // Case of a static method that cannot be inlined because it implicitly // requires an initialization check of its declaring class. - VLOG(compiler) << "Method " << PrettyMethod(method_index, caller_dex_file) + VLOG(compiler) << "Method " << PrettyMethod(method) << " is not inlined because it is static and requires a clinit" << " check that cannot be emitted due to Dex cache limitations"; return false; @@ -893,7 +887,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, return false; } - VLOG(compiler) << "Successfully inlined " << PrettyMethod(method_index, caller_dex_file); + VLOG(compiler) << "Successfully inlined " << PrettyMethod(method); MaybeRecordStat(kInlinedInvoke); return true; } @@ -1011,7 +1005,7 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction); // Check whether the field is final. If it is, we need to add a barrier. 
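The barrier mentioned in the comment above exists because final fields need publication ordering: the store to the final field must become visible before the object reference escapes. A plain C++ analogue of such a constructor fence (editorial sketch, names hypothetical):

#include <atomic>

void PublishAfterFinalStore(int* final_slot, int value, std::atomic<int*>& published) {
  *final_slot = value;                                      // final field store
  std::atomic_thread_fence(std::memory_order_release);      // constructor fence
  published.store(final_slot, std::memory_order_relaxed);   // publish the object
}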
- size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); if (resolved_field->IsFinal()) { @@ -1037,7 +1031,7 @@ HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex uint32_t field_index, HInstruction* obj) SHARED_REQUIRES(Locks::mutator_lock_) { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); HInstanceFieldGet* iget = new (graph_->GetArena()) HInstanceFieldGet( @@ -1065,7 +1059,7 @@ HInstanceFieldSet* HInliner::CreateInstanceFieldSet(Handle<mirror::DexCache> dex HInstruction* obj, HInstruction* value) SHARED_REQUIRES(Locks::mutator_lock_) { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); DCHECK(resolved_field != nullptr); HInstanceFieldSet* iput = new (graph_->GetArena()) HInstanceFieldSet( @@ -1094,8 +1088,11 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, uint32_t method_index = resolved_method->GetDexMethodIndex(); ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache())); + Handle<mirror::ClassLoader> class_loader(handles_->NewHandle( + resolved_method->GetDeclaringClass()->GetClassLoader())); + DexCompilationUnit dex_compilation_unit( - caller_compilation_unit_.GetClassLoader(), + class_loader.ToJObject(), class_linker, callee_dex_file, code_item, @@ -1404,7 +1401,7 @@ bool HInliner::ArgumentTypesMoreSpecific(HInvoke* invoke_instruction, ArtMethod* } } - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); // Iterate over the list of parameter types and test whether any of the // actual inputs has a more specific reference type than the type declared in @@ -1461,7 +1458,7 @@ void HInliner::FixUpReturnReferenceType(ArtMethod* resolved_method, // TODO: we could be more precise by merging the phi inputs but that requires // some functionality from the reference type propagation. 
DCHECK(return_replacement->IsPhi()); - size_t pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); + PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); mirror::Class* cls = resolved_method->GetReturnType(false /* resolve */, pointer_size); return_replacement->SetReferenceTypeInfo(GetClassRTI(cls)); } diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index b4125299ea..e5dab569fd 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -16,8 +16,10 @@ #include "instruction_builder.h" +#include "art_method-inl.h" #include "bytecode_utils.h" #include "class_linker.h" +#include "dex_instruction-inl.h" #include "driver/compiler_options.h" #include "scoped_thread_state_change.h" @@ -890,7 +892,7 @@ bool HInstructionBuilder::BuildInvoke(const Instruction& instruction, return_type, dex_pc, method_idx, - resolved_method->GetDexMethodIndex()); + resolved_method->GetImtIndex()); } return HandleInvoke(invoke, diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h index 9cfc065da6..517cf76831 100644 --- a/compiler/optimizing/instruction_builder.h +++ b/compiler/optimizing/instruction_builder.h @@ -30,6 +30,8 @@ namespace art { +class Instruction; + class HInstructionBuilder : public ValueObject { public: HInstructionBuilder(HGraph* graph, diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index e0410dcdb2..4ca0600dba 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -920,6 +920,7 @@ void InstructionSimplifierVisitor::VisitTypeConversion(HTypeConversion* instruct void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { HConstant* input_cst = instruction->GetConstantRight(); HInstruction* input_other = instruction->GetLeastConstantLeft(); + bool integral_type = Primitive::IsIntegralType(instruction->GetType()); if ((input_cst != nullptr) && input_cst->IsArithmeticZero()) { // Replace code looking like // ADD dst, src, 0 @@ -928,7 +929,7 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // Note that we cannot optimize `x + 0.0` to `x` for floating-point. When // `x` is `-0.0`, the former expression yields `0.0`, while the later // yields `-0.0`. - if (Primitive::IsIntegralType(instruction->GetType())) { + if (integral_type) { instruction->ReplaceWith(input_other); instruction->GetBlock()->RemoveInstruction(instruction); RecordSimplification(); @@ -974,10 +975,31 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // so no need to return. TryHandleAssociativeAndCommutativeOperation(instruction); - if ((instruction->GetLeft()->IsSub() || instruction->GetRight()->IsSub()) && + if ((left->IsSub() || right->IsSub()) && TrySubtractionChainSimplification(instruction)) { return; } + + if (integral_type) { + // Replace code patterns looking like + // SUB dst1, x, y SUB dst1, x, y + // ADD dst2, dst1, y ADD dst2, y, dst1 + // with + // SUB dst1, x, y + // ADD instruction is not needed in this case, we may use + // one of inputs of SUB instead. 
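The `integral_type` guard matters because `(x - y) + y == x` holds under two's-complement wrap-around but not for floating point, where rounding can lose `x` entirely (standalone illustration, editorial):

#include <cassert>

int main() {
  unsigned x = 5u, y = 7u;
  assert((x - y) + y == x);   // wrap-around preserves the identity
  float a = 1.0f, b = 1e20f;
  assert((a - b) + b != a);   // 1.0f is absorbed by 1e20f, so the result is 0.0f
  return 0;
}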
+ if (left->IsSub() && left->InputAt(1) == right) { + instruction->ReplaceWith(left->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } else if (right->IsSub() && right->InputAt(1) == left) { + instruction->ReplaceWith(right->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + } } void InstructionSimplifierVisitor::VisitAnd(HAnd* instruction) { @@ -1511,6 +1533,29 @@ void InstructionSimplifierVisitor::VisitSub(HSub* instruction) { if (TrySubtractionChainSimplification(instruction)) { return; } + + if (left->IsAdd()) { + // Replace code patterns looking like + // ADD dst1, x, y ADD dst1, x, y + // SUB dst2, dst1, y SUB dst2, dst1, x + // with + // ADD dst1, x, y + // SUB instruction is not needed in this case, we may use + // one of inputs of ADD instead. + // It is applicable to integral types only. + DCHECK(Primitive::IsIntegralType(type)); + if (left->InputAt(1) == right) { + instruction->ReplaceWith(left->InputAt(0)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } else if (left->InputAt(0) == right) { + instruction->ReplaceWith(left->InputAt(1)); + RecordSimplification(); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + } } void InstructionSimplifierVisitor::VisitUShr(HUShr* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc index cd026b8770..495f3fd232 100644 --- a/compiler/optimizing/instruction_simplifier_arm.cc +++ b/compiler/optimizing/instruction_simplifier_arm.cc @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "code_generator.h" #include "instruction_simplifier_arm.h" #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" namespace art { namespace arm { @@ -38,6 +40,46 @@ void InstructionSimplifierArmVisitor::VisitAnd(HAnd* instruction) { } } +void InstructionSimplifierArmVisitor::VisitArrayGet(HArrayGet* instruction) { + size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); + Primitive::Type type = instruction->GetType(); + + if (type == Primitive::kPrimLong + || type == Primitive::kPrimFloat + || type == Primitive::kPrimDouble) { + // T32 doesn't support ShiftedRegOffset mem address mode for these types + // to enable optimization. + return; + } + + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } +} + +void InstructionSimplifierArmVisitor::VisitArraySet(HArraySet* instruction) { + size_t access_size = Primitive::ComponentSize(instruction->GetComponentType()); + size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value(); + Primitive::Type type = instruction->GetComponentType(); + + if (type == Primitive::kPrimLong + || type == Primitive::kPrimFloat + || type == Primitive::kPrimDouble) { + // T32 doesn't support ShiftedRegOffset mem address mode for these types + // to enable optimization. 
+ return; + } + + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } +} } // namespace arm } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm.h b/compiler/optimizing/instruction_simplifier_arm.h index 14c940eb21..782110c40a 100644 --- a/compiler/optimizing/instruction_simplifier_arm.h +++ b/compiler/optimizing/instruction_simplifier_arm.h @@ -38,6 +38,8 @@ class InstructionSimplifierArmVisitor : public HGraphVisitor { void VisitMul(HMul* instruction) OVERRIDE; void VisitOr(HOr* instruction) OVERRIDE; void VisitAnd(HAnd* instruction) OVERRIDE; + void VisitArrayGet(HArrayGet* instruction) OVERRIDE; + void VisitArraySet(HArraySet* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; @@ -46,7 +48,9 @@ class InstructionSimplifierArmVisitor : public HGraphVisitor { class InstructionSimplifierArm : public HOptimization { public: InstructionSimplifierArm(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "instruction_simplifier_arm", stats) {} + : HOptimization(graph, kInstructionSimplifierArmPassName, stats) {} + + static constexpr const char* kInstructionSimplifierArmPassName = "instruction_simplifier_arm"; void Run() OVERRIDE { InstructionSimplifierArmVisitor visitor(graph_, stats_); diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index 983d31d168..6d107d571f 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -28,56 +28,6 @@ using helpers::CanFitInShifterOperand; using helpers::HasShifterOperand; using helpers::ShifterOperandSupportsExtension; -void InstructionSimplifierArm64Visitor::TryExtractArrayAccessAddress(HInstruction* access, - HInstruction* array, - HInstruction* index, - size_t data_offset) { - if (kEmitCompilerReadBarrier) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. - // - // TODO: Handle this case properly in the ARM64 code generator and - // re-enable this optimization; otherwise, remove this TODO. - // b/26601270 - return; - } - if (index->IsConstant() || - (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) { - // When the index is a constant all the addressing can be fitted in the - // memory access instruction, so do not split the access. - return; - } - if (access->IsArraySet() && - access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) { - // The access may require a runtime call or the original array pointer. - return; - } - - // Proceed to extract the base address computation. - ArenaAllocator* arena = GetGraph()->GetArena(); - - HIntConstant* offset = GetGraph()->GetIntConstant(data_offset); - HArm64IntermediateAddress* address = - new (arena) HArm64IntermediateAddress(array, offset, kNoDexPc); - address->SetReferenceTypeInfo(array->GetReferenceTypeInfo()); - access->GetBlock()->InsertInstructionBefore(address, access); - access->ReplaceInput(address, 0); - // Both instructions must depend on GC to prevent any instruction that can - // trigger GC to be inserted between the two. 
- access->AddSideEffects(SideEffects::DependsOnGC()); - DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC())); - DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC())); - // TODO: Code generation for HArrayGet and HArraySet will check whether the input address - // is an HArm64IntermediateAddress and generate appropriate code. - // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe - // `HArm64Load` and `HArm64Store`). We defer these changes because these new instructions would - // not bring any advantages yet. - // Also see the comments in - // `InstructionCodeGeneratorARM64::VisitArrayGet()` and - // `InstructionCodeGeneratorARM64::VisitArraySet()`. - RecordSimplification(); -} - bool InstructionSimplifierArm64Visitor::TryMergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op, bool do_merge) { @@ -190,19 +140,23 @@ void InstructionSimplifierArm64Visitor::VisitAnd(HAnd* instruction) { void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) { size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); - TryExtractArrayAccessAddress(instruction, - instruction->GetArray(), - instruction->GetIndex(), - data_offset); + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } } void InstructionSimplifierArm64Visitor::VisitArraySet(HArraySet* instruction) { size_t access_size = Primitive::ComponentSize(instruction->GetComponentType()); size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value(); - TryExtractArrayAccessAddress(instruction, - instruction->GetArray(), - instruction->GetIndex(), - data_offset); + if (TryExtractArrayAccessAddress(instruction, + instruction->GetArray(), + instruction->GetIndex(), + data_offset)) { + RecordSimplification(); + } } void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index 4735f85ab0..f71684efe9 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -35,10 +35,6 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { } } - void TryExtractArrayAccessAddress(HInstruction* access, - HInstruction* array, - HInstruction* index, - size_t data_offset); bool TryMergeIntoUsersShifterOperand(HInstruction* instruction); bool TryMergeIntoShifterOperand(HInstruction* use, HInstruction* bitfield_op, @@ -86,8 +82,9 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { class InstructionSimplifierArm64 : public HOptimization { public: InstructionSimplifierArm64(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "instruction_simplifier_arm64", stats) {} - + : HOptimization(graph, kInstructionSimplifierArm64PassName, stats) {} + static constexpr const char* kInstructionSimplifierArm64PassName + = "instruction_simplifier_arm64"; void Run() OVERRIDE { InstructionSimplifierArm64Visitor visitor(graph_, stats_); visitor.VisitReversePostOrder(); diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index dab1ebc16d..8f7778fe68 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -226,4 +226,59 @@ bool TryMergeNegatedInput(HBinaryOperation* op) { return false; } + +bool 
TryExtractArrayAccessAddress(HInstruction* access, + HInstruction* array, + HInstruction* index, + size_t data_offset) { + if (kEmitCompilerReadBarrier) { + // The read barrier instrumentation does not support the + // HIntermediateAddress instruction yet. + // + // TODO: Handle this case properly in the ARM64 and ARM code generator and + // re-enable this optimization; otherwise, remove this TODO. + // b/26601270 + return false; + } + if (index->IsConstant() || + (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) { + // When the index is a constant all the addressing can be fitted in the + // memory access instruction, so do not split the access. + return false; + } + if (access->IsArraySet() && + access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) { + // The access may require a runtime call or the original array pointer. + return false; + } + + // Proceed to extract the base address computation. + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + + HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntermediateAddress* address = + new (arena) HIntermediateAddress(array, offset, kNoDexPc); + address->SetReferenceTypeInfo(array->GetReferenceTypeInfo()); + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 0); + // Both instructions must depend on GC to prevent any instruction that can + // trigger GC to be inserted between the two. + access->AddSideEffects(SideEffects::DependsOnGC()); + DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC())); + DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC())); + // TODO: Code generation for HArrayGet and HArraySet will check whether the input address + // is an HIntermediateAddress and generate appropriate code. + // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe + // `HArm64Load` and `HArm64Store`,`HArmLoad` and `HArmStore`). We defer these changes + // because these new instructions would not bring any advantages yet. + // Also see the comments in + // `InstructionCodeGeneratorARM::VisitArrayGet()` + // `InstructionCodeGeneratorARM::VisitArraySet()` + // `InstructionCodeGeneratorARM64::VisitArrayGet()` + // `InstructionCodeGeneratorARM64::VisitArraySet()`. + return true; +} + + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index b1fe8f4756..56804f5e90 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -26,6 +26,11 @@ bool TryCombineMultiplyAccumulate(HMul* mul, InstructionSet isa); // a negated bitwise instruction. bool TryMergeNegatedInput(HBinaryOperation* op); +bool TryExtractArrayAccessAddress(HInstruction* access, + HInstruction* array, + HInstruction* index, + size_t data_offset); + } // namespace art #endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h index 3429a8fdbb..1a8eb58857 100644 --- a/compiler/optimizing/intrinsics.h +++ b/compiler/optimizing/intrinsics.h @@ -27,9 +27,6 @@ namespace art { class CompilerDriver; class DexFile; -// Temporary measure until we have caught up with the Java 7 definition of Math.round. b/26327751 -static constexpr bool kRoundIsPlusPointFive = false; - // Positive floating-point infinities. 
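A quick standalone check of the constants declared below (editorial sketch): positive infinity is the bit pattern with an all-ones exponent and a zero mantissa.

#include <cstdint>
#include <cstring>
#include <limits>

static bool IsPositiveInfinityBits(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f == std::numeric_limits<float>::infinity();
}
// IsPositiveInfinityBits(0x7f800000U) is true; 0x7ff0000000000000 plays the
// same role for double.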
static constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U; static constexpr uint64_t kPositiveInfinityDouble = UINT64_C(0x7ff0000000000000); diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 579fb9d3bb..0bbc0e54bc 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -41,6 +41,92 @@ ArenaAllocator* IntrinsicCodeGeneratorARM::GetAllocator() { using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathARM(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Register src_curr_addr = locations->GetTemp(0).AsRegister<Register>(); + Register dst_curr_addr = locations->GetTemp(1).AsRegister<Register>(); + Register src_stop_addr = locations->GetTemp(2).AsRegister<Register>(); + Register tmp = locations->GetTemp(3).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // Compute the base destination address in `dst_curr_addr`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(dst_curr_addr, dest, element_size * constant + offset); + } else { + __ add(dst_curr_addr, + dest, + ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(dst_curr_addr, offset); + } + + Label loop; + __ Bind(&loop); + __ ldr(tmp, Address(src_curr_addr, element_size, Address::PostIndex)); + __ MaybeUnpoisonHeapReference(tmp); + // TODO: Inline the mark bit check before calling the runtime? + // tmp = ReadBarrier::Mark(tmp); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM::EmitNativeCode for more + // explanations.) + DCHECK_NE(tmp, SP); + DCHECK_NE(tmp, LR); + DCHECK_NE(tmp, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. 
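A plain C++ model of the copy loop this slow path emits (editorial sketch: heap reference poisoning and the real entrypoint mechanics are omitted, and `ReadBarrierMark` is a stand-in name):

// Marks each reference through the read barrier before storing it.
static void* ReadBarrierMark(void* ref) { return ref; }  // identity stand-in

void CopyLoopModel(void** src_curr, void** src_stop, void** dst_curr) {
  do {
    void* ref = *src_curr++;      // ldr tmp, [src_curr_addr], #element_size
    ref = ReadBarrierMark(ref);   // runtime call; no stack map is required
    *dst_curr++ = ref;            // str tmp, [dst_curr_addr], #element_size
  } while (src_curr != src_stop); // cmp + conditional branch back to the loop head
}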
+ DCHECK_NE(src_curr_addr, IP); + DCHECK_NE(dst_curr_addr, IP); + DCHECK_NE(src_stop_addr, IP); + DCHECK_NE(tmp, IP); + DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(tmp); + __ str(tmp, Address(dst_curr_addr, element_size, Address::PostIndex)); + __ cmp(src_curr_addr, ShifterOperand(src_stop_addr)); + __ b(&loop, NE); + __ b(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM); +}; + +#undef __ + bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) { Dispatch(invoke); LocationSummary* res = invoke->GetLocations(); @@ -1201,7 +1287,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, } __ LoadFromOffset(kLoadWord, LR, TR, - QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pIndexOf).Int32Value()); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ blx(LR); @@ -1212,7 +1298,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, void IntrinsicLocationsBuilderARM::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. @@ -1232,7 +1318,7 @@ void IntrinsicCodeGeneratorARM::VisitStringIndexOf(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. 
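The kCall to kCallOnMainOnly / kCallOnMainAndSlowPath changes throughout this file split the old single "calls the runtime" kind by where the call can happen, which drives register-saving policy around the instruction. A sketch of the assumed distinction (not the real declaration):

enum class CallKindSketch {
  kNoCall,                 // never calls into the runtime
  kCallOnSlowPath,         // only a slow path may call
  kCallOnMainOnly,         // the main path always calls, no slow path needed
  kCallOnMainAndSlowPath,  // the main path calls and a slow path may call too
};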
@@ -1250,7 +1336,7 @@ void IntrinsicCodeGeneratorARM::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1270,8 +1356,10 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromBytes(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ b(slow_path->GetEntryLabel(), EQ); - __ LoadFromOffset( - kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromBytes).Int32Value()); + __ LoadFromOffset(kLoadWord, + LR, + TR, + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromBytes).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1280,7 +1368,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromBytes(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1298,8 +1386,10 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromChars(HInvoke* invoke) { // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. - __ LoadFromOffset( - kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromChars).Int32Value()); + __ LoadFromOffset(kLoadWord, + LR, + TR, + QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromChars).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1307,7 +1397,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromChars(HInvoke* invoke) { void IntrinsicLocationsBuilderARM::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1325,7 +1415,7 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) __ b(slow_path->GetEntryLabel(), EQ); __ LoadFromOffset(kLoadWord, - LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pAllocStringFromString).Int32Value()); + LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pAllocStringFromString).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ blx(LR); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1333,9 +1423,9 @@ void IntrinsicCodeGeneratorARM::VisitStringNewStringFromString(HInvoke* invoke) } void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). 
- if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1358,6 +1448,13 @@ void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) { if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) { locations->SetInAt(4, Location::RequiresRegister()); } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP cannot be used in + // ReadBarrierSystemArrayCopySlowPathARM (because that register + // is clobbered by ReadBarrierMarkRegX entry points). Get an extra + // temporary register from the register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } static void CheckPosition(ArmAssembler* assembler, @@ -1423,9 +1520,9 @@ static void CheckPosition(ArmAssembler* assembler, } void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); ArmAssembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1434,18 +1531,22 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1461,7 +1562,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmp(src, ShifterOperand(dest)); - __ b(slow_path->GetEntryLabel(), EQ); + __ b(intrinsic_slow_path->GetEntryLabel(), EQ); } // Checked when building locations.
@@ -1473,7 +1574,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { __ b(&conditions_on_positions_validated, NE); } __ cmp(dest_pos.AsRegister<Register>(), ShifterOperand(src_pos_constant)); - __ b(slow_path->GetEntryLabel(), GT); + __ b(intrinsic_slow_path->GetEntryLabel(), GT); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1486,19 +1587,19 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { } else { __ cmp(src_pos.AsRegister<Register>(), ShifterOperand(dest_pos.AsRegister<Register>())); } - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ CompareAndBranchIfZero(src, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ CompareAndBranchIfZero(dest, slow_path->GetEntryLabel()); + __ CompareAndBranchIfZero(dest, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. @@ -1507,7 +1608,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ cmp(length.AsRegister<Register>(), ShifterOperand(0)); - __ b(slow_path->GetEntryLabel(), LT); + __ b(intrinsic_slow_path->GetEntryLabel(), LT); } // Validity checks: source. @@ -1515,7 +1616,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1524,7 +1625,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1533,112 +1634,287 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); - __ LoadFromOffset(kLoadWord, temp2, src, class_offset); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. 
- // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp1, temp1, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); - __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such as `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+ // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp2, temp2, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp2, intrinsic_slow_path->GetEntryLabel()); + } - __ cmp(temp1, ShifterOperand(temp2)); + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); + } + } else { + // Non read barrier code. + + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ LoadFromOffset(kLoadWord, temp1, dest, class_offset); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ LoadFromOffset(kLoadWord, temp2, src, class_offset); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - Label do_copy; - __ b(&do_copy, EQ); - if (!did_unpoison) { + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. 
+ // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + __ cmp(temp1, ShifterOperand(temp2)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + Label do_copy; + __ b(&do_copy, EQ); + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + // No need to unpoison the result, we're comparing against null. + __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ b(intrinsic_slow_path->GetEntryLabel(), NE); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); - // No need to unpoison the result, we're comparing against null. - __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ b(slow_path->GetEntryLabel(), NE); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ LoadFromOffset(kLoadWord, temp1, src, class_offset); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); - __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp3); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp3_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp3` has been unpoisoned + // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+ } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ LoadFromOffset(kLoadWord, temp1, src, class_offset); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset); + __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp3); + } + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel()); + __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + // Compute the base source address in `temp1`. if (src_pos.IsConstant()) { int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp1, src, element_size * constant + offset); } else { - __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, 2)); + __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, element_size_shift)); __ AddConstant(temp1, offset); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp2, dest, element_size * constant + offset); - } else { - __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, 2)); - __ AddConstant(temp2, offset); - } - + // Compute the end source address in `temp3`. if (length.IsConstant()) { int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); __ AddConstant(temp3, temp1, element_size * constant); } else { - __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, 2)); + __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, element_size_shift)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - Label loop, done; - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&done, EQ); - __ Bind(&loop); - __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); - __ str(IP, Address(temp2, element_size, Address::PostIndex)); - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&loop, NE); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // The base destination address is computed later, as `temp2` is + // used for intermediate computations. + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + Label loop, done; + + // Don't enter copy loop if `length == 0`. 
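Past the length check, the code that follows implements the dispatch sketched in the comment above without a fence: the lock word is loaded once, an artificial data dependency on it is folded into `src` (the `LSR #32` contributes zero, so the address is unchanged), and the gray bit is shifted into the carry flag to choose between the raw fast-path copy and the marking slow path. A C-like sketch of those two tricks, taking the shift and the gray/white values from the static_asserts in the emitted code:

#include <cstdint>

// Sketch only; constant names follow the static_asserts below.
uintptr_t AddArtificialDependency(uintptr_t src, uint32_t lock_word) {
  // Mirrors `add src, src, temp2, LSR #32`: an LSR by 32 yields 0 on AArch32, so
  // `src` keeps its value but now depends on the lock word load, which orders the
  // subsequent reference loads after it without a memory barrier.
  return src + (static_cast<uint64_t>(lock_word) >> 32);
}

bool IsSourceGray(uint32_t lock_word, uint32_t read_barrier_state_shift) {
  // The emitted `LSRS temp2, temp2, #(shift + 1)` moves exactly this bit into the
  // carry flag; with white_ptr_ == 0 and gray_ptr_ == 1, testing one bit suffices.
  return ((lock_word >> read_barrier_state_shift) & 1u) != 0u;
}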
+ __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + + // /* int32_t */ monitor = src->monitor_ + __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `temp2`. + __ add(src, src, ShifterOperand(temp2, LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); + // Carry flag is the last bit shifted out by LSRS. + __ b(read_barrier_slow_path->GetEntryLabel(), CS); + + // Fast-path copy. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(temp2, dest, element_size * constant + offset); + } else { + __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(temp2, offset); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + Label loop, done; + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&done, EQ); + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + __ Bind(&done); + } // We only need one card marking on the destination array. 
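A card mark is just a byte store into a shadow table that maps a fixed-size chunk of the heap to one byte; the collector later revisits regions whose cards are dirty, so the single mark noted above is all the bookkeeping this copy needs. A minimal sketch of the store, with the shift and the dirty value left as placeholders for the runtime's real constants:

#include <cstdint>

// Sketch only: one byte of card table per (1 << card_size_log2) bytes of heap.
void MarkCard(uint8_t* card_table, uintptr_t dest_addr, unsigned card_size_log2,
              uint8_t dirty_value) {
  card_table[dest_addr >> card_size_log2] = dirty_value;
}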
codegen_->MarkGCCard(temp1, @@ -1647,7 +1923,7 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -1665,7 +1941,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { DCHECK_EQ(invoke->GetType(), Primitive::kPrimDouble); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); const InvokeRuntimeCallingConvention calling_convention; @@ -1692,7 +1968,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) DCHECK_EQ(invoke->GetType(), Primitive::kPrimDouble); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); const InvokeRuntimeCallingConvention calling_convention; @@ -1718,7 +1994,7 @@ static void GenFPToFPCall(HInvoke* invoke, DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(0))); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(1))); - __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmWordSize>(entry).Int32Value()); + __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmPointerSize>(entry).Int32Value()); // Native code uses the soft float ABI. __ vmovrrd(calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1), @@ -1744,7 +2020,7 @@ static void GenFPFPToFPCall(HInvoke* invoke, DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(2))); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(calling_convention.GetRegisterAt(3))); - __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmWordSize>(entry).Int32Value()); + __ LoadFromOffset(kLoadWord, LR, TR, GetThreadOffset<kArmPointerSize>(entry).Int32Value()); // Native code uses the soft float ABI. __ vmovrrd(calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1), @@ -1979,6 +2255,51 @@ void IntrinsicCodeGeneratorARM::VisitShortReverseBytes(HInvoke* invoke) { __ revsh(out, in); } +static void GenBitCount(HInvoke* instr, Primitive::Type type, ArmAssembler* assembler) { + DCHECK(Primitive::IsIntOrLongType(type)) << type; + DCHECK_EQ(instr->GetType(), Primitive::kPrimInt); + DCHECK_EQ(Primitive::PrimitiveKind(instr->InputAt(0)->GetType()), type); + + bool is_long = type == Primitive::kPrimLong; + LocationSummary* locations = instr->GetLocations(); + Location in = locations->InAt(0); + Register src_0 = is_long ? in.AsRegisterPairLow<Register>() : in.AsRegister<Register>(); + Register src_1 = is_long ? in.AsRegisterPairHigh<Register>() : src_0; + SRegister tmp_s = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>(); + DRegister tmp_d = FromLowSToD(tmp_s); + Register out_r = locations->Out().AsRegister<Register>(); + + // Move data from core register(s) to temp D-reg for bit count calculation, then move back. + // According to Cortex A57 and A72 optimization guides, compared to transferring to full D-reg, + // transferring data from core reg to upper or lower half of vfp D-reg requires extra latency, + // That's why for integer bit count, we use 'vmov d0, r0, r0' instead of 'vmov d0[0], r0'. 
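The vector sequence below is easier to read next to a scalar equivalent: `vcnt` yields one population count per byte lane, and each unsigned `vpaddl` pairwise-adds neighbouring lanes into lanes of twice the width until a single count remains (one extra fold for the long variant). A scalar sketch of the same computation, using a compiler builtin for the per-byte counts:

#include <cstdint>

// Scalar view of the vcnt + vpaddl chain (sketch only).
int PopCount64(uint64_t x) {
  int total = 0;
  for (int byte_index = 0; byte_index < 8; ++byte_index) {
    // vcnt: eight independent per-byte bit counts.
    total += __builtin_popcount(static_cast<unsigned>((x >> (8 * byte_index)) & 0xffu));
  }
  // The vpaddl steps fold those byte counts into 16-, 32- and then 64-bit lanes;
  // the running sum above reaches the same final value.
  return total;
}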
+ __ vmovdrr(tmp_d, src_1, src_0); // Temp DReg |--src_1|--src_0| + __ vcntd(tmp_d, tmp_d); // Temp DReg |c|c|c|c|c|c|c|c| + __ vpaddld(tmp_d, tmp_d, 8, /* is_unsigned */ true); // Temp DReg |--c|--c|--c|--c| + __ vpaddld(tmp_d, tmp_d, 16, /* is_unsigned */ true); // Temp DReg |------c|------c| + if (is_long) { + __ vpaddld(tmp_d, tmp_d, 32, /* is_unsigned */ true); // Temp DReg |--------------c| + } + __ vmovrs(out_r, tmp_s); +} + +void IntrinsicLocationsBuilderARM::VisitIntegerBitCount(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); + invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister()); +} + +void IntrinsicCodeGeneratorARM::VisitIntegerBitCount(HInvoke* invoke) { + GenBitCount(invoke, Primitive::kPrimInt, GetAssembler()); +} + +void IntrinsicLocationsBuilderARM::VisitLongBitCount(HInvoke* invoke) { + VisitIntegerBitCount(invoke); +} + +void IntrinsicCodeGeneratorARM::VisitLongBitCount(HInvoke* invoke) { + GenBitCount(invoke, Primitive::kPrimLong, GetAssembler()); +} + void IntrinsicLocationsBuilderARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, LocationSummary::kNoCall, @@ -2119,8 +2440,6 @@ void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) { __ Lsr(out, out, 5); } -UNIMPLEMENTED_INTRINSIC(ARM, IntegerBitCount) -UNIMPLEMENTED_INTRINSIC(ARM, LongBitCount) UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble) UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat) UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble) diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 1d507530aa..91374b3108 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -26,12 +26,15 @@ #include "mirror/string.h" #include "thread.h" #include "utils/arm64/assembler_arm64.h" -#include "utils/arm64/constants_arm64.h" -#include "vixl/a64/disasm-a64.h" -#include "vixl/a64/macro-assembler-a64.h" +using namespace vixl::aarch64; // NOLINT(build/namespaces) -using namespace vixl; // NOLINT(build/namespaces) +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#pragma GCC diagnostic pop namespace art { @@ -57,15 +60,15 @@ ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_ } // namespace -vixl::MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() { - return codegen_->GetAssembler()->vixl_masm_; +MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() { + return codegen_->GetVIXLAssembler(); } ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() { return codegen_->GetGraph()->GetArena(); } -#define __ codegen->GetAssembler()->vixl_masm_-> +#define __ codegen->GetVIXLAssembler()-> static void MoveFromReturnRegister(Location trg, Primitive::Type type, @@ -141,6 +144,73 @@ class IntrinsicSlowPathARM64 : public SlowPathCodeARM64 { DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARM64); }; +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. 
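As with the ARM version earlier, the slow path declared below walks the remaining elements one reference at a time and routes each one through ReadBarrier::Mark before storing it (heap-reference poisoning elided); in the pseudo-code style used above:

//   do {
//     ref = *src_curr_addr++;            // Ldr with post-index
//     ref = ReadBarrier::Mark(ref);      // via the per-register mark entry point
//     *dst_curr_addr++ = ref;            // Str with post-index
//   } while (src_curr_addr != src_stop_addr);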
+class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 { + public: + ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp) + : SlowPathCodeARM64(instruction), tmp_(tmp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE { + CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + Register src_curr_addr = XRegisterFrom(locations->GetTemp(0)); + Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1)); + Register src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + Register tmp_reg = WRegisterFrom(tmp_); + + __ Bind(GetEntryLabel()); + vixl::aarch64::Label slow_copy_loop; + __ Bind(&slow_copy_loop); + __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex)); + codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg); + // TODO: Inline the mark bit check before calling the runtime? + // tmp_reg = ReadBarrier::Mark(tmp_reg); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more + // explanations.) + DCHECK_NE(tmp_.reg(), LR); + DCHECK_NE(tmp_.reg(), WSP); + DCHECK_NE(tmp_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary (and not preserved). It thus cannot be used by + // any live register in this slow path. + DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0); + DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0); + DCHECK_NE(tmp_.reg(), IP0); + DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg(); + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); + // This runtime call does not require a stack map. + codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg); + __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&slow_copy_loop, ne); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM64"; } + + private: + Location tmp_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64); +}; #undef __ bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) { @@ -170,14 +240,14 @@ static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister()); } -static void MoveFPToInt(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MoveFPToInt(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location input = locations->InAt(0); Location output = locations->Out(); __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output), is64bit ? 
DRegisterFrom(input) : SRegisterFrom(input)); } -static void MoveIntToFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MoveIntToFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location input = locations->InAt(0); Location output = locations->Out(); __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output), @@ -222,7 +292,7 @@ static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { static void GenReverseBytes(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location in = locations->InAt(0); Location out = locations->Out(); @@ -276,7 +346,7 @@ static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { static void GenNumberOfLeadingZeros(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -303,7 +373,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) static void GenNumberOfTrailingZeros(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -331,7 +401,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke static void GenReverse(LocationSummary* locations, Primitive::Type type, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); Location in = locations->InAt(0); @@ -356,7 +426,7 @@ void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) { GenReverse(invoke->GetLocations(), Primitive::kPrimLong, GetVIXLAssembler()); } -static void GenBitCount(HInvoke* instr, Primitive::Type type, vixl::MacroAssembler* masm) { +static void GenBitCount(HInvoke* instr, Primitive::Type type, MacroAssembler* masm) { DCHECK(Primitive::IsIntOrLongType(type)) << type; DCHECK_EQ(instr->GetType(), Primitive::kPrimInt); DCHECK_EQ(Primitive::PrimitiveKind(instr->InputAt(0)->GetType()), type); @@ -397,7 +467,7 @@ static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } -static void MathAbsFP(LocationSummary* locations, bool is64bit, vixl::MacroAssembler* masm) { +static void MathAbsFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) { Location in = locations->InAt(0); Location out = locations->Out(); @@ -433,7 +503,7 @@ static void CreateIntToInt(ArenaAllocator* arena, HInvoke* invoke) { static void GenAbsInteger(LocationSummary* locations, bool is64bit, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location in = locations->InAt(0); Location output = locations->Out(); @@ -463,7 +533,7 @@ void IntrinsicCodeGeneratorARM64::VisitMathAbsLong(HInvoke* invoke) { static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location op1 = locations->InAt(0); Location op2 = locations->InAt(1); Location out = locations->Out(); @@ -523,7 +593,7 @@ void IntrinsicCodeGeneratorARM64::VisitMathMaxFloatFloat(HInvoke* invoke) { static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Location op1 = locations->InAt(0); Location op2 
= locations->InAt(1); Location out = locations->Out(); @@ -574,7 +644,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -584,7 +654,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -594,7 +664,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -604,7 +674,7 @@ void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0))); } @@ -617,7 +687,7 @@ static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* arena, HInvoke* inv locations->AddTemp(Location::RequiresFpuRegister()); } -static void GenMathRound(HInvoke* invoke, bool is_double, vixl::MacroAssembler* masm) { +static void GenMathRound(HInvoke* invoke, bool is_double, vixl::aarch64::MacroAssembler* masm) { // Java 8 API definition for Math.round(): // Return the closest long or int to the argument, with ties rounding to positive infinity. // @@ -635,13 +705,13 @@ static void GenMathRound(HInvoke* invoke, bool is_double, vixl::MacroAssembler* FPRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0)); FPRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0)); Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out()); - vixl::Label done; + vixl::aarch64::Label done; // Round to nearest integer, ties away from zero. __ Fcvtas(out_reg, in_reg); // For positive values, zero or NaN inputs, rounding is done. - __ Tbz(out_reg, out_reg.size() - 1, &done); + __ Tbz(out_reg, out_reg.GetSizeInBits() - 1, &done); // Handle input < 0 cases. // If input is negative but not a tie, previous result (round to nearest) is valid. 
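A few concrete inputs show why only the negative-tie case needs fixing up: Fcvtas rounds to nearest with ties away from zero, which coincides with Java's ties-toward-positive-infinity rule everywhere except at a negative halfway value.

//   x =  2.5  ->  Fcvtas:  3    Math.round:  3   (ties agree for positive inputs)
//   x = -2.7  ->  Fcvtas: -3    Math.round: -3   (not a tie: nearest value is already correct)
//   x = -2.5  ->  Fcvtas: -3    Math.round: -2   (negative tie: handled by the input < 0 path)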
@@ -675,7 +745,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -685,7 +755,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -695,7 +765,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -705,7 +775,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -723,7 +793,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -733,7 +803,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -743,7 +813,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -753,7 +823,7 @@ void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)), AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0)); } @@ -767,7 +837,7 @@ void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) { void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) { codegen_->Load(Primitive::kPrimNot, WRegisterFrom(invoke->GetLocations()->Out()), - MemOperand(tr, Thread::PeerOffset<8>().Int32Value())); + MemOperand(tr, Thread::PeerOffset<kArm64PointerSize>().Int32Value())); } static void 
GenUnsafeGet(HInvoke* invoke, @@ -778,7 +848,7 @@ static void GenUnsafeGet(HInvoke* invoke, DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)); - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Location base_loc = locations->InAt(1); Register base = WRegisterFrom(base_loc); // Object pointer. Location offset_loc = locations->InAt(2); @@ -912,7 +982,7 @@ static void GenUnsafePut(LocationSummary* locations, bool is_volatile, bool is_ordered, CodeGeneratorARM64* codegen) { - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Register base = WRegisterFrom(locations->InAt(1)); // Object pointer. Register offset = XRegisterFrom(locations->InAt(2)); // Long offset. @@ -1031,7 +1101,7 @@ static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, } static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorARM64* codegen) { - vixl::MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_; + MacroAssembler* masm = codegen->GetVIXLAssembler(); Register out = WRegisterFrom(locations->Out()); // Boolean result. @@ -1070,7 +1140,7 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value)); // result = tmp_value != 0; - vixl::Label loop_head, exit_loop; + vixl::aarch64::Label loop_head, exit_loop; __ Bind(&loop_head); // TODO: When `type == Primitive::kPrimNot`, add a read barrier for // the reference stored in the object before attempting the CAS, @@ -1154,20 +1224,22 @@ void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); - Register str = XRegisterFrom(locations->InAt(0)); - Register arg = XRegisterFrom(locations->InAt(1)); + Register str = InputRegisterAt(invoke, 0); + Register arg = InputRegisterAt(invoke, 1); + DCHECK(str.IsW()); + DCHECK(arg.IsW()); Register out = OutputRegister(invoke); Register temp0 = WRegisterFrom(locations->GetTemp(0)); Register temp1 = WRegisterFrom(locations->GetTemp(1)); Register temp2 = WRegisterFrom(locations->GetTemp(2)); - vixl::Label loop; - vixl::Label find_char_diff; - vixl::Label end; + vixl::aarch64::Label loop; + vixl::aarch64::Label find_char_diff; + vixl::aarch64::Label end; // Get offsets of count and value fields within a string object. const int32_t count_offset = mirror::String::CountOffset().Int32Value(); @@ -1189,8 +1261,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Subs(out, str, arg); __ B(&end, eq); // Load lengths of this and argument strings. - __ Ldr(temp0, MemOperand(str.X(), count_offset)); - __ Ldr(temp1, MemOperand(arg.X(), count_offset)); + __ Ldr(temp0, HeapOperand(str, count_offset)); + __ Ldr(temp1, HeapOperand(arg, count_offset)); // Return zero if both strings are empty. __ Orr(out, temp0, temp1); __ Cbz(out, &end); @@ -1219,8 +1291,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { // Loop to compare 4x16-bit characters at a time (ok because of string data alignment). 
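For reference, the loop below implements the usual compareTo contract over the shared prefix: return the difference of the first pair of characters that differ, and fall back to the length difference already held in `out` when the prefix matches. A scalar sketch of that contract, assuming 16-bit character data as in the comment above; the emitted code instead compares four characters per 64-bit load and locates the differing character with CLZ:

#include <cstdint>

// Scalar reference for the comparison loop (sketch only).
int32_t CompareSharedPrefix(const uint16_t* lhs, const uint16_t* rhs,
                            int32_t min_length, int32_t length_diff) {
  for (int32_t i = 0; i < min_length; ++i) {
    if (lhs[i] != rhs[i]) {
      return static_cast<int32_t>(lhs[i]) - static_cast<int32_t>(rhs[i]);
    }
  }
  return length_diff;  // identical prefix: the longer string compares greater
}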
__ Bind(&loop); - __ Ldr(temp4, MemOperand(str.X(), temp1)); - __ Ldr(temp0, MemOperand(arg.X(), temp1)); + __ Ldr(temp4, MemOperand(str.X(), temp1.X())); + __ Ldr(temp0, MemOperand(arg.X(), temp1.X())); __ Cmp(temp4, temp0); __ B(ne, &find_char_diff); __ Add(temp1, temp1, char_size * 4); @@ -1239,14 +1311,14 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Clz(temp1, temp1); // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then // the difference occurs outside the remaining string data, so just return length diff (out). - __ Cmp(temp2, Operand(temp1, LSR, 4)); + __ Cmp(temp2, Operand(temp1.W(), LSR, 4)); __ B(le, &end); // Extract the characters and calculate the difference. __ Bic(temp1, temp1, 0xf); __ Lsr(temp0, temp0, temp1); __ Lsr(temp4, temp4, temp1); __ And(temp4, temp4, 0xffff); - __ Sub(out, temp4, Operand(temp0, UXTH)); + __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH)); __ Bind(&end); @@ -1269,7 +1341,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) { } void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register str = WRegisterFrom(locations->InAt(0)); @@ -1281,10 +1353,10 @@ void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { Register temp1 = WRegisterFrom(locations->GetTemp(0)); Register temp2 = WRegisterFrom(locations->GetTemp(1)); - vixl::Label loop; - vixl::Label end; - vixl::Label return_true; - vixl::Label return_false; + vixl::aarch64::Label loop; + vixl::aarch64::Label end; + vixl::aarch64::Label return_true; + vixl::aarch64::Label return_false; // Get offsets of count, value, and class fields within a string object. const int32_t count_offset = mirror::String::CountOffset().Int32Value(); @@ -1357,7 +1429,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { } static void GenerateVisitStringIndexOf(HInvoke* invoke, - vixl::MacroAssembler* masm, + MacroAssembler* masm, CodeGeneratorARM64* codegen, ArenaAllocator* allocator, bool start_at_zero) { @@ -1394,7 +1466,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, __ Mov(tmp_reg, 0); } - __ Ldr(lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pIndexOf).Int32Value())); + __ Ldr(lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pIndexOf).Int32Value())); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ Blr(lr); @@ -1405,7 +1477,7 @@ static void GenerateVisitStringIndexOf(HInvoke* invoke, void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. @@ -1425,7 +1497,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) { void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's // best to align the inputs accordingly. 
@@ -1443,7 +1515,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1454,7 +1526,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invo } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register byte_array = WRegisterFrom(locations->InAt(0)); @@ -1464,7 +1536,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) __ B(eq, slow_path->GetEntryLabel()); __ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromBytes).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromBytes).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1473,7 +1546,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1483,7 +1556,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invo } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); // No need to emit code checking whether `locations->InAt(2)` is a null // pointer, as callers of the native method @@ -1492,7 +1565,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) // // all include a null check on `data` before calling that method. 
__ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromChars).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromChars).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1500,7 +1574,7 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); @@ -1508,7 +1582,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* inv } void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register string_to_copy = WRegisterFrom(locations->InAt(0)); @@ -1518,7 +1592,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke __ B(eq, slow_path->GetEntryLabel()); __ Ldr(lr, - MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pAllocStringFromString).Int32Value())); + MemOperand(tr, + QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, pAllocStringFromString).Int32Value())); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ Blr(lr); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1531,7 +1606,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { DCHECK(Primitive::IsFloatingPointType(invoke->GetType())); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; @@ -1546,7 +1621,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) DCHECK(Primitive::IsFloatingPointType(invoke->GetType())); LocationSummary* const locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; @@ -1556,10 +1631,11 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) } static void GenFPToFPCall(HInvoke* invoke, - vixl::MacroAssembler* masm, + MacroAssembler* masm, CodeGeneratorARM64* codegen, QuickEntrypointEnum entry) { - __ Ldr(lr, MemOperand(tr, GetThreadOffset<kArm64WordSize>(entry).Int32Value())); + __ Ldr(lr, MemOperand(tr, + GetThreadOffset<kArm64PointerSize>(entry).Int32Value())); __ Blr(lr); codegen->RecordPcInfo(invoke, invoke->GetDexPc()); } @@ -1716,7 +1792,7 @@ void IntrinsicLocationsBuilderARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) } void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); // Check assumption that sizeof(Char) is 2 (used in scaling below). @@ -1756,9 +1832,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Sub(num_chr, srcEnd, srcBegin); // Do the copy. 
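The copy that follows has two stages: a bulk loop moving eight 16-bit characters per iteration with Ldp/Stp, then a one-character remainder loop with Ldrh/Strh. A scalar sketch of that shape (names are illustrative):

#include <cstdint>

// Sketch of the copy structure emitted below.
void CopyChars(const uint16_t* src, uint16_t* dst, int64_t num_chr) {
  while (num_chr >= 8) {                 // main loop: eight chars per iteration
    for (int i = 0; i < 8; ++i) {
      dst[i] = src[i];
    }
    src += 8;
    dst += 8;
    num_chr -= 8;
  }
  while (num_chr > 0) {                  // remainder: one char at a time
    *dst++ = *src++;
    --num_chr;
  }
}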
- vixl::Label loop; - vixl::Label done; - vixl::Label remainder; + vixl::aarch64::Label loop; + vixl::aarch64::Label done; + vixl::aarch64::Label remainder; // Early out for valid zero-length retrievals. __ Cbz(num_chr, &done); @@ -1773,9 +1849,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { // Main loop used for longer fetches loads and stores 8x16-bit characters at a time. // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.) __ Bind(&loop); - __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, vixl::PostIndex)); + __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, PostIndex)); __ Subs(num_chr, num_chr, 8); - __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, vixl::PostIndex)); + __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, PostIndex)); __ B(ge, &loop); __ Adds(num_chr, num_chr, 8); @@ -1784,9 +1860,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) { // Main loop for < 8 character case and remainder handling. Loads and stores one // 16-bit Java character at a time. __ Bind(&remainder); - __ Ldrh(tmp1, MemOperand(src_ptr, char_size, vixl::PostIndex)); + __ Ldrh(tmp1, MemOperand(src_ptr, char_size, PostIndex)); __ Subs(num_chr, num_chr, 1); - __ Strh(tmp1, MemOperand(dst_ptr, char_size, vixl::PostIndex)); + __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex)); __ B(gt, &remainder); __ Bind(&done); @@ -1800,7 +1876,7 @@ static void SetSystemArrayCopyLocationRequires(LocationSummary* locations, uint32_t at, HInstruction* input) { HIntConstant* const_input = input->AsIntConstant(); - if (const_input != nullptr && !vixl::Assembler::IsImmAddSub(const_input->GetValue())) { + if (const_input != nullptr && !vixl::aarch64::Assembler::IsImmAddSub(const_input->GetValue())) { locations->SetInAt(at, Location::RequiresRegister()); } else { locations->SetInAt(at, Location::RegisterOrConstant(input)); @@ -1847,7 +1923,7 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); } -static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, +static void CheckSystemArrayCopyPosition(MacroAssembler* masm, const Location& pos, const Register& input, const Location& length, @@ -1880,7 +1956,7 @@ static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, } else { // Check that pos >= 0. Register pos_reg = WRegisterFrom(pos); - __ Tbnz(pos_reg, pos_reg.size() - 1, slow_path->GetEntryLabel()); + __ Tbnz(pos_reg, pos_reg.GetSizeInBits() - 1, slow_path->GetEntryLabel()); // Check that pos <= length(input) && (length(input) - pos) >= length. __ Ldr(temp, MemOperand(input, length_offset)); @@ -1893,7 +1969,7 @@ static void CheckSystemArrayCopyPosition(vixl::MacroAssembler* masm, // Compute base source address, base destination address, and end source address // for System.arraycopy* intrinsics. 
-static void GenSystemArrayCopyAddresses(vixl::MacroAssembler* masm, +static void GenSystemArrayCopyAddresses(MacroAssembler* masm, Primitive::Type type, const Register& src, const Location& src_pos, @@ -1934,7 +2010,7 @@ static void GenSystemArrayCopyAddresses(vixl::MacroAssembler* masm, } void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); Register src = XRegisterFrom(locations->InAt(0)); Location src_pos = locations->InAt(1); @@ -2007,12 +2083,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) { const int32_t char_size = Primitive::ComponentSize(Primitive::kPrimChar); UseScratchRegisterScope temps(masm); Register tmp = temps.AcquireW(); - vixl::Label loop, done; + vixl::aarch64::Label loop, done; __ Bind(&loop); __ Cmp(src_curr_addr, src_stop_addr); __ B(&done, eq); - __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, vixl::PostIndex)); - __ Strh(tmp, MemOperand(dst_curr_addr, char_size, vixl::PostIndex)); + __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex)); + __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex)); __ B(&loop); __ Bind(&done); @@ -2026,9 +2102,9 @@ static constexpr int32_t kSystemArrayCopyThreshold = 128; // We want to use two temporary registers in order to reduce the register pressure in arm64. // So we don't use the CodeGenerator::CreateSystemArrayCopyLocationSummary. void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2081,20 +2157,29 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { locations->AddTemp(Location::RequiresRegister()); locations->AddTemp(Location::RequiresRegister()); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP0, obtained from the VIXL scratch register + // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 + // (because that register is clobbered by ReadBarrierMarkRegX + // entry points). Get an extra temporary register from the + // register allocator. + locations->AddTemp(Location::RequiresRegister()); + } } void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); - vixl::MacroAssembler* masm = GetVIXLAssembler(); + MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = XRegisterFrom(locations->InAt(0)); Location src_pos = locations->InAt(1); @@ -2102,12 +2187,14 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); Register temp1 = WRegisterFrom(locations->GetTemp(0)); + Location temp1_loc = LocationFrom(temp1); Register temp2 = WRegisterFrom(locations->GetTemp(1)); + Location temp2_loc = LocationFrom(temp2); - SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCodeARM64* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); - vixl::Label conditions_on_positions_validated; + vixl::aarch64::Label conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); // If source and destination are the same, we go to slow path if we need to do @@ -2121,7 +2208,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ Cmp(src, dest); - __ B(slow_path->GetEntryLabel(), eq); + __ B(intrinsic_slow_path->GetEntryLabel(), eq); } // Checked when building locations. DCHECK(!optimizations.GetDestinationIsSource() @@ -2132,7 +2219,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ B(&conditions_on_positions_validated, ne); } __ Cmp(WRegisterFrom(dest_pos), src_pos_constant); - __ B(slow_path->GetEntryLabel(), gt); + __ B(intrinsic_slow_path->GetEntryLabel(), gt); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2141,19 +2228,19 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { } __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()), OperandFrom(dest_pos, invoke->InputAt(3)->GetType())); - __ B(slow_path->GetEntryLabel(), lt); + __ B(intrinsic_slow_path->GetEntryLabel(), lt); } __ Bind(&conditions_on_positions_validated); if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. - __ Cbz(src, slow_path->GetEntryLabel()); + __ Cbz(src, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. - __ Cbz(dest, slow_path->GetEntryLabel()); + __ Cbz(dest, intrinsic_slow_path->GetEntryLabel()); } // We have already checked in the LocationsBuilder for the constant case. @@ -2161,17 +2248,17 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { // If the length is negative, bail out. 
- __ Tbnz(WRegisterFrom(length), kWRegSize - 1, slow_path->GetEntryLabel()); + __ Tbnz(WRegisterFrom(length), kWRegSize - 1, intrinsic_slow_path->GetEntryLabel()); // If the length >= 128 then (currently) prefer native implementation. __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold); - __ B(slow_path->GetEntryLabel(), ge); + __ B(intrinsic_slow_path->GetEntryLabel(), ge); } // Validity checks: source. CheckSystemArrayCopyPosition(masm, src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2180,90 +2267,236 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); + // Note: Because it is acquired from VIXL's scratch register pool, + // `temp3` might be IP0, and thus cannot be used as `ref` argument + // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // calls below (see ReadBarrierMarkSlowPathARM64 for more details). Register temp3 = temps.AcquireW(); + if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ Ldr(temp1, MemOperand(dest, class_offset)); - __ Ldr(temp2, MemOperand(src, class_offset)); - bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; - } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ Ldr(temp3, HeapOperand(temp1, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + src.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. 
+ // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_); + __ Ldrh(temp1, HeapOperand(temp1, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + } - if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp3 = temp2->component_type_ - __ Ldr(temp3, HeapOperand(temp2, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); - static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); - } + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + dest.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + temp1, + component_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ Ldrh(temp2, HeapOperand(temp2, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + src.W(), + class_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. + __ Cmp(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + temp1, + component_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); + } + } else { + // Non read barrier code. 
+ + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ Ldr(temp1, MemOperand(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ Ldr(temp2, MemOperand(src, class_offset)); + bool did_unpoison = false; + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp1->component_type_ + __ Ldr(temp3, HeapOperand(temp1, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } + + if (!optimizations.GetSourceIsNonPrimitiveArray()) { + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp3 = temp2->component_type_ + __ Ldr(temp3, HeapOperand(temp2, component_offset)); + __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); + // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_); + __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel()); + } - __ Cmp(temp1, temp2); + __ Cmp(temp1, temp2); - if (optimizations.GetDestinationIsTypedObjectArray()) { - vixl::Label do_copy; - __ B(&do_copy, eq); - if (!did_unpoison) { + if (optimizations.GetDestinationIsTypedObjectArray()) { + vixl::aarch64::Label do_copy; + __ B(&do_copy, eq); + if (!did_unpoison) { + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ Ldr(temp1, HeapOperand(temp1, component_offset)); codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ Ldr(temp1, HeapOperand(temp1, super_offset)); + // No need to unpoison the result, we're comparing against null. + __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ B(intrinsic_slow_path->GetEntryLabel(), ne); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ Ldr(temp1, HeapOperand(temp1, component_offset)); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ Ldr(temp1, HeapOperand(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ Cbnz(temp1, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ B(slow_path->GetEntryLabel(), ne); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. 
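// Taken together, the Baker and non-Baker paths above implement one
// assignability check. The sketch below is only a host-side model of that
// logic (ignoring the `optimizations` flags that allow some loads to be
// skipped); `KlassSketch` is a stand-in, not the real mirror::Class layout,
// and every `return true` corresponds to a branch to the intrinsic slow path.
#include <cstdint>

struct KlassSketch {
  KlassSketch* component_type;  // null if this is not an array class
  KlassSketch* super_class;     // null only for java.lang.Object
  uint16_t primitive_type;      // Primitive::kPrimNot (0) for reference types
};

static bool NeedsSlowPathTypeCheck(const KlassSketch* src_klass, const KlassSketch* dest_klass) {
  // Both source and destination must be arrays of references.
  if (dest_klass->component_type == nullptr ||
      dest_klass->component_type->primitive_type != 0) {
    return true;
  }
  if (src_klass->component_type == nullptr ||
      src_klass->component_type->primitive_type != 0) {
    return true;
  }
  if (src_klass == dest_klass) {
    return false;  // same element type: the copy cannot violate the destination type
  }
  // Otherwise the copy is only statically safe when the destination is
  // Object[], i.e. its component type has no super class.
  return dest_klass->component_type->super_class != nullptr;
}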
- // /* HeapReference<Class> */ temp1 = src->klass_ - __ Ldr(temp1, HeapOperand(src.W(), class_offset)); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp3 = temp1->component_type_ - __ Ldr(temp3, HeapOperand(temp1, component_offset)); - __ Cbz(temp3, slow_path->GetEntryLabel()); - codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3); - __ Ldrh(temp3, HeapOperand(temp3, primitive_offset)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp1_loc, + src.W(), + class_offset, + temp2, + /* needs_null_check */ false, + /* use_load_acquire */ false); + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke, + temp2_loc, + temp1, + component_offset, + temp3, + /* needs_null_check */ false, + /* use_load_acquire */ false); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ Ldr(temp1, HeapOperand(src.W(), class_offset)); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + __ Ldr(temp2, HeapOperand(temp1, component_offset)); + __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel()); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2); + } + // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_); + __ Ldrh(temp2, HeapOperand(temp2, primitive_offset)); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ Cbnz(temp3, slow_path->GetEntryLabel()); + __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); } Register src_curr_addr = temp1.X(); Register dst_curr_addr = temp2.X(); - Register src_stop_addr = temp3.X(); + Register src_stop_addr; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Temporary register IP0, obtained from the VIXL scratch + // register pool as `temp3`, cannot be used in + // ReadBarrierSystemArrayCopySlowPathARM64 (because that + // register is clobbered by ReadBarrierMarkRegX entry points). + // So another temporary register allocated by the register + // allocator instead. + DCHECK_EQ(LocationFrom(temp3).reg(), IP0); + src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + } else { + src_stop_addr = temp3.X(); + } GenSystemArrayCopyAddresses(masm, Primitive::kPrimNot, @@ -2276,30 +2509,103 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { dst_curr_addr, src_stop_addr); - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - vixl::Label loop, done; const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - __ Bind(&loop); - __ Cmp(src_curr_addr, src_stop_addr); - __ B(&done, eq); - { + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. 
+ // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + vixl::aarch64::Label loop, done; + + // Don't enter copy loop if `length == 0`. + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + Register tmp = temps.AcquireW(); - __ Ldr(tmp, MemOperand(src_curr_addr, element_size, vixl::PostIndex)); - __ Str(tmp, MemOperand(dst_curr_addr, element_size, vixl::PostIndex)); + // Make sure `tmp` is not IP0, as it is clobbered by + // ReadBarrierMarkRegX entry points in + // ReadBarrierSystemArrayCopySlowPathARM64. + DCHECK_NE(LocationFrom(tmp).reg(), IP0); + + // /* int32_t */ monitor = src->monitor_ + __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `tmp`. + __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32)); + + // Slow path used to copy array when `src` is gray. + SlowPathCodeARM64* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp)); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&loop, ne); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl::aarch64::Label loop, done; + __ Bind(&loop); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&done, eq); + { + Register tmp = temps.AcquireW(); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + } + __ B(&loop); + __ Bind(&done); } - __ B(&loop); - __ Bind(&done); } // We only need one card marking on the destination array. 
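// The gray check and the two copy loops above can be summarized by a small
// host-side model; the types and Mark() below are stand-ins, not the ART
// runtime API, and `src_is_gray` abstracts the Tbnz test on
// LockWord::kReadBarrierStateShift.
using ObjRef = void*;

static ObjRef Mark(ObjRef ref) {  // placeholder for a ReadBarrierMarkRegX entry point
  return ref;                     // the real entry point may return a forwarded reference
}

static void BakerCopySketch(ObjRef* src_ptr, ObjRef* end_ptr, ObjRef* dest_ptr, bool src_is_gray) {
  if (src_ptr == end_ptr) {
    return;  // zero-length copy: skip both loops
  }
  if (src_is_gray) {
    do {  // slow path: every loaded reference goes through the mark entry point
      *dest_ptr++ = Mark(*src_ptr++);
    } while (src_ptr != end_ptr);
  } else {
    do {  // fast path: raw copy, no per-element work
      *dest_ptr++ = *src_ptr++;
    } while (src_ptr != end_ptr);
  }
}
// One detail worth spelling out: the `Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32))`
// above is arithmetically a no-op, because the 32-bit Ldr of the monitor word
// zero-extends into tmp.X(), so the shifted operand is zero and `src` keeps its
// value. Its only purpose is to make later uses of `src` address-depend on the
// lock-word load, which orders that load before the element loads without a Dmb.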
codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } static void GenIsInfinite(LocationSummary* locations, bool is64bit, - vixl::MacroAssembler* masm) { + MacroAssembler* masm) { Operand infinity; Register out; @@ -2311,7 +2617,7 @@ static void GenIsInfinite(LocationSummary* locations, out = WRegisterFrom(locations->Out()); } - const Register zero = vixl::Assembler::AppropriateZeroRegFor(out); + const Register zero = vixl::aarch64::Assembler::AppropriateZeroRegFor(out); MoveFPToInt(locations, is64bit, masm); __ Eor(out, out, infinity); diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h index d47448a9c3..525153621b 100644 --- a/compiler/optimizing/intrinsics_arm64.h +++ b/compiler/optimizing/intrinsics_arm64.h @@ -20,10 +20,11 @@ #include "intrinsics.h" namespace vixl { +namespace aarch64 { class MacroAssembler; -} // namespace vixl +}} // namespace vixl::aarch64 namespace art { @@ -73,7 +74,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) #undef OPTIMIZING_INTRINSICS private: - vixl::MacroAssembler* GetVIXLAssembler(); + vixl::aarch64::MacroAssembler* GetVIXLAssembler(); ArenaAllocator* GetAllocator(); diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 7137fd9c11..6e5eb6622b 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -1875,7 +1875,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASObject(HInvoke* invoke) { // int java.lang.String.compareTo(String anotherString) void IntrinsicLocationsBuilderMIPS::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1899,8 +1899,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringCompareTo(HInvoke* invoke) { __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, - pStringCompareTo).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pStringCompareTo).Int32Value()); __ Jalr(T9); __ Nop(); __ Bind(slow_path->GetExitLabel()); @@ -2059,7 +2058,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pIndexOf).Int32Value()); __ Jalr(T9); __ Nop(); @@ -2071,7 +2070,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, // int java.lang.String.indexOf(int ch) void IntrinsicLocationsBuilderMIPS::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -2096,7 +2095,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringIndexOf(HInvoke* invoke) { // int java.lang.String.indexOf(int ch, int fromIndex) void IntrinsicLocationsBuilderMIPS::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. 
So it's best to align the inputs accordingly. @@ -2122,7 +2121,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringIndexOfAfter(HInvoke* invoke) { // java.lang.StringFactory.newStringFromBytes(byte[] data, int high, int offset, int byteCount) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2145,7 +2144,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromBytes).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromBytes).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2155,7 +2154,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromBytes(HInvoke* invoke) // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2178,7 +2177,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromChars(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromChars).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromChars).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -2187,7 +2186,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromChars(HInvoke* invoke) // java.lang.StringFactory.newStringFromString(String toCopy) void IntrinsicLocationsBuilderMIPS::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -2207,7 +2206,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringNewStringFromString(HInvoke* invoke) __ LoadFromOffset(kLoadWord, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMipsWordSize, pAllocStringFromString).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromString).Int32Value()); __ Jalr(T9); __ Nop(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index cc4971b8f8..1e18540e1a 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -1519,7 +1519,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { // int java.lang.String.compareTo(String anotherString) void IntrinsicLocationsBuilderMIPS64::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, 
Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1543,7 +1543,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringCompareTo(HInvoke* invoke) { __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, pStringCompareTo).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pStringCompareTo).Int32Value()); __ Jalr(T9); __ Nop(); __ Bind(slow_path->GetExitLabel()); @@ -1694,7 +1694,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, pIndexOf).Int32Value()); + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pIndexOf).Int32Value()); CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>(); __ Jalr(T9); __ Nop(); @@ -1707,7 +1707,7 @@ static void GenerateStringIndexOf(HInvoke* invoke, // int java.lang.String.indexOf(int ch) void IntrinsicLocationsBuilderMIPS64::VisitStringIndexOf(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -1728,7 +1728,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringIndexOf(HInvoke* invoke) { // int java.lang.String.indexOf(int ch, int fromIndex) void IntrinsicLocationsBuilderMIPS64::VisitStringIndexOfAfter(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); // We have a hand-crafted assembly stub that follows the runtime // calling convention. So it's best to align the inputs accordingly. @@ -1748,7 +1748,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringIndexOfAfter(HInvoke* invoke) { // java.lang.StringFactory.newStringFromBytes(byte[] data, int high, int offset, int byteCount) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1771,7 +1771,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pAllocStringFromBytes).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); __ Jalr(T9); @@ -1783,7 +1783,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromBytes(HInvoke* invoke // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1805,7 +1805,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromChars(HInvoke* invoke __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, 
pAllocStringFromChars).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); __ Jalr(T9); @@ -1816,7 +1816,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromChars(HInvoke* invoke // java.lang.StringFactory.newStringFromString(String toCopy) void IntrinsicLocationsBuilderMIPS64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1836,7 +1836,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringNewStringFromString(HInvoke* invok __ LoadFromOffset(kLoadDoubleword, T9, TR, - QUICK_ENTRYPOINT_OFFSET(kMips64DoublewordSize, + QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, pAllocStringFromString).Int32Value()); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); __ Jalr(T9); @@ -1879,6 +1879,84 @@ void IntrinsicCodeGeneratorMIPS64::VisitDoubleIsInfinite(HInvoke* invoke) { GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetAssembler()); } +static void GenHighestOneBit(LocationSummary* locations, + Primitive::Type type, + Mips64Assembler* assembler) { + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << PrettyDescriptor(type); + + GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + + if (type == Primitive::kPrimLong) { + __ Dclz(TMP, in); + __ LoadConst64(AT, INT64_C(0x8000000000000000)); + __ Dsrlv(out, AT, TMP); + } else { + __ Clz(TMP, in); + __ LoadConst32(AT, 0x80000000); + __ Srlv(out, AT, TMP); + } + // For either value of "type", when "in" is zero, "out" should also + // be zero. Without this extra "and" operation, when "in" is zero, + // "out" would be either Integer.MIN_VALUE, or Long.MIN_VALUE because + // the MIPS logical shift operations "dsrlv", and "srlv" don't use + // the shift amount (TMP) directly; they use either (TMP % 64) or + // (TMP % 32), respectively. 
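// A hypothetical 32-bit trace of the zero-input case makes the need for the
// final And concrete:
//   in = 0            =>  Clz(TMP, in) sets TMP = 32
//   Srlv uses TMP % 32 = 0, so out = 0x80000000 >> 0 = Integer.MIN_VALUE
//   And(out, out, in) then forces out back to 0, the required result.
// For a non-zero input the And is a no-op, since the single bit left in "out"
// is by construction also set in "in".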
+ __ And(out, out, in); +} + +// int java.lang.Integer.highestOneBit(int) +void IntrinsicLocationsBuilderMIPS64::VisitIntegerHighestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitIntegerHighestOneBit(HInvoke* invoke) { + GenHighestOneBit(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); +} + +// long java.lang.Long.highestOneBit(long) +void IntrinsicLocationsBuilderMIPS64::VisitLongHighestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitLongHighestOneBit(HInvoke* invoke) { + GenHighestOneBit(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); +} + +static void GenLowestOneBit(LocationSummary* locations, + Primitive::Type type, + Mips64Assembler* assembler) { + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong) << PrettyDescriptor(type); + + GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>(); + GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + + if (type == Primitive::kPrimLong) { + __ Dsubu(TMP, ZERO, in); + } else { + __ Subu(TMP, ZERO, in); + } + __ And(out, TMP, in); +} + +// int java.lang.Integer.lowestOneBit(int) +void IntrinsicLocationsBuilderMIPS64::VisitIntegerLowestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitIntegerLowestOneBit(HInvoke* invoke) { + GenLowestOneBit(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); +} + +// long java.lang.Long.lowestOneBit(long) +void IntrinsicLocationsBuilderMIPS64::VisitLongLowestOneBit(HInvoke* invoke) { + CreateIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorMIPS64::VisitLongLowestOneBit(HInvoke* invoke) { + GenLowestOneBit(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); +} + UNIMPLEMENTED_INTRINSIC(MIPS64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(MIPS64, StringGetCharsNoCheck) UNIMPLEMENTED_INTRINSIC(MIPS64, SystemArrayCopyChar) @@ -1902,11 +1980,6 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, MathSinh) UNIMPLEMENTED_INTRINSIC(MIPS64, MathTan) UNIMPLEMENTED_INTRINSIC(MIPS64, MathTanh) -UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerHighestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, LongHighestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerLowestOneBit) -UNIMPLEMENTED_INTRINSIC(MIPS64, LongLowestOneBit) - // 1.8. UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndAddInt) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndAddLong) diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 812bdf550e..49d6c1952c 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -70,6 +70,105 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) { using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. 
+class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + Register src = locations->InAt(0).AsRegister<Register>(); + Location src_pos = locations->InAt(1); + Register dest = locations->InAt(2).AsRegister<Register>(); + Location dest_pos = locations->InAt(3); + Location length = locations->InAt(4); + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + + __ Bind(GetEntryLabel()); + // In this code path, registers `temp1`, `temp2`, and `temp3` + // (resp.) are not used for the base source address, the base + // destination address, and the end source address (resp.), as in + // other SystemArrayCopy intrinsic code paths. Instead they are + // (resp.) used for: + // - the loop index (`i`); + // - the source index (`src_index`) and the loaded (source) + // reference (`value`); and + // - the destination index (`dest_index`). + + // i = 0 + __ xorl(temp1, temp1); + NearLabel loop; + __ Bind(&loop); + // value = src_array[i + src_pos] + if (src_pos.IsConstant()) { + int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset)); + } else { + __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset)); + } + __ MaybeUnpoisonHeapReference(temp2); + // TODO: Inline the mark bit check before calling the runtime? + // value = ReadBarrier::Mark(value) + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more + // explanations.) + DCHECK_NE(temp2, ESP); + DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2; + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2); + // This runtime call does not require a stack map. 
+ x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(temp2); + // dest_array[i + dest_pos] = value + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + int32_t adjusted_offset = offset + constant * element_size; + __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2); + } else { + __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0)); + __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2); + } + // ++i + __ addl(temp1, Immediate(1)); + // if (i != length) goto loop + x86_codegen->GenerateIntCompare(temp1_loc, length); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) { @@ -706,7 +805,7 @@ static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::FpuRegisterLocation(XMM0)); @@ -752,20 +851,20 @@ void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) { GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0); } -// Note that 32 bit x86 doesn't have the capability to inline MathRoundDouble, -// as it needs 64 bit instructions. void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (!kRoundIsPlusPointFive) { - return; - } - // Do we have instruction support? if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) { + HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect(); + DCHECK(static_or_direct != nullptr); LocationSummary* locations = new (arena_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); locations->SetInAt(0, Location::RequiresFpuRegister()); + if (static_or_direct->HasSpecialInput() && + invoke->InputAt( + static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) { + locations->SetInAt(1, Location::RequiresRegister()); + } locations->SetOut(Location::RequiresRegister()); locations->AddTemp(Location::RequiresFpuRegister()); locations->AddTemp(Location::RequiresFpuRegister()); @@ -774,7 +873,7 @@ void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::RegisterLocation(EAX)); @@ -784,54 +883,61 @@ void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) { LocationSummary* locations = invoke->GetLocations(); - if (locations->WillCall()) { + if (locations->WillCall()) { // TODO: can we reach this? 
InvokeOutOfLineIntrinsic(codegen_, invoke); return; } - // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); Register out = locations->Out().AsRegister<Register>(); - XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + NearLabel skip_incr, done; X86Assembler* assembler = GetAssembler(); - // Generate 0.5 into inPlusPointFive. - __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f))); - __ movd(inPlusPointFive, out); - - // Add in the input. - __ addss(inPlusPointFive, in); - - // And truncate to an integer. - __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); - + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5f) + // result = result + 1.0f; + __ movss(t2, in); + __ roundss(t1, in, Immediate(1)); + __ subss(t2, t1); + if (locations->GetInputCount() == 2 && locations->InAt(1).IsValid()) { + // Direct constant area available. + Register constant_area = locations->InAt(1).AsRegister<Register>(); + __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f), constant_area)); + __ j(kBelow, &skip_incr); + __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f), constant_area)); + __ Bind(&skip_incr); + } else { + // No constant area: go through stack. + __ pushl(Immediate(bit_cast<int32_t, float>(0.5f))); + __ pushl(Immediate(bit_cast<int32_t, float>(1.0f))); + __ comiss(t2, Address(ESP, 4)); + __ j(kBelow, &skip_incr); + __ addss(t1, Address(ESP, 0)); + __ Bind(&skip_incr); + __ addl(ESP, Immediate(8)); + } + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. __ movl(out, Immediate(kPrimIntMax)); - // maxInt = int-to-float(out) - __ cvtsi2ss(maxInt, out); - - // if inPlusPointFive >= maxInt goto done - __ comiss(inPlusPointFive, maxInt); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = float-to-int-truncate(input) - __ cvttss2si(out, inPlusPointFive); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + __ cvtsi2ss(t2, out); + __ comiss(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttss2si(out, t1); __ Bind(&done); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -857,7 +963,7 @@ static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86* codegen, QuickEntry } // Now do the actual call. 
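// The VisitMathRoundFloat sequence above is easier to audit as a host-side
// model. This sketch only captures the intended semantics (floor, add one when
// the fractional part is at least 0.5, NaN to zero, clamp at the int range);
// it is not the generated code and the helper name is made up.
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t RoundFloatSketch(float in) {
  float result = std::floor(in);                       // roundss(t1, in, Immediate(1))
  if (in - result >= 0.5f) {                           // comiss against 0.5f
    result += 1.0f;                                    // addss of 1.0f
  }
  if (std::isnan(result)) {
    return 0;                                          // unordered comiss path
  }
  if (result >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
    return std::numeric_limits<int32_t>::max();        // clipped to kPrimIntMax
  }
  if (result <= static_cast<float>(std::numeric_limits<int32_t>::min())) {
    return std::numeric_limits<int32_t>::min();        // cvttss2si produces 0x80000000 here
  }
  return static_cast<int32_t>(result);                 // cvttss2si truncation
}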
- __ fs()->call(Address::Absolute(GetThreadOffset<kX86WordSize>(entry))); + __ fs()->call(Address::Absolute(GetThreadOffset<kX86PointerSize>(entry))); // Extract the return value from the FP stack. __ fstpl(Address(ESP, 0)); @@ -985,7 +1091,7 @@ void IntrinsicCodeGeneratorX86::VisitMathTanh(HInvoke* invoke) { static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -1216,7 +1322,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) { // The inputs plus one temp. LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1237,7 +1343,7 @@ void IntrinsicCodeGeneratorX86::VisitStringCompareTo(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pStringCompareTo))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pStringCompareTo))); __ Bind(slow_path->GetExitLabel()); } @@ -1490,7 +1596,7 @@ void IntrinsicCodeGeneratorX86::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1510,7 +1616,7 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromBytes))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromBytes))); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); __ Bind(slow_path->GetExitLabel()); @@ -1518,7 +1624,7 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) { void IntrinsicLocationsBuilderX86::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1536,14 +1642,14 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromChars(HInvoke* invoke) { // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. 
- __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromChars))); + __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromChars))); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); } void IntrinsicLocationsBuilderX86::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1560,7 +1666,8 @@ void IntrinsicCodeGeneratorX86::VisitStringNewStringFromString(HInvoke* invoke) codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pAllocStringFromString))); + __ fs()->call( + Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, pAllocStringFromString))); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); __ Bind(slow_path->GetExitLabel()); @@ -1801,7 +1908,7 @@ void IntrinsicLocationsBuilderX86::VisitThreadCurrentThread(HInvoke* invoke) { void IntrinsicCodeGeneratorX86::VisitThreadCurrentThread(HInvoke* invoke) { Register out = invoke->GetLocations()->Out().AsRegister<Register>(); - GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86WordSize>())); + GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86PointerSize>())); } static void GenUnsafeGet(HInvoke* invoke, @@ -2670,9 +2777,9 @@ static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) } void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -2702,9 +2809,9 @@ void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -2713,17 +2820,21 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); Register src = locations->InAt(0).AsRegister<Register>(); Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); - Location length = locations->InAt(4); - Register temp1 = locations->GetTemp(0).AsRegister<Register>(); - Register temp2 = locations->GetTemp(1).AsRegister<Register>(); + Location length_arg = locations->InAt(4); + Location length = length_arg; + Location temp1_loc = locations->GetTemp(0); + Register temp1 = temp1_loc.AsRegister<Register>(); + Location temp2_loc = locations->GetTemp(1); + Register temp2 = temp2_loc.AsRegister<Register>(); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -2739,7 +2850,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2747,7 +2858,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -2757,10 +2868,10 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -2769,16 +2880,17 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. 
__ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } - Register temp3 = locations->GetTemp(2).AsRegister<Register>(); + Location temp3_loc = locations->GetTemp(2); + Register temp3 = temp3_loc.AsRegister<Register>(); if (length.IsStackSlot()) { __ movl(temp3, Address(ESP, length.GetStackIndex())); length = Location::RegisterLocation(temp3); @@ -2790,7 +2902,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<Register>(), length.AsRegister<Register>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -2798,7 +2910,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -2807,7 +2919,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -2816,72 +2928,159 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. + if (!optimizations.GetSourceIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // Bail out if the source is not a non primitive array. + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the source is not a non primitive array. 
+ // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp1); + } __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - if (!optimizations.GetDestinationIsNonPrimitiveArray()) { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // Bail out if the destination is not a non primitive array. - // /* HeapReference<Class> */ temp2 = temp1->component_type_ - __ movl(temp2, Address(temp1, component_offset)); - __ testl(temp2, temp2); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp2); - __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - // Re-poison the heap reference to make the compare instruction below - // compare two poisoned references. - __ PoisonHeapReference(temp1); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + if (length.Equals(Location::RegisterLocation(temp3))) { + // When Baker read barriers are enabled, register `temp3`, + // which in the present case contains the `length` parameter, + // will be overwritten below. Make the `length` location + // reference the original stack location; it will be moved + // back to `temp3` later if necessary. + DCHECK(length_arg.IsStackSlot()); + length = length_arg; + } + + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false); + + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + // Bail out if the destination is not a non primitive array. + // + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp2` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // Note: if heap poisoning is on, we are comparing two unpoisoned references here. 
+ __ cmpl(temp1, temp2); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } } else { - // /* HeapReference<Class> */ temp1 = temp1->klass_ - __ movl(temp1, Address(dest, class_offset)); - } + // Non read barrier code. - // Note: if poisoning is on, we are here comparing two poisoned references. - __ cmpl(temp1, Address(src, class_offset)); + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray()) { + __ MaybeUnpoisonHeapReference(temp1); + // Bail out if the destination is not a non primitive array. + // /* HeapReference<Class> */ temp2 = temp1->component_type_ + __ movl(temp2, Address(temp1, component_offset)); + __ testl(temp2, temp2); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(temp2); + __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + // Re-poison the heap reference to make the compare instruction below + // compare two poisoned references. + __ PoisonHeapReference(temp1); + } - if (optimizations.GetDestinationIsTypedObjectArray()) { - NearLabel do_copy; - __ j(kEqual, &do_copy); + // Note: if heap poisoning is on, we are comparing two poisoned references here. + __ cmpl(temp1, Address(src, class_offset)); + + if (optimizations.GetDestinationIsTypedObjectArray()) { + NearLabel do_copy; + __ j(kEqual, &do_copy); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); + __ MaybeUnpoisonHeapReference(temp1); + __ cmpl(Address(temp1, super_offset), Immediate(0)); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + __ Bind(&do_copy); + } else { + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); + } + } + } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { + DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); + // Bail out if the source is not a non primitive array. + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `temp1` has been unpoisoned + // by the the previous call to GenerateFieldLoadWithBakerReadBarrier. 
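For readers less familiar with the mirror::Class layout, the following standalone model (illustrative only, not ART code; the field names merely mirror the offsets used in the generated checks) summarizes the type checks that guard the copy: the source must be a reference array, and the destination must either have the same class as the source or be Object[], which is detected by its component type having a null super class.

#include <cstdint>

struct Klass {                         // minimal stand-in for mirror::Class
  Klass* component_type_;              // null if this class is not an array
  Klass* super_class_;                 // null only for java.lang.Object
  uint16_t primitive_type_;            // kPrimNot for reference component types
};
static constexpr uint16_t kPrimNot = 0;  // assumed value for the reference case

static bool ArrayCopyNeedsSlowPath(const Klass* src_klass,
                                   const Klass* dest_klass,
                                   bool dest_is_typed_object_array) {
  // Source must be a non-primitive (reference) array.
  if (src_klass->component_type_ == nullptr ||
      src_klass->component_type_->primitive_type_ != kPrimNot) {
    return true;
  }
  if (dest_klass == src_klass) {
    return false;                      // identical array types: raw copy is safe
  }
  if (!dest_is_typed_object_array) {
    return true;                       // different types and no Object[] hint
  }
  // dest is Object[] exactly when its component type's super class is null.
  return dest_klass->component_type_->super_class_ != nullptr;
}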
+ } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); __ MaybeUnpoisonHeapReference(temp1); // /* HeapReference<Class> */ temp1 = temp1->component_type_ __ movl(temp1, Address(temp1, component_offset)); + __ testl(temp1, temp1); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); __ MaybeUnpoisonHeapReference(temp1); - __ cmpl(Address(temp1, super_offset), Immediate(0)); - __ j(kNotEqual, slow_path->GetEntryLabel()); - __ Bind(&do_copy); - } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); } - } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { - DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); - // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ testl(temp1, temp1); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(temp1); __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. + // Compute the base source address in `temp1`. int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); DCHECK_EQ(element_size, 4); uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); @@ -2892,35 +3091,136 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); } - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // If it is needed (in the case of the fast-path loop), the base + // destination address is computed later, as `temp2` is used for + // intermediate computations. - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); + // Compute the end source address in `temp3`. + if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + if (length.IsStackSlot()) { + // Location `length` is again pointing at a stack slot, as + // register `temp3` (which was containing the length parameter + // earlier) has been overwritten; restore it now + DCHECK(length.Equals(length_arg)); + __ movl(temp3, Address(ESP, length.GetStackIndex())); + length = Location::RegisterLocation(temp3); + } + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. 
+ // for (size_t i = 0; i != length; ++i) { + // dest_array[dest_pos + i] = + // MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i]))); + // } + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(temp2, Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. + // Note that this is a no-op, thanks to the x86 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp2, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + + // Set the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); } else { - __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); - } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ pushl(Address(temp1, 0)); - __ cfi().AdjustCFAOffset(4); - __ popl(Address(temp2, 0)); - __ cfi().AdjustCFAOffset(-4); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + // Non read barrier code. + + // Compute the base destination address in `temp2`. + if (dest_pos.IsConstant()) { + int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp2, Address(dest, element_size * constant + offset)); + } else { + __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); + } + + // Compute the end source address in `temp3`. 
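The "shift the read barrier state out of the lock word" trick is compact but easy to misread; the sketch below (standalone, not ART code; the shift amount is a placeholder, while the white/gray encoding matches the static_asserts in the generated code) shows the equivalent bit test that the SHR plus jump-on-carry performs.

#include <cstdint>

// Placeholder bit position standing in for LockWord::kReadBarrierStateShift.
static constexpr uint32_t kReadBarrierStateShift = 28;

static bool LockWordIsGray(uint32_t monitor_word) {
  // shrl(temp, kReadBarrierStateShift + 1) moves the low bit of rb_state into
  // the carry flag; testing that bit directly is the same check.
  // white == 0 (bit clear), gray == 1 (bit set).
  return ((monitor_word >> kReadBarrierStateShift) & 1u) != 0u;
}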
+ if (length.IsConstant()) { + int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(temp3, Address(temp1, element_size * constant)); + } else { + __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + } + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ pushl(Address(temp1, 0)); + __ cfi().AdjustCFAOffset(4); + __ popl(Address(temp2, 0)); + __ cfi().AdjustCFAOffset(-4); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. codegen_->MarkGCCard(temp1, @@ -2929,7 +3229,7 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { Register(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 891aaf5ff9..311e1cd6eb 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -64,6 +64,65 @@ static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) { using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>; +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT + +// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. +class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode { + public: + explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction) + : SlowPathCode(instruction) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(instruction_->IsInvokeStaticOrDirect()) + << "Unexpected instruction in read barrier arraycopy slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); + + int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + + CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>(); + CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>(); + + __ Bind(GetEntryLabel()); + NearLabel loop; + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(src_curr_addr, 0)); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + // TODO: Inline the mark bit check before calling the runtime? + // TMP = ReadBarrier::Mark(TMP); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP); + // This runtime call does not require a stack map. 
+ x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + __ MaybePoisonHeapReference(CpuRegister(TMP)); + __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP)); + __ addl(src_curr_addr, Immediate(element_size)); + __ addl(dst_curr_addr, Immediate(element_size)); + __ cmpl(src_curr_addr, src_stop_addr); + __ j(kNotEqual, &loop); + __ jmp(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64); +}; + +#undef __ + #define __ assembler-> static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -526,7 +585,7 @@ static void CreateSSE41FPToFPLocations(ArenaAllocator* arena, // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::FpuRegisterLocation(XMM0)); @@ -583,12 +642,13 @@ static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, locations->SetInAt(0, Location::RequiresFpuRegister()); locations->SetOut(Location::RequiresRegister()); locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); return; } // We have to fall back to a call to the intrinsic. LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall); + LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); @@ -597,10 +657,7 @@ static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, } void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (kRoundIsPlusPointFive) { - CreateSSE41FPToIntLocations(arena_, invoke, codegen_); - } + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { @@ -610,47 +667,41 @@ void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { return; } - // Implement RoundFloat as t1 = floor(input + 0.5f); convert to int. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + NearLabel skip_incr, done; X86_64Assembler* assembler = GetAssembler(); - // Load 0.5 into inPlusPointFive. - __ movss(inPlusPointFive, codegen_->LiteralFloatAddress(0.5f)); - - // Add in the input. - __ addss(inPlusPointFive, in); - - // And truncate to an integer. - __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1)); - - // Load maxInt into out. 
- codegen_->Load64BitValue(out, kPrimIntMax); - - // if inPlusPointFive >= maxInt goto done - __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax))); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = float-to-int-truncate(input) - __ cvttss2si(out, inPlusPointFive); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5f) + // result = result + 1.0f; + __ movss(t2, in); + __ roundss(t1, in, Immediate(1)); + __ subss(t2, t1); + __ comiss(t2, codegen_->LiteralFloatAddress(0.5f)); + __ j(kBelow, &skip_incr); + __ addss(t1, codegen_->LiteralFloatAddress(1.0f)); + __ Bind(&skip_incr); + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. + codegen_->Load32BitValue(out, kPrimIntMax); + __ cvtsi2ss(t2, out); + __ comiss(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttss2si(out, t1); __ Bind(&done); } void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) { - // See intrinsics.h. - if (kRoundIsPlusPointFive) { - CreateSSE41FPToIntLocations(arena_, invoke, codegen_); - } + CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) { @@ -660,46 +711,43 @@ void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) { return; } - // Implement RoundDouble as t1 = floor(input + 0.5); convert to long. XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); - NearLabel done, nan; + XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + NearLabel skip_incr, done; X86_64Assembler* assembler = GetAssembler(); - // Load 0.5 into inPlusPointFive. - __ movsd(inPlusPointFive, codegen_->LiteralDoubleAddress(0.5)); - - // Add in the input. - __ addsd(inPlusPointFive, in); - - // And truncate to an integer. - __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1)); - - // Load maxLong into out. + // Since no direct x86 rounding instruction matches the required semantics, + // this intrinsic is implemented as follows: + // result = floor(in); + // if (in - result >= 0.5) + // result = result + 1.0f; + __ movsd(t2, in); + __ roundsd(t1, in, Immediate(1)); + __ subsd(t2, t1); + __ comisd(t2, codegen_->LiteralDoubleAddress(0.5)); + __ j(kBelow, &skip_incr); + __ addsd(t1, codegen_->LiteralDoubleAddress(1.0f)); + __ Bind(&skip_incr); + + // Final conversion to an integer. Unfortunately this also does not have a + // direct x86 instruction, since NaN should map to 0 and large positive + // values need to be clipped to the extreme value. 
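The new Math.round sequences are easier to follow as plain C++; the sketch below (illustrative only, using the standard library rather than ART helpers) spells out the float variant: floor, a conditional increment when the fractional part is at least 0.5, then a final conversion in which NaN maps to 0 and out-of-range values clamp to the integer extremes.

#include <cmath>
#include <cstdint>
#include <limits>

static int32_t RoundFloatSketch(float in) {
  float result = std::floor(in);             // roundss with rounding mode 1 (floor)
  if (in - result >= 0.5f) {                 // subss + comiss against 0.5f
    result += 1.0f;                          // addss of 1.0f
  }
  // Final conversion: NaN -> 0, overflow clamps to the int32 extremes
  // (the generated code gets the negative case from cvttss2si's behavior).
  if (std::isnan(result)) {
    return 0;
  }
  if (result >= static_cast<float>(std::numeric_limits<int32_t>::max())) {
    return std::numeric_limits<int32_t>::max();
  }
  if (result <= static_cast<float>(std::numeric_limits<int32_t>::min())) {
    return std::numeric_limits<int32_t>::min();
  }
  return static_cast<int32_t>(result);       // cvttss2si
}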
codegen_->Load64BitValue(out, kPrimLongMax); - - // if inPlusPointFive >= maxLong goto done - __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax))); - __ j(kAboveEqual, &done); - - // if input == NaN goto nan - __ j(kUnordered, &nan); - - // output = double-to-long-truncate(input) - __ cvttsd2si(out, inPlusPointFive, /* is64bit */ true); - __ jmp(&done); - __ Bind(&nan); - - // output = 0 - __ xorl(out, out); + __ cvtsi2sd(t2, out, /* is64bit */ true); + __ comisd(t1, t2); + __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered + __ movl(out, Immediate(0)); // does not change flags, implicit zero extension to 64-bit + __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out) + __ cvttsd2si(out, t1, /* is64bit */ true); __ Bind(&done); } static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -720,7 +768,7 @@ static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen, DCHECK(invoke->IsInvokeStaticOrDirect()); X86_64Assembler* assembler = codegen->GetAssembler(); - __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64WordSize>(entry), true)); + __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64PointerSize>(entry), true)); codegen->RecordPcInfo(invoke, invoke->GetDexPc()); } @@ -839,7 +887,7 @@ void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) { static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); @@ -1064,9 +1112,9 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - if (kEmitCompilerReadBarrier) { + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { return; } @@ -1074,9 +1122,9 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { - // TODO(rpl): Implement read barriers in the SystemArrayCopy - // intrinsic and re-enable it (b/29516905). - DCHECK(!kEmitCompilerReadBarrier); + // The only read barrier implementation supporting the + // SystemArrayCopy intrinsic is the Baker-style read barriers. 
+ DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); X86_64Assembler* assembler = GetAssembler(); LocationSummary* locations = invoke->GetLocations(); @@ -1085,18 +1133,23 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>(); Location src_pos = locations->InAt(1); CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>(); - CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>(); - CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>(); + Location temp1_loc = locations->GetTemp(0); + CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>(); + Location temp2_loc = locations->GetTemp(1); + CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>(); + Location temp3_loc = locations->GetTemp(2); + CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>(); + Location TMP_loc = Location::RegisterLocation(TMP); - SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke); + codegen_->AddSlowPath(intrinsic_slow_path); NearLabel conditions_on_positions_validated; SystemArrayCopyOptimizations optimizations(invoke); @@ -1112,7 +1165,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { DCHECK_GE(src_pos_constant, dest_pos_constant); } else if (src_pos_constant < dest_pos_constant) { __ cmpl(src, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1120,7 +1173,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, &conditions_on_positions_validated); } __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant)); - __ j(kGreater, slow_path->GetEntryLabel()); + __ j(kGreater, intrinsic_slow_path->GetEntryLabel()); } } else { if (!optimizations.GetDestinationIsSource()) { @@ -1130,10 +1183,10 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (dest_pos.IsConstant()) { int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant)); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } else { __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } } @@ -1142,13 +1195,13 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (!optimizations.GetSourceIsNotNull()) { // Bail out if the source is null. __ testl(src, src); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) { // Bail out if the destination is null. 
__ testl(dest, dest); - __ j(kEqual, slow_path->GetEntryLabel()); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); } // If the length is negative, bail out. @@ -1157,7 +1210,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { !optimizations.GetCountIsSourceLength() && !optimizations.GetCountIsDestinationLength()) { __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>()); - __ j(kLess, slow_path->GetEntryLabel()); + __ j(kLess, intrinsic_slow_path->GetEntryLabel()); } // Validity checks: source. @@ -1165,7 +1218,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { src_pos, src, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsSourceLength()); @@ -1174,7 +1227,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { dest_pos, dest, length, - slow_path, + intrinsic_slow_path, temp1, optimizations.GetCountIsDestinationLength()); @@ -1183,38 +1236,80 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { // type of the destination array. We do two checks: the classes are the same, // or the destination is Object[]. If none of these checks succeed, we go to the // slow path. - __ movl(temp1, Address(dest, class_offset)); - __ movl(temp2, Address(src, class_offset)); + bool did_unpoison = false; - if (!optimizations.GetDestinationIsNonPrimitiveArray() || - !optimizations.GetSourceIsNonPrimitiveArray()) { - // One or two of the references need to be unpoisoned. Unpoison them - // both to make the identity check valid. - __ MaybeUnpoisonHeapReference(temp1); - __ MaybeUnpoisonHeapReference(temp2); - did_unpoison = true; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = dest->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, dest, class_offset, temp3_loc, /* needs_null_check */ false); + // Register `temp1` is not trashed by the read barrier emitted + // by GenerateFieldLoadWithBakerReadBarrier below, as that + // method produces a call to a ReadBarrierMarkRegX entry point, + // which saves all potentially live registers, including + // temporaries such a `temp1`. + // /* HeapReference<Class> */ temp2 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // If heap poisoning is enabled, `temp1` and `temp2` have been + // unpoisoned by the the previous calls to + // GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ temp1 = dest->klass_ + __ movl(temp1, Address(dest, class_offset)); + // /* HeapReference<Class> */ temp2 = src->klass_ + __ movl(temp2, Address(src, class_offset)); + if (!optimizations.GetDestinationIsNonPrimitiveArray() || + !optimizations.GetSourceIsNonPrimitiveArray()) { + // One or two of the references need to be unpoisoned. Unpoison them + // both to make the identity check valid. + __ MaybeUnpoisonHeapReference(temp1); + __ MaybeUnpoisonHeapReference(temp2); + did_unpoison = true; + } } if (!optimizations.GetDestinationIsNonPrimitiveArray()) { // Bail out if the destination is not a non primitive array. 
- // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } if (!optimizations.GetSourceIsNonPrimitiveArray()) { // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ TMP = temp2->component_type_ - __ movl(CpuRegister(TMP), Address(temp2, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // For the same reason given earlier, `temp1` is not trashed by the + // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below. + // /* HeapReference<Class> */ TMP = temp2->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp2, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + // If heap poisoning is enabled, `TMP` has been unpoisoned by + // the the previous call to GenerateFieldLoadWithBakerReadBarrier. + } else { + // /* HeapReference<Class> */ TMP = temp2->component_type_ + __ movl(CpuRegister(TMP), Address(temp2, component_offset)); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } __ cmpl(temp1, temp2); @@ -1222,34 +1317,56 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { if (optimizations.GetDestinationIsTypedObjectArray()) { NearLabel do_copy; __ j(kEqual, &do_copy); - if (!did_unpoison) { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + // We do not need to emit a read barrier for the following + // heap reference load, as `temp1` is only used in a + // comparison with null below, and this reference is not + // kept afterwards. 
+ __ cmpl(Address(temp1, super_offset), Immediate(0)); + } else { + if (!did_unpoison) { + __ MaybeUnpoisonHeapReference(temp1); + } + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ movl(temp1, Address(temp1, component_offset)); __ MaybeUnpoisonHeapReference(temp1); + // No need to unpoison the following heap reference load, as + // we're comparing against null. + __ cmpl(Address(temp1, super_offset), Immediate(0)); } - // /* HeapReference<Class> */ temp1 = temp1->component_type_ - __ movl(temp1, Address(temp1, component_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ temp1 = temp1->super_class_ - __ movl(temp1, Address(temp1, super_offset)); - // No need to unpoison the result, we're comparing against null. - __ testl(temp1, temp1); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); __ Bind(&do_copy); } else { - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } } else if (!optimizations.GetSourceIsNonPrimitiveArray()) { DCHECK(optimizations.GetDestinationIsNonPrimitiveArray()); // Bail out if the source is not a non primitive array. - // /* HeapReference<Class> */ temp1 = src->klass_ - __ movl(temp1, Address(src, class_offset)); - __ MaybeUnpoisonHeapReference(temp1); - // /* HeapReference<Class> */ TMP = temp1->component_type_ - __ movl(CpuRegister(TMP), Address(temp1, component_offset)); - __ testl(CpuRegister(TMP), CpuRegister(TMP)); - __ j(kEqual, slow_path->GetEntryLabel()); - __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // /* HeapReference<Class> */ temp1 = src->klass_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, temp1_loc, src, class_offset, temp3_loc, /* needs_null_check */ false); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + codegen_->GenerateFieldLoadWithBakerReadBarrier( + invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false); + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + } else { + // /* HeapReference<Class> */ temp1 = src->klass_ + __ movl(temp1, Address(src, class_offset)); + __ MaybeUnpoisonHeapReference(temp1); + // /* HeapReference<Class> */ TMP = temp1->component_type_ + __ movl(CpuRegister(TMP), Address(temp1, component_offset)); + // No need to unpoison `TMP` now, as we're comparing against null. + __ testl(CpuRegister(TMP), CpuRegister(TMP)); + __ j(kEqual, intrinsic_slow_path->GetEntryLabel()); + __ MaybeUnpoisonHeapReference(CpuRegister(TMP)); + } __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot)); - __ j(kNotEqual, slow_path->GetEntryLabel()); + __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } // Compute base source address, base destination address, and end source address. @@ -1277,19 +1394,86 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0)); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. 
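The leal-based pointer setup is the same on x86 and x86-64; as a reference, this small standalone helper (illustrative, not ART code) computes the three addresses the copy loop works with.

#include <cstdint>

static void ComputeCopyBounds(uintptr_t src, uintptr_t dest,
                              uint32_t src_pos, uint32_t dest_pos, uint32_t length,
                              uint32_t data_offset,  // mirror::Array::DataOffset(4)
                              uintptr_t* src_ptr, uintptr_t* dest_ptr, uintptr_t* end_ptr) {
  constexpr uint32_t kElementSize = 4;  // size of a heap reference
  *src_ptr  = src  + data_offset + src_pos  * kElementSize;  // leal temp1, [src + src_pos*4 + offset]
  *dest_ptr = dest + data_offset + dest_pos * kElementSize;  // leal temp2, [dest + dest_pos*4 + offset]
  *end_ptr  = *src_ptr + length * kElementSize;              // leal temp3, [temp1 + length*4]
}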
- NearLabel loop, done; - __ cmpl(temp1, temp3); - __ j(kEqual, &done); - __ Bind(&loop); - __ movl(CpuRegister(TMP), Address(temp1, 0)); - __ movl(Address(temp2, 0), CpuRegister(TMP)); - __ addl(temp1, Immediate(element_size)); - __ addl(temp2, Immediate(element_size)); - __ cmpl(temp1, temp3); - __ j(kNotEqual, &loop); - __ Bind(&done); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier): + // + // if (src_ptr != end_ptr) { + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + // } + + NearLabel loop, done; + + // Don't enter copy loop if `length == 0`. + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + + // /* int32_t */ monitor = src->monitor_ + __ movl(CpuRegister(TMP), Address(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Load fence to prevent load-load reordering. + // Note that this is a no-op, thanks to the x86-64 memory model. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + + // Slow path used to copy array when `src` is gray. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(CpuRegister(TMP), Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + __ Bind(&done); + } else { + // Non read barrier code. + + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + NearLabel loop, done; + __ cmpl(temp1, temp3); + __ j(kEqual, &done); + __ Bind(&loop); + __ movl(CpuRegister(TMP), Address(temp1, 0)); + __ movl(Address(temp2, 0), CpuRegister(TMP)); + __ addl(temp1, Immediate(element_size)); + __ addl(temp2, Immediate(element_size)); + __ cmpl(temp1, temp3); + __ j(kNotEqual, &loop); + __ Bind(&done); + } // We only need one card marking on the destination array. 
codegen_->MarkGCCard(temp1, @@ -1298,12 +1482,12 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { CpuRegister(kNoRegister), /* value_can_be_null */ false); - __ Bind(slow_path->GetExitLabel()); + __ Bind(intrinsic_slow_path->GetExitLabel()); } void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1324,7 +1508,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) { codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pStringCompareTo), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, pStringCompareTo), /* no_rip */ true)); __ Bind(slow_path->GetExitLabel()); } @@ -1577,7 +1761,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) { void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1597,7 +1781,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromBytes), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1606,7 +1791,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainOnly, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1624,7 +1809,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data) // // all include a null check on `data` before calling that method. 
- __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromChars), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1632,7 +1818,7 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + LocationSummary::kCallOnMainAndSlowPath, kIntrinsified); InvokeRuntimeCallingConvention calling_convention; locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); @@ -1649,7 +1835,8 @@ void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invok codegen_->AddSlowPath(slow_path); __ j(kEqual, slow_path->GetEntryLabel()); - __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString), + __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, + pAllocStringFromString), /* no_rip */ true)); CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>(); codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); @@ -1875,7 +2062,7 @@ void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) { CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>(); - GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(), + GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(), /* no_rip */ true)); } diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h index 3f27c911be..5fdfb9b6ca 100644 --- a/compiler/optimizing/locations.h +++ b/compiler/optimizing/locations.h @@ -376,6 +376,10 @@ class Location : public ValueObject { return PolicyField::Decode(GetPayload()); } + bool RequiresRegisterKind() const { + return GetPolicy() == kRequiresRegister || GetPolicy() == kRequiresFpuRegister; + } + uintptr_t GetEncoding() const { return GetPayload(); } @@ -480,8 +484,9 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> { public: enum CallKind { kNoCall, + kCallOnMainAndSlowPath, kCallOnSlowPath, - kCall + kCallOnMainOnly }; LocationSummary(HInstruction* instruction, @@ -540,10 +545,29 @@ class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> { Location Out() const { return output_; } - bool CanCall() const { return call_kind_ != kNoCall; } - bool WillCall() const { return call_kind_ == kCall; } - bool OnlyCallsOnSlowPath() const { return call_kind_ == kCallOnSlowPath; } - bool NeedsSafepoint() const { return CanCall(); } + bool CanCall() const { + return call_kind_ != kNoCall; + } + + bool WillCall() const { + return call_kind_ == kCallOnMainOnly || call_kind_ == kCallOnMainAndSlowPath; + } + + bool CallsOnSlowPath() const { + return call_kind_ == kCallOnSlowPath || call_kind_ == kCallOnMainAndSlowPath; + } + + bool OnlyCallsOnSlowPath() const { + return call_kind_ == kCallOnSlowPath; + } + + bool CallsOnMainAndSlowPath() const { + return call_kind_ == kCallOnMainAndSlowPath; + } + + bool NeedsSafepoint() const { + return CanCall(); + } void SetStackBit(uint32_t index) { stack_mask_->SetBit(index); @@ -629,8 +653,7 @@ class LocationSummary 
: public ArenaObject<kArenaAllocLocationSummary> { // Whether these are locations for an intrinsified call. bool intrinsified_; - ART_FRIEND_TEST(RegisterAllocatorTest, ExpectedInRegisterHint); - ART_FRIEND_TEST(RegisterAllocatorTest, SameAsFirstInputHint); + friend class RegisterAllocatorTest; DISALLOW_COPY_AND_ASSIGN(LocationSummary); }; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index d557f42968..2808e1b5fc 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2632,4 +2632,23 @@ std::ostream& operator<<(std::ostream& os, TypeCheckKind rhs) { } } +std::ostream& operator<<(std::ostream& os, const MemBarrierKind& kind) { + switch (kind) { + case MemBarrierKind::kAnyStore: + return os << "AnyStore"; + case MemBarrierKind::kLoadAny: + return os << "LoadAny"; + case MemBarrierKind::kStoreStore: + return os << "StoreStore"; + case MemBarrierKind::kAnyAny: + return os << "AnyAny"; + case MemBarrierKind::kNTStoreStore: + return os << "NTStoreStore"; + + default: + LOG(FATAL) << "Unknown MemBarrierKind: " << static_cast<int>(kind); + UNREACHABLE(); + } +} + } // namespace art diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 0f0ef26ea9..dfa8276651 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -25,7 +25,6 @@ #include "base/arena_containers.h" #include "base/arena_object.h" #include "base/stl_util.h" -#include "dex/compiler_enums.h" #include "dex_file.h" #include "entrypoints/quick/quick_entrypoints_enum.h" #include "handle.h" @@ -1289,7 +1288,8 @@ class HLoopInformationOutwardIterator : public ValueObject { #else #define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M) \ M(BitwiseNegatedRight, Instruction) \ - M(MultiplyAccumulate, Instruction) + M(MultiplyAccumulate, Instruction) \ + M(IntermediateAddress, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm @@ -1303,8 +1303,7 @@ class HLoopInformationOutwardIterator : public ValueObject { #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) #else #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) \ - M(Arm64DataProcWithShifterOp, Instruction) \ - M(Arm64IntermediateAddress, Instruction) + M(Arm64DataProcWithShifterOp, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_mips @@ -5626,9 +5625,12 @@ inline uint32_t HLoadClass::GetDexCacheElementOffset() const { // Note: defined outside class to see operator<<(., HLoadClass::LoadKind). inline void HLoadClass::AddSpecialInput(HInstruction* special_input) { - // The special input is used for PC-relative loads on some architectures. + // The special input is used for PC-relative loads on some architectures, + // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || - GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind(); + GetLoadKind() == LoadKind::kDexCachePcRelative || + GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || + GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind(); DCHECK(special_input_.GetInstruction() == nullptr); special_input_ = HUserRecord<HInstruction*>(special_input); special_input->AddUseAt(this, 0); @@ -5836,9 +5838,12 @@ inline uint32_t HLoadString::GetDexCacheElementOffset() const { // Note: defined outside class to see operator<<(., HLoadString::LoadKind). inline void HLoadString::AddSpecialInput(HInstruction* special_input) { - // The special input is used for PC-relative loads on some architectures. 
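Since the old kCall kind is split in two, the intent of the new predicates in locations.h can be hard to keep straight from the diff alone; the standalone snippet below (illustrative only, mirroring the enum and predicate logic from the diff) spells out the resulting truth table, with the intrinsic examples named only as an interpretation of the call-kind changes above.

#include <cassert>

enum CallKind { kNoCall, kCallOnMainAndSlowPath, kCallOnSlowPath, kCallOnMainOnly };

static bool WillCall(CallKind k)            { return k == kCallOnMainOnly || k == kCallOnMainAndSlowPath; }
static bool CallsOnSlowPath(CallKind k)     { return k == kCallOnSlowPath || k == kCallOnMainAndSlowPath; }
static bool OnlyCallsOnSlowPath(CallKind k) { return k == kCallOnSlowPath; }

int main() {
  // StringCompareTo-style intrinsic: main-path runtime call plus an explicit slow path.
  assert(WillCall(kCallOnMainAndSlowPath) && CallsOnSlowPath(kCallOnMainAndSlowPath));
  // NewStringFromChars-style intrinsic: only the main-path runtime call.
  assert(WillCall(kCallOnMainOnly) && !CallsOnSlowPath(kCallOnMainOnly));
  // An instruction that only calls from its slow path.
  assert(!WillCall(kCallOnSlowPath) && OnlyCallsOnSlowPath(kCallOnSlowPath));
  return 0;
}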
+ // The special input is used for PC-relative loads on some architectures, + // including literal pool loads, which are PC-relative too. DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative || - GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind(); + GetLoadKind() == LoadKind::kDexCachePcRelative || + GetLoadKind() == LoadKind::kBootImageLinkTimeAddress || + GetLoadKind() == LoadKind::kBootImageAddress) << GetLoadKind(); // HLoadString::GetInputRecords() returns an empty array at this point, // so use the GetInputRecords() from the base class to set the input record. DCHECK(special_input_.GetInstruction() == nullptr); @@ -6305,6 +6310,32 @@ class HCheckCast FINAL : public HTemplateInstruction<2> { DISALLOW_COPY_AND_ASSIGN(HCheckCast); }; +/** + * @brief Memory barrier types (see "The JSR-133 Cookbook for Compiler Writers"). + * @details We define the combined barrier types that are actually required + * by the Java Memory Model, rather than using exactly the terminology from + * the JSR-133 cookbook. These should, in many cases, be replaced by acquire/release + * primitives. Note that the JSR-133 cookbook generally does not deal with + * store atomicity issues, and the recipes there are not always entirely sufficient. + * The current recipe is as follows: + * -# Use AnyStore ~= (LoadStore | StoreStore) ~= release barrier before volatile store. + * -# Use AnyAny barrier after volatile store. (StoreLoad is as expensive.) + * -# Use LoadAny barrier ~= (LoadLoad | LoadStore) ~= acquire barrier after each volatile load. + * -# Use StoreStore barrier after all stores but before return from any constructor whose + * class has final fields. + * -# Use NTStoreStore to order non-temporal stores with respect to all later + * store-to-memory instructions. Only generated together with non-temporal stores. + */ +enum MemBarrierKind { + kAnyStore, + kLoadAny, + kStoreStore, + kAnyAny, + kNTStoreStore, + kLastBarrierKind = kNTStoreStore +}; +std::ostream& operator<<(std::ostream& os, const MemBarrierKind& kind); + class HMemoryBarrier FINAL : public HTemplateInstruction<0> { public: explicit HMemoryBarrier(MemBarrierKind barrier_kind, uint32_t dex_pc = kNoDexPc) diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h index 06b073c3e2..3f88717c2a 100644 --- a/compiler/optimizing/nodes_arm64.h +++ b/compiler/optimizing/nodes_arm64.h @@ -94,32 +94,6 @@ class HArm64DataProcWithShifterOp FINAL : public HExpression<2> { std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op); -// This instruction computes an intermediate address pointing in the 'middle' of an object. The -// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is -// never used across anything that can trigger GC. 
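As a side note, the barrier kinds and the streaming operator introduced above can be mirrored in a small standalone program (plain C++, names copied from the enum; not ART code) to show how the kinds print and how the recipe orders barriers around a volatile store:

#include <iostream>

// Standalone mirror of the MemBarrierKind names above, for illustration only.
enum MemBarrierKind { kAnyStore, kLoadAny, kStoreStore, kAnyAny, kNTStoreStore };

std::ostream& operator<<(std::ostream& os, MemBarrierKind kind) {
  switch (kind) {
    case kAnyStore:     return os << "AnyStore";
    case kLoadAny:      return os << "LoadAny";
    case kStoreStore:   return os << "StoreStore";
    case kAnyAny:       return os << "AnyAny";
    case kNTStoreStore: return os << "NTStoreStore";
  }
  return os;
}

int main() {
  // Per the recipe above: a release-like AnyStore barrier precedes a volatile store,
  // and an AnyAny barrier follows it.
  std::cout << kAnyStore << " -> volatile store -> " << kAnyAny << std::endl;
  return 0;
}
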
-class HArm64IntermediateAddress FINAL : public HExpression<2> { - public: - HArm64IntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc) - : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) { - SetRawInputAt(0, base_address); - SetRawInputAt(1, offset); - } - - bool CanBeMoved() const OVERRIDE { return true; } - bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { - return true; - } - bool IsActualObject() const OVERRIDE { return false; } - - HInstruction* GetBaseAddress() const { return InputAt(0); } - HInstruction* GetOffset() const { return InputAt(1); } - - DECLARE_INSTRUCTION(Arm64IntermediateAddress); - - private: - DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress); -}; - } // namespace art #endif // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_ diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index f2d5cf3253..8bd8667f84 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -113,6 +113,34 @@ class HBitwiseNegatedRight FINAL : public HBinaryOperation { DISALLOW_COPY_AND_ASSIGN(HBitwiseNegatedRight); }; + +// This instruction computes an intermediate address pointing in the 'middle' of an object. The +// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is +// never used across anything that can trigger GC. +class HIntermediateAddress FINAL : public HExpression<2> { + public: + HIntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc) + : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) { + SetRawInputAt(0, base_address); + SetRawInputAt(1, offset); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetBaseAddress() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + + DECLARE_INSTRUCTION(IntermediateAddress); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); +}; + + } // namespace art #endif // ART_COMPILER_OPTIMIZING_NODES_SHARED_H_ diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 2f59d4cd5b..0819fb01ac 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -37,7 +37,10 @@ class HOptimization : public ArenaObject<kArenaAllocOptimization> { virtual ~HOptimization() {} - // Return the name of the pass. + // Return the name of the pass. Pass names for a single HOptimization should be of form + // <optimization_name> or <optimization_name>$<pass_name> for common <optimization_name> prefix. + // Example: 'instruction_simplifier', 'instruction_simplifier$after_bce', + // 'instruction_simplifier$before_codegen'. const char* GetPassName() const { return pass_name_; } // Perform the analysis itself. 
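A minimal standalone sketch (plain C++, not ART code) of the pass-name convention described above: the '$' separator corresponds to the kPassNameSeparator constant introduced later in optimizing_compiler.cc, and a name such as "instruction_simplifier$after_bce" splits into a base optimization name plus an instance suffix.

#include <iostream>
#include <string>
#include <utility>

static std::pair<std::string, std::string> SplitPassName(const std::string& pass_name) {
  size_t pos = pass_name.find('$');
  if (pos == std::string::npos) {
    return std::make_pair(pass_name, std::string());
  }
  return std::make_pair(pass_name.substr(0, pos), pass_name.substr(pos + 1));
}

int main() {
  std::pair<std::string, std::string> parts = SplitPassName("instruction_simplifier$after_bce");
  // Prints: instruction_simplifier (after_bce)
  std::cout << parts.first << " (" << parts.second << ")" << std::endl;
  return 0;
}
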
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc index a6d234d739..8c0231e1aa 100644 --- a/compiler/optimizing/optimizing_cfi_test.cc +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -157,13 +157,26 @@ class OptimizingCFITest : public CFITest { TestImpl(isa, #isa, expected_asm, expected_cfi); \ } +#ifdef ART_ENABLE_CODEGEN_arm TEST_ISA(kThumb2) +#endif +#ifdef ART_ENABLE_CODEGEN_arm64 TEST_ISA(kArm64) +#endif +#ifdef ART_ENABLE_CODEGEN_x86 TEST_ISA(kX86) +#endif +#ifdef ART_ENABLE_CODEGEN_x86_64 TEST_ISA(kX86_64) +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_ISA(kMips) +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_ISA(kMips64) +#endif +#ifdef ART_ENABLE_CODEGEN_arm TEST_F(OptimizingCFITest, kThumb2Adjust) { std::vector<uint8_t> expected_asm( expected_asm_kThumb2_adjust, @@ -184,7 +197,9 @@ TEST_F(OptimizingCFITest, kThumb2Adjust) { Finish(); Check(kThumb2, "kThumb2_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips TEST_F(OptimizingCFITest, kMipsAdjust) { // One NOP in delay slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. static constexpr size_t kNumNops = 1u + (1u << 15); @@ -212,7 +227,9 @@ TEST_F(OptimizingCFITest, kMipsAdjust) { Finish(); Check(kMips, "kMips_adjust", expected_asm, expected_cfi); } +#endif +#ifdef ART_ENABLE_CODEGEN_mips64 TEST_F(OptimizingCFITest, kMips64Adjust) { // One NOP in forbidden slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum. static constexpr size_t kNumNops = 1u + (1u << 15); @@ -240,6 +257,7 @@ TEST_F(OptimizingCFITest, kMips64Adjust) { Finish(); Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi); } +#endif #endif // ART_TARGET_ANDROID diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index d703b0f94f..a1da20bae4 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -37,6 +37,10 @@ #include "pc_relative_fixups_x86.h" #endif +#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) +#include "x86_memory_gen.h" +#endif + #include "art_method-inl.h" #include "base/arena_allocator.h" #include "base/arena_containers.h" @@ -77,7 +81,7 @@ #include "oat_quick_method_header.h" #include "prepare_for_register_allocation.h" #include "reference_type_propagation.h" -#include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "select_generator.h" #include "sharpening.h" #include "side_effects_analysis.h" @@ -91,6 +95,8 @@ namespace art { static constexpr size_t kArenaAllocatorMemoryReportThreshold = 8 * MB; +static constexpr const char* kPassNameSeparator = "$"; + /** * Used by the code generator, to allocate the code in a vector. */ @@ -174,6 +180,7 @@ class PassObserver : public ValueObject { private: void StartPass(const char* pass_name) { + VLOG(compiler) << "Starting pass: " << pass_name; // Dump graph first, then start timer. 
if (visualizer_enabled_) { visualizer_.DumpGraph(pass_name, /* is_after_pass */ false, graph_in_bad_state_); @@ -262,7 +269,7 @@ class PassScope : public ValueObject { class OptimizingCompiler FINAL : public Compiler { public: explicit OptimizingCompiler(CompilerDriver* driver); - ~OptimizingCompiler(); + ~OptimizingCompiler() OVERRIDE; bool CanCompileMethod(uint32_t method_idx, const DexFile& dex_file) const OVERRIDE; @@ -277,8 +284,13 @@ class OptimizingCompiler FINAL : public Compiler { CompiledMethod* JniCompile(uint32_t access_flags, uint32_t method_idx, - const DexFile& dex_file) const OVERRIDE { - return ArtQuickJniCompileMethod(GetCompilerDriver(), access_flags, method_idx, dex_file); + const DexFile& dex_file, + JniOptimizationFlags optimization_flags) const OVERRIDE { + return ArtQuickJniCompileMethod(GetCompilerDriver(), + access_flags, + method_idx, + dex_file, + optimization_flags); } uintptr_t GetEntryPointOf(ArtMethod* method) const OVERRIDE @@ -302,6 +314,18 @@ class OptimizingCompiler FINAL : public Compiler { SHARED_REQUIRES(Locks::mutator_lock_); private: + void RunOptimizations(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const; + + void RunOptimizations(HOptimization* optimizations[], + size_t length, + PassObserver* pass_observer) const; + + private: // Create a 'CompiledMethod' for an optimized graph. CompiledMethod* Emit(ArenaAllocator* arena, CodeVectorAllocator* code_allocator, @@ -329,6 +353,18 @@ class OptimizingCompiler FINAL : public Compiler { ArtMethod* method, bool osr) const; + void MaybeRunInliner(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const; + + void RunArchOptimizations(InstructionSet instruction_set, + HGraph* graph, + CodeGenerator* codegen, + PassObserver* pass_observer) const; + std::unique_ptr<OptimizingCompilerStats> compilation_stats_; std::unique_ptr<std::ostream> visualizer_output_; @@ -392,22 +428,143 @@ static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) { || instruction_set == kX86_64; } -static void RunOptimizations(HOptimization* optimizations[], - size_t length, - PassObserver* pass_observer) { +static HOptimization* BuildOptimization( + const std::string& opt_name, + ArenaAllocator* arena, + HGraph* graph, + OptimizingCompilerStats* stats, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + StackHandleScopeCollection* handles, + SideEffectsAnalysis* most_recent_side_effects, + HInductionVarAnalysis* most_recent_induction) { + if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) { + CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr); + return new (arena) BoundsCheckElimination(graph, + *most_recent_side_effects, + most_recent_induction); + } else if (opt_name == GVNOptimization::kGlobalValueNumberingPassName) { + CHECK(most_recent_side_effects != nullptr); + return new (arena) GVNOptimization(graph, *most_recent_side_effects); + } else if (opt_name == HConstantFolding::kConstantFoldingPassName) { + return new (arena) HConstantFolding(graph); + } else if (opt_name == HDeadCodeElimination::kDeadCodeEliminationPassName) { + return new (arena) HDeadCodeElimination(graph, stats); + } else if (opt_name == HInliner::kInlinerPassName) { + 
size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_;
+ return new (arena) HInliner(graph, // outer_graph
+ graph, // outermost_graph
+ codegen,
+ dex_compilation_unit, // outer_compilation_unit
+ dex_compilation_unit, // outermost_compilation_unit
+ driver,
+ handles,
+ stats,
+ number_of_dex_registers,
+ /* depth */ 0);
+ } else if (opt_name == HSharpening::kSharpeningPassName) {
+ return new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver);
+ } else if (opt_name == HSelectGenerator::kSelectGeneratorPassName) {
+ return new (arena) HSelectGenerator(graph, stats);
+ } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
+ return new (arena) HInductionVarAnalysis(graph);
+ } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
+ return new (arena) InstructionSimplifier(graph, stats);
+ } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
+ return new (arena) IntrinsicsRecognizer(graph, driver, stats);
+ } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
+ CHECK(most_recent_side_effects != nullptr);
+ return new (arena) LICM(graph, *most_recent_side_effects, stats);
+ } else if (opt_name == LoadStoreElimination::kLoadStoreEliminationPassName) {
+ CHECK(most_recent_side_effects != nullptr);
+ return new (arena) LoadStoreElimination(graph, *most_recent_side_effects);
+ } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
+ return new (arena) SideEffectsAnalysis(graph);
+#ifdef ART_ENABLE_CODEGEN_arm
+ } else if (opt_name == arm::DexCacheArrayFixups::kDexCacheArrayFixupsArmPassName) {
+ return new (arena) arm::DexCacheArrayFixups(graph, stats);
+ } else if (opt_name == arm::InstructionSimplifierArm::kInstructionSimplifierArmPassName) {
+ return new (arena) arm::InstructionSimplifierArm(graph, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
+ } else if (opt_name == arm64::InstructionSimplifierArm64::kInstructionSimplifierArm64PassName) {
+ return new (arena) arm64::InstructionSimplifierArm64(graph, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
+ } else if (opt_name == mips::DexCacheArrayFixups::kDexCacheArrayFixupsMipsPassName) {
+ return new (arena) mips::DexCacheArrayFixups(graph, codegen, stats);
+ } else if (opt_name == mips::PcRelativeFixups::kPcRelativeFixupsMipsPassName) {
+ return new (arena) mips::PcRelativeFixups(graph, codegen, stats);
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
+ } else if (opt_name == x86::PcRelativeFixups::kPcRelativeFixupsX86PassName) {
+ return new (arena) x86::PcRelativeFixups(graph, codegen, stats);
+ } else if (opt_name == x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName) {
+ return new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats);
+#endif
+ }
+ return nullptr;
+}
+
+static ArenaVector<HOptimization*> BuildOptimizations(
+ const std::vector<std::string>& pass_names,
+ ArenaAllocator* arena,
+ HGraph* graph,
+ OptimizingCompilerStats* stats,
+ CodeGenerator* codegen,
+ CompilerDriver* driver,
+ const DexCompilationUnit& dex_compilation_unit,
+ StackHandleScopeCollection* handles) {
+ // A few HOptimization constructors require SideEffectsAnalysis or HInductionVarAnalysis
+ // instances. This method assumes that each of them expects the nearest instance preceding it
+ // in the pass name list.
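The rule spelled out in the comment above can be illustrated with a standalone sketch (illustrative pass names, not the exact ART strings): a pass that needs an analysis picks up the most recently listed one, so the list order matters.

#include <cassert>
#include <string>
#include <vector>

int main() {
  // Illustrative ordering: GVN and BCE consume the nearest preceding analyses.
  std::vector<std::string> passes = {
      "side_effects", "GVN", "induction_var_analysis", "BCE"};
  int most_recent_side_effects = -1;
  int most_recent_induction = -1;
  for (size_t i = 0; i < passes.size(); ++i) {
    if (passes[i] == "side_effects") {
      most_recent_side_effects = static_cast<int>(i);
    } else if (passes[i] == "induction_var_analysis") {
      most_recent_induction = static_cast<int>(i);
    } else if (passes[i] == "GVN") {
      assert(most_recent_side_effects != -1);  // GVN requires a preceding SideEffectsAnalysis.
    } else if (passes[i] == "BCE") {
      // BCE requires both a preceding SideEffectsAnalysis and HInductionVarAnalysis.
      assert(most_recent_side_effects != -1);
      assert(most_recent_induction != -1);
    }
  }
  return 0;
}
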
+ SideEffectsAnalysis* most_recent_side_effects = nullptr; + HInductionVarAnalysis* most_recent_induction = nullptr; + ArenaVector<HOptimization*> ret(arena->Adapter()); + for (std::string pass_name : pass_names) { + size_t pos = pass_name.find(kPassNameSeparator); // Strip suffix to get base pass name. + std::string opt_name = pos == std::string::npos ? pass_name : pass_name.substr(0, pos); + + HOptimization* opt = BuildOptimization( + opt_name, + arena, + graph, + stats, + codegen, + driver, + dex_compilation_unit, + handles, + most_recent_side_effects, + most_recent_induction); + CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\""; + ret.push_back(opt); + + if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { + most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt); + } else if (opt_name == HInductionVarAnalysis::kInductionPassName) { + most_recent_induction = down_cast<HInductionVarAnalysis*>(opt); + } + } + return ret; +} + +void OptimizingCompiler::RunOptimizations(HOptimization* optimizations[], + size_t length, + PassObserver* pass_observer) const { for (size_t i = 0; i < length; ++i) { PassScope scope(optimizations[i]->GetPassName(), pass_observer); optimizations[i]->Run(); } } -static void MaybeRunInliner(HGraph* graph, - CodeGenerator* codegen, - CompilerDriver* driver, - OptimizingCompilerStats* stats, - const DexCompilationUnit& dex_compilation_unit, - PassObserver* pass_observer, - StackHandleScopeCollection* handles) { +void OptimizingCompiler::MaybeRunInliner(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const { + OptimizingCompilerStats* stats = compilation_stats_.get(); const CompilerOptions& compiler_options = driver->GetCompilerOptions(); bool should_inline = (compiler_options.GetInlineDepthLimit() > 0) && (compiler_options.GetInlineMaxCodeUnits() > 0); @@ -416,11 +573,11 @@ static void MaybeRunInliner(HGraph* graph, } size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_; HInliner* inliner = new (graph->GetArena()) HInliner( - graph, - graph, + graph, // outer_graph + graph, // outermost_graph codegen, - dex_compilation_unit, - dex_compilation_unit, + dex_compilation_unit, // outer_compilation_unit + dex_compilation_unit, // outermost_compilation_unit driver, handles, stats, @@ -431,11 +588,12 @@ static void MaybeRunInliner(HGraph* graph, RunOptimizations(optimizations, arraysize(optimizations), pass_observer); } -static void RunArchOptimizations(InstructionSet instruction_set, - HGraph* graph, - CodeGenerator* codegen, - OptimizingCompilerStats* stats, - PassObserver* pass_observer) { +void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, + HGraph* graph, + CodeGenerator* codegen, + PassObserver* pass_observer) const { + UNUSED(codegen); // To avoid compilation error when compiling for svelte + OptimizingCompilerStats* stats = compilation_stats_.get(); ArenaAllocator* arena = graph->GetArena(); switch (instruction_set) { #ifdef ART_ENABLE_CODEGEN_arm @@ -444,8 +602,12 @@ static void RunArchOptimizations(InstructionSet instruction_set, arm::DexCacheArrayFixups* fixups = new (arena) arm::DexCacheArrayFixups(graph, stats); arm::InstructionSimplifierArm* simplifier = new (arena) arm::InstructionSimplifierArm(graph, stats); + SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); + GVNOptimization* gvn = new 
(arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); HOptimization* arm_optimizations[] = { simplifier, + side_effects, + gvn, fixups }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); @@ -457,7 +619,7 @@ static void RunArchOptimizations(InstructionSet instruction_set, arm64::InstructionSimplifierArm64* simplifier = new (arena) arm64::InstructionSimplifierArm64(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); - GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN_after_arch"); + GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); HOptimization* arm64_optimizations[] = { simplifier, side_effects, @@ -472,7 +634,7 @@ static void RunArchOptimizations(InstructionSet instruction_set, mips::PcRelativeFixups* pc_relative_fixups = new (arena) mips::PcRelativeFixups(graph, codegen, stats); mips::DexCacheArrayFixups* dex_cache_array_fixups = - new (arena) mips::DexCacheArrayFixups(graph, stats); + new (arena) mips::DexCacheArrayFixups(graph, codegen, stats); HOptimization* mips_optimizations[] = { pc_relative_fixups, dex_cache_array_fixups @@ -485,13 +647,27 @@ static void RunArchOptimizations(InstructionSet instruction_set, case kX86: { x86::PcRelativeFixups* pc_relative_fixups = new (arena) x86::PcRelativeFixups(graph, codegen, stats); + x86::X86MemoryOperandGeneration* memory_gen = + new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats); HOptimization* x86_optimizations[] = { - pc_relative_fixups + pc_relative_fixups, + memory_gen }; RunOptimizations(x86_optimizations, arraysize(x86_optimizations), pass_observer); break; } #endif +#ifdef ART_ENABLE_CODEGEN_x86_64 + case kX86_64: { + x86::X86MemoryOperandGeneration* memory_gen = + new (arena) x86::X86MemoryOperandGeneration(graph, codegen, stats); + HOptimization* x86_64_optimizations[] = { + memory_gen + }; + RunOptimizations(x86_64_optimizations, arraysize(x86_64_optimizations), pass_observer); + break; + } +#endif default: break; } @@ -500,7 +676,8 @@ static void RunArchOptimizations(InstructionSet instruction_set, NO_INLINE // Avoid increasing caller's frame size by large stack-allocated objects. 
static void AllocateRegisters(HGraph* graph, CodeGenerator* codegen, - PassObserver* pass_observer) { + PassObserver* pass_observer, + RegisterAllocator::Strategy strategy) { { PassScope scope(PrepareForRegisterAllocation::kPrepareForRegisterAllocationPassName, pass_observer); @@ -513,27 +690,42 @@ static void AllocateRegisters(HGraph* graph, } { PassScope scope(RegisterAllocator::kRegisterAllocatorPassName, pass_observer); - RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters(); + RegisterAllocator::Create(graph->GetArena(), codegen, liveness, strategy)->AllocateRegisters(); } } -static void RunOptimizations(HGraph* graph, - CodeGenerator* codegen, - CompilerDriver* driver, - OptimizingCompilerStats* stats, - const DexCompilationUnit& dex_compilation_unit, - PassObserver* pass_observer, - StackHandleScopeCollection* handles) { +void OptimizingCompiler::RunOptimizations(HGraph* graph, + CodeGenerator* codegen, + CompilerDriver* driver, + const DexCompilationUnit& dex_compilation_unit, + PassObserver* pass_observer, + StackHandleScopeCollection* handles) const { + OptimizingCompilerStats* stats = compilation_stats_.get(); ArenaAllocator* arena = graph->GetArena(); + if (driver->GetCompilerOptions().GetPassesToRun() != nullptr) { + ArenaVector<HOptimization*> optimizations = BuildOptimizations( + *driver->GetCompilerOptions().GetPassesToRun(), + arena, + graph, + stats, + codegen, + driver, + dex_compilation_unit, + handles); + RunOptimizations(&optimizations[0], optimizations.size(), pass_observer); + return; + } + HDeadCodeElimination* dce1 = new (arena) HDeadCodeElimination( - graph, stats, HDeadCodeElimination::kInitialDeadCodeEliminationPassName); + graph, stats, "dead_code_elimination$initial"); HDeadCodeElimination* dce2 = new (arena) HDeadCodeElimination( - graph, stats, HDeadCodeElimination::kFinalDeadCodeEliminationPassName); + graph, stats, "dead_code_elimination$final"); HConstantFolding* fold1 = new (arena) HConstantFolding(graph); InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats); HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats); - HConstantFolding* fold2 = new (arena) HConstantFolding(graph, "constant_folding_after_inlining"); - HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding_after_bce"); + HConstantFolding* fold2 = new (arena) HConstantFolding( + graph, "constant_folding$after_inlining"); + HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding$after_bce"); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects); LICM* licm = new (arena) LICM(graph, *side_effects, stats); @@ -542,9 +734,9 @@ static void RunOptimizations(HGraph* graph, BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects, induction); HSharpening* sharpening = new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver); InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier_after_bce"); + graph, stats, "instruction_simplifier$after_bce"); InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier_before_codegen"); + graph, stats, "instruction_simplifier$before_codegen"); IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, driver, stats); HOptimization* optimizations1[] = { @@ -556,7 +748,7 @@ static void 
RunOptimizations(HGraph* graph, }; RunOptimizations(optimizations1, arraysize(optimizations1), pass_observer); - MaybeRunInliner(graph, codegen, driver, stats, dex_compilation_unit, pass_observer, handles); + MaybeRunInliner(graph, codegen, driver, dex_compilation_unit, pass_observer, handles); HOptimization* optimizations2[] = { // SelectGenerator depends on the InstructionSimplifier removing @@ -579,8 +771,7 @@ static void RunOptimizations(HGraph* graph, }; RunOptimizations(optimizations2, arraysize(optimizations2), pass_observer); - RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, stats, pass_observer); - AllocateRegisters(graph, codegen, pass_observer); + RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, pass_observer); } static ArenaVector<LinkerPatch> EmitAndSortLinkerPatches(CodeGenerator* codegen) { @@ -791,11 +982,14 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena, RunOptimizations(graph, codegen.get(), compiler_driver, - compilation_stats_.get(), dex_compilation_unit, &pass_observer, &handles); + RegisterAllocator::Strategy regalloc_strategy = + compiler_options.GetRegisterAllocationStrategy(); + AllocateRegisters(graph, codegen.get(), &pass_observer, regalloc_strategy); + codegen->Compile(code_allocator); pass_observer.DumpDisassembly(); } diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index 9cc6ea45d0..c8d1ce0bd5 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -65,6 +65,7 @@ enum MethodCompilationStat { kInlinedInvokeVirtualOrInterface, kImplicitNullCheckGenerated, kExplicitNullCheckGenerated, + kSimplifyIf, kLastStat }; @@ -143,6 +144,7 @@ class OptimizingCompilerStats { case kInlinedInvokeVirtualOrInterface: name = "InlinedInvokeVirtualOrInterface"; break; case kImplicitNullCheckGenerated: name = "ImplicitNullCheckGenerated"; break; case kExplicitNullCheckGenerated: name = "ExplicitNullCheckGenerated"; break; + case kSimplifyIf: name = "SimplifyIf"; break; case kLastStat: LOG(FATAL) << "invalid stat " diff --git a/compiler/optimizing/pc_relative_fixups_mips.cc b/compiler/optimizing/pc_relative_fixups_mips.cc index ba405cdb69..c6acc45581 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.cc +++ b/compiler/optimizing/pc_relative_fixups_mips.cc @@ -37,6 +37,10 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { // entry block) and relieve some pressure on the register allocator // while avoiding recalculation of the base in a loop. base_->MoveBeforeFirstUserAndOutOfLoops(); + // Computing the base for PC-relative literals will clobber RA with + // the NAL instruction on R2. Take a note of this before generating + // the method entry. + codegen_->ClobberRA(); } } @@ -58,6 +62,36 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { DCHECK(base_ != nullptr); } + void VisitLoadClass(HLoadClass* load_class) OVERRIDE { + HLoadClass::LoadKind load_kind = load_class->GetLoadKind(); + switch (load_kind) { + case HLoadClass::LoadKind::kBootImageLinkTimeAddress: + case HLoadClass::LoadKind::kBootImageAddress: + case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: + // Add a base register for PC-relative literals on R2. 
+ InitializePCRelativeBasePointer(); + load_class->AddSpecialInput(base_); + break; + default: + break; + } + } + + void VisitLoadString(HLoadString* load_string) OVERRIDE { + HLoadString::LoadKind load_kind = load_string->GetLoadKind(); + switch (load_kind) { + case HLoadString::LoadKind::kBootImageLinkTimeAddress: + case HLoadString::LoadKind::kBootImageAddress: + case HLoadString::LoadKind::kBootImageLinkTimePcRelative: + // Add a base register for PC-relative literals on R2. + InitializePCRelativeBasePointer(); + load_string->AddSpecialInput(base_); + break; + default: + break; + } + } + void HandleInvoke(HInvoke* invoke) { // If this is an invoke-static/-direct with PC-relative dex cache array // addressing, we need the PC-relative address base. @@ -77,7 +111,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { // method pointer from the invoke. if (invoke_static_or_direct->HasCurrentMethodInput()) { DCHECK(!invoke_static_or_direct->HasPcRelativeDexCache()); - CHECK(!has_extra_input); // TODO: review this. + CHECK(!has_extra_input); return; } @@ -116,7 +150,6 @@ void PcRelativeFixups::Run() { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen_); if (mips_codegen->GetInstructionSetFeatures().IsR6()) { // Do nothing for R6 because it has PC-relative addressing. - // TODO: review. Move this check into RunArchOptimizations()? return; } if (graph_->HasIrreducibleLoops()) { diff --git a/compiler/optimizing/pc_relative_fixups_mips.h b/compiler/optimizing/pc_relative_fixups_mips.h index 1e8b071bb3..5a7397bf9d 100644 --- a/compiler/optimizing/pc_relative_fixups_mips.h +++ b/compiler/optimizing/pc_relative_fixups_mips.h @@ -32,6 +32,8 @@ class PcRelativeFixups : public HOptimization { : HOptimization(graph, "pc_relative_fixups_mips", stats), codegen_(codegen) {} + static constexpr const char* kPcRelativeFixupsMipsPassName = "pc_relative_fixups_mips"; + void Run() OVERRIDE; private: diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc index 921f3dfff6..ad0921d7e6 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.cc +++ b/compiler/optimizing/pc_relative_fixups_x86.cc @@ -227,6 +227,7 @@ class PCRelativeHandlerVisitor : public HGraphVisitor { case Intrinsics::kMathMaxFloatFloat: case Intrinsics::kMathMinDoubleDouble: case Intrinsics::kMathMinFloatFloat: + case Intrinsics::kMathRoundFloat: if (!base_added) { DCHECK(invoke_static_or_direct != nullptr); DCHECK(!invoke_static_or_direct->HasCurrentMethodInput()); diff --git a/compiler/optimizing/pc_relative_fixups_x86.h b/compiler/optimizing/pc_relative_fixups_x86.h index 03de2fcece..72fa71ea94 100644 --- a/compiler/optimizing/pc_relative_fixups_x86.h +++ b/compiler/optimizing/pc_relative_fixups_x86.h @@ -29,9 +29,11 @@ namespace x86 { class PcRelativeFixups : public HOptimization { public: PcRelativeFixups(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) - : HOptimization(graph, "pc_relative_fixups_x86", stats), + : HOptimization(graph, kPcRelativeFixupsX86PassName, stats), codegen_(codegen) {} + static constexpr const char* kPcRelativeFixupsX86PassName = "pc_relative_fixups_x86"; + void Run() OVERRIDE; private: diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc index 965d5ee4f9..e96ab1918c 100644 --- a/compiler/optimizing/reference_type_propagation.cc +++ b/compiler/optimizing/reference_type_propagation.cc @@ -16,6 +16,7 @@ #include "reference_type_propagation.h" +#include 
"base/enums.h" #include "class_linker-inl.h" #include "mirror/class-inl.h" #include "mirror/dex_cache.h" @@ -775,7 +776,7 @@ void ReferenceTypePropagation::RTPVisitor::VisitInvoke(HInvoke* instr) { ClassLinker* cl = Runtime::Current()->GetClassLinker(); mirror::DexCache* dex_cache = FindDexCacheWithHint(soa.Self(), instr->GetDexFile(), hint_dex_cache_); - size_t pointer_size = cl->GetImagePointerSize(); + PointerSize pointer_size = cl->GetImagePointerSize(); ArtMethod* method = dex_cache->GetResolvedMethod(instr->GetDexMethodIndex(), pointer_size); mirror::Class* klass = (method == nullptr) ? nullptr : method->GetReturnType(false, pointer_size); SetClassAsTypeInfo(instr, klass, /* is_exact */ false); diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc new file mode 100644 index 0000000000..34502869e4 --- /dev/null +++ b/compiler/optimizing/register_allocation_resolver.cc @@ -0,0 +1,653 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocation_resolver.h" + +#include "code_generator.h" +#include "ssa_liveness_analysis.h" + +namespace art { + +RegisterAllocationResolver::RegisterAllocationResolver(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness) + : allocator_(allocator), + codegen_(codegen), + liveness_(liveness) {} + +void RegisterAllocationResolver::Resolve(size_t max_safepoint_live_core_regs, + size_t max_safepoint_live_fp_regs, + size_t reserved_out_slots, + size_t int_spill_slots, + size_t long_spill_slots, + size_t float_spill_slots, + size_t double_spill_slots, + size_t catch_phi_spill_slots, + const ArenaVector<LiveInterval*>& temp_intervals) { + size_t spill_slots = int_spill_slots + + long_spill_slots + + float_spill_slots + + double_spill_slots + + catch_phi_spill_slots; + + // Computes frame size and spill mask. + codegen_->InitializeCodeGeneration(spill_slots, + max_safepoint_live_core_regs, + max_safepoint_live_fp_regs, + reserved_out_slots, // Includes slot(s) for the art method. + codegen_->GetGraph()->GetLinearOrder()); + + // Resolve outputs, including stack locations. + // TODO: Use pointers of Location inside LiveInterval to avoid doing another iteration. + for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + LiveInterval* current = instruction->GetLiveInterval(); + LocationSummary* locations = instruction->GetLocations(); + Location location = locations->Out(); + if (instruction->IsParameterValue()) { + // Now that we know the frame size, adjust the parameter's location. 
+ if (location.IsStackSlot()) { + location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); + current->SetSpillSlot(location.GetStackIndex()); + locations->UpdateOut(location); + } else if (location.IsDoubleStackSlot()) { + location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); + current->SetSpillSlot(location.GetStackIndex()); + locations->UpdateOut(location); + } else if (current->HasSpillSlot()) { + current->SetSpillSlot(current->GetSpillSlot() + codegen_->GetFrameSize()); + } + } else if (instruction->IsCurrentMethod()) { + // The current method is always at offset 0. + DCHECK(!current->HasSpillSlot() || (current->GetSpillSlot() == 0)); + } else if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + DCHECK(current->HasSpillSlot()); + size_t slot = current->GetSpillSlot() + + spill_slots + + reserved_out_slots + - catch_phi_spill_slots; + current->SetSpillSlot(slot * kVRegSize); + } else if (current->HasSpillSlot()) { + // Adjust the stack slot, now that we know the number of them for each type. + // The way this implementation lays out the stack is the following: + // [parameter slots ] + // [catch phi spill slots ] + // [double spill slots ] + // [long spill slots ] + // [float spill slots ] + // [int/ref values ] + // [maximum out values ] (number of arguments for calls) + // [art method ]. + size_t slot = current->GetSpillSlot(); + switch (current->GetType()) { + case Primitive::kPrimDouble: + slot += long_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimLong: + slot += float_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimFloat: + slot += int_spill_slots; + FALLTHROUGH_INTENDED; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + slot += reserved_out_slots; + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << current->GetType(); + } + current->SetSpillSlot(slot * kVRegSize); + } + + Location source = current->ToLocation(); + + if (location.IsUnallocated()) { + if (location.GetPolicy() == Location::kSameAsFirstInput) { + if (locations->InAt(0).IsUnallocated()) { + locations->SetInAt(0, source); + } else { + DCHECK(locations->InAt(0).Equals(source)); + } + } + locations->UpdateOut(source); + } else { + DCHECK(source.Equals(location)); + } + } + + // Connect siblings and resolve inputs. + for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + ConnectSiblings(instruction->GetLiveInterval(), + max_safepoint_live_core_regs + max_safepoint_live_fp_regs); + } + + // Resolve non-linear control flow across branches. Order does not matter. + for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + if (block->IsCatchBlock() || + (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // Instructions live at the top of catch blocks or irreducible loop header + // were forced to spill. 
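A standalone sketch (assumed slot counts, not ART code) of the slot adjustment performed by the fall-through switch above: each spill area is stacked on top of the areas listed below it, so a double slot index is shifted by the long, float, and int/ref areas plus the reserved out slots.

#include <cstdio>

enum class Kind { kIntOrRef, kFloat, kLong, kDouble };

// Mirrors the fall-through adjustment above: later areas are stacked on top of
// the earlier ones, so their slot indices are shifted by the sizes of those areas.
static size_t AdjustSlot(size_t slot, Kind kind, size_t reserved_out,
                         size_t int_slots, size_t float_slots, size_t long_slots) {
  switch (kind) {
    case Kind::kDouble:   slot += long_slots;   [[fallthrough]];
    case Kind::kLong:     slot += float_slots;  [[fallthrough]];
    case Kind::kFloat:    slot += int_slots;    [[fallthrough]];
    case Kind::kIntOrRef: slot += reserved_out; break;
  }
  return slot;
}

int main() {
  // A double in relative slot 0, with 3 reserved out slots, 2 int, 1 float and 2 long slots:
  // 0 + 2 (longs) + 1 (floats) + 2 (ints) + 3 (reserved out) = 8.
  std::printf("%zu\n", AdjustSlot(0, Kind::kDouble, 3, 2, 1, 2));
  return 0;
}
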
+ if (kIsDebugBuild) {
+ BitVector* live = liveness_.GetLiveInSet(*block);
+ for (uint32_t idx : live->Indexes()) {
+ LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval();
+ LiveInterval* sibling = interval->GetSiblingAt(block->GetLifetimeStart());
+ // `GetSiblingAt` returns the sibling that contains a position, but there could be
+ // a lifetime hole in it. `CoversSlow` returns whether the interval is live at that
+ // position.
+ if ((sibling != nullptr) && sibling->CoversSlow(block->GetLifetimeStart())) {
+ DCHECK(!sibling->HasRegister());
+ }
+ }
+ }
+ } else {
+ BitVector* live = liveness_.GetLiveInSet(*block);
+ for (uint32_t idx : live->Indexes()) {
+ LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval();
+ for (HBasicBlock* predecessor : block->GetPredecessors()) {
+ ConnectSplitSiblings(interval, predecessor, block);
+ }
+ }
+ }
+ }
+
+ // Resolve phi inputs. Order does not matter.
+ for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) {
+ HBasicBlock* current = it.Current();
+ if (current->IsCatchBlock()) {
+ // Catch phi values are set at runtime by the exception delivery mechanism.
+ } else {
+ for (HInstructionIterator inst_it(current->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
+ HInstruction* phi = inst_it.Current();
+ for (size_t i = 0, e = current->GetPredecessors().size(); i < e; ++i) {
+ HBasicBlock* predecessor = current->GetPredecessors()[i];
+ DCHECK_EQ(predecessor->GetNormalSuccessors().size(), 1u);
+ HInstruction* input = phi->InputAt(i);
+ Location source = input->GetLiveInterval()->GetLocationAt(
+ predecessor->GetLifetimeEnd() - 1);
+ Location destination = phi->GetLiveInterval()->ToLocation();
+ InsertParallelMoveAtExitOf(predecessor, phi, source, destination);
+ }
+ }
+ }
+ }
+
+ // Resolve temp locations.
+ for (LiveInterval* temp : temp_intervals) {
+ if (temp->IsHighInterval()) {
+ // High intervals can be skipped, they are already handled by the low interval.
+ continue;
+ }
+ HInstruction* at = liveness_.GetTempUser(temp);
+ size_t temp_index = liveness_.GetTempIndex(temp);
+ LocationSummary* locations = at->GetLocations();
+ switch (temp->GetType()) {
+ case Primitive::kPrimInt:
+ locations->SetTempAt(temp_index, Location::RegisterLocation(temp->GetRegister()));
+ break;
+
+ case Primitive::kPrimDouble:
+ if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) {
+ Location location = Location::FpuRegisterPairLocation(
+ temp->GetRegister(), temp->GetHighInterval()->GetRegister());
+ locations->SetTempAt(temp_index, location);
+ } else {
+ locations->SetTempAt(temp_index, Location::FpuRegisterLocation(temp->GetRegister()));
+ }
+ break;
+
+ default:
+ LOG(FATAL) << "Unexpected type for temporary location "
+ << temp->GetType();
+ }
+ }
+}
+
+void RegisterAllocationResolver::ConnectSiblings(LiveInterval* interval,
+ size_t max_safepoint_live_regs) {
+ LiveInterval* current = interval;
+ if (current->HasSpillSlot()
+ && current->HasRegister()
+ // Currently, we unconditionally spill the current method in the code generators.
+ && !interval->GetDefinedBy()->IsCurrentMethod()) {
+ // We spill eagerly, so move must be at definition.
+ InsertMoveAfter(interval->GetDefinedBy(),
+ interval->ToLocation(),
+ interval->NeedsTwoSpillSlots()
+ ?
Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) + : Location::StackSlot(interval->GetParent()->GetSpillSlot())); + } + UsePosition* use = current->GetFirstUse(); + UsePosition* env_use = current->GetFirstEnvironmentUse(); + + // Walk over all siblings, updating locations of use positions, and + // connecting them when they are adjacent. + do { + Location source = current->ToLocation(); + + // Walk over all uses covered by this interval, and update the location + // information. + + LiveRange* range = current->GetFirstRange(); + while (range != nullptr) { + while (use != nullptr && use->GetPosition() < range->GetStart()) { + DCHECK(use->IsSynthesized()); + use = use->GetNext(); + } + while (use != nullptr && use->GetPosition() <= range->GetEnd()) { + DCHECK(!use->GetIsEnvironment()); + DCHECK(current->CoversSlow(use->GetPosition()) || (use->GetPosition() == range->GetEnd())); + if (!use->IsSynthesized()) { + LocationSummary* locations = use->GetUser()->GetLocations(); + Location expected_location = locations->InAt(use->GetInputIndex()); + // The expected (actual) location may be invalid in case the input is unused. Currently + // this only happens for intrinsics. + if (expected_location.IsValid()) { + if (expected_location.IsUnallocated()) { + locations->SetInAt(use->GetInputIndex(), source); + } else if (!expected_location.IsConstant()) { + AddInputMoveFor(interval->GetDefinedBy(), use->GetUser(), source, expected_location); + } + } else { + DCHECK(use->GetUser()->IsInvoke()); + DCHECK(use->GetUser()->AsInvoke()->GetIntrinsic() != Intrinsics::kNone); + } + } + use = use->GetNext(); + } + + // Walk over the environment uses, and update their locations. + while (env_use != nullptr && env_use->GetPosition() < range->GetStart()) { + env_use = env_use->GetNext(); + } + + while (env_use != nullptr && env_use->GetPosition() <= range->GetEnd()) { + DCHECK(current->CoversSlow(env_use->GetPosition()) + || (env_use->GetPosition() == range->GetEnd())); + HEnvironment* environment = env_use->GetEnvironment(); + environment->SetLocationAt(env_use->GetInputIndex(), source); + env_use = env_use->GetNext(); + } + + range = range->GetNext(); + } + + // If the next interval starts just after this one, and has a register, + // insert a move. 
+ LiveInterval* next_sibling = current->GetNextSibling(); + if (next_sibling != nullptr + && next_sibling->HasRegister() + && current->GetEnd() == next_sibling->GetStart()) { + Location destination = next_sibling->ToLocation(); + InsertParallelMoveAt(current->GetEnd(), interval->GetDefinedBy(), source, destination); + } + + for (SafepointPosition* safepoint_position = current->GetFirstSafepoint(); + safepoint_position != nullptr; + safepoint_position = safepoint_position->GetNext()) { + DCHECK(current->CoversSlow(safepoint_position->GetPosition())); + + LocationSummary* locations = safepoint_position->GetLocations(); + if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); + locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize); + } + + switch (source.GetKind()) { + case Location::kRegister: { + locations->AddLiveRegister(source); + if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) { + DCHECK_LE(locations->GetNumberOfLiveRegisters(), + max_safepoint_live_regs); + } + if (current->GetType() == Primitive::kPrimNot) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); + locations->SetRegisterBit(source.reg()); + } + break; + } + case Location::kFpuRegister: { + locations->AddLiveRegister(source); + break; + } + + case Location::kRegisterPair: + case Location::kFpuRegisterPair: { + locations->AddLiveRegister(source.ToLow()); + locations->AddLiveRegister(source.ToHigh()); + break; + } + case Location::kStackSlot: // Fall-through + case Location::kDoubleStackSlot: // Fall-through + case Location::kConstant: { + // Nothing to do. + break; + } + default: { + LOG(FATAL) << "Unexpected location for object"; + } + } + } + current = next_sibling; + } while (current != nullptr); + + if (kIsDebugBuild) { + // Following uses can only be synthesized uses. + while (use != nullptr) { + DCHECK(use->IsSynthesized()); + use = use->GetNext(); + } + } +} + +static bool IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop( + HInstruction* instruction) { + return instruction->GetBlock()->GetGraph()->HasIrreducibleLoops() && + (instruction->IsConstant() || instruction->IsCurrentMethod()); +} + +void RegisterAllocationResolver::ConnectSplitSiblings(LiveInterval* interval, + HBasicBlock* from, + HBasicBlock* to) const { + if (interval->GetNextSibling() == nullptr) { + // Nothing to connect. The whole range was allocated to the same location. + return; + } + + // Find the intervals that cover `from` and `to`. + size_t destination_position = to->GetLifetimeStart(); + size_t source_position = from->GetLifetimeEnd() - 1; + LiveInterval* destination = interval->GetSiblingAt(destination_position); + LiveInterval* source = interval->GetSiblingAt(source_position); + + if (destination == source) { + // Interval was not split. + return; + } + + LiveInterval* parent = interval->GetParent(); + HInstruction* defined_by = parent->GetDefinedBy(); + if (codegen_->GetGraph()->HasIrreducibleLoops() && + (destination == nullptr || !destination->CoversSlow(destination_position))) { + // Our live_in fixed point calculation has found that the instruction is live + // in the `to` block because it will eventually enter an irreducible loop. 
Our + // live interval computation however does not compute a fixed point, and + // therefore will not have a location for that instruction for `to`. + // Because the instruction is a constant or the ArtMethod, we don't need to + // do anything: it will be materialized in the irreducible loop. + DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)) + << defined_by->DebugName() << ":" << defined_by->GetId() + << " " << from->GetBlockId() << " -> " << to->GetBlockId(); + return; + } + + if (!destination->HasRegister()) { + // Values are eagerly spilled. Spill slot already contains appropriate value. + return; + } + + Location location_source; + // `GetSiblingAt` returns the interval whose start and end cover `position`, + // but does not check whether the interval is inactive at that position. + // The only situation where the interval is inactive at that position is in the + // presence of irreducible loops for constants and ArtMethod. + if (codegen_->GetGraph()->HasIrreducibleLoops() && + (source == nullptr || !source->CoversSlow(source_position))) { + DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)); + if (defined_by->IsConstant()) { + location_source = defined_by->GetLocations()->Out(); + } else { + DCHECK(defined_by->IsCurrentMethod()); + location_source = parent->NeedsTwoSpillSlots() + ? Location::DoubleStackSlot(parent->GetSpillSlot()) + : Location::StackSlot(parent->GetSpillSlot()); + } + } else { + DCHECK(source != nullptr); + DCHECK(source->CoversSlow(source_position)); + DCHECK(destination->CoversSlow(destination_position)); + location_source = source->ToLocation(); + } + + // If `from` has only one successor, we can put the moves at the exit of it. Otherwise + // we need to put the moves at the entry of `to`. + if (from->GetNormalSuccessors().size() == 1) { + InsertParallelMoveAtExitOf(from, + defined_by, + location_source, + destination->ToLocation()); + } else { + DCHECK_EQ(to->GetPredecessors().size(), 1u); + InsertParallelMoveAtEntryOf(to, + defined_by, + location_source, + destination->ToLocation()); + } +} + +static bool IsValidDestination(Location destination) { + return destination.IsRegister() + || destination.IsRegisterPair() + || destination.IsFpuRegister() + || destination.IsFpuRegisterPair() + || destination.IsStackSlot() + || destination.IsDoubleStackSlot(); +} + +void RegisterAllocationResolver::AddMove(HParallelMove* move, + Location source, + Location destination, + HInstruction* instruction, + Primitive::Type type) const { + if (type == Primitive::kPrimLong + && codegen_->ShouldSplitLongMoves() + // The parallel move resolver knows how to deal with long constants. 
+ && !source.IsConstant()) { + move->AddMove(source.ToLow(), destination.ToLow(), Primitive::kPrimInt, instruction); + move->AddMove(source.ToHigh(), destination.ToHigh(), Primitive::kPrimInt, nullptr); + } else { + move->AddMove(source, destination, type, instruction); + } +} + +void RegisterAllocationResolver::AddInputMoveFor(HInstruction* input, + HInstruction* user, + Location source, + Location destination) const { + if (source.Equals(destination)) return; + + DCHECK(!user->IsPhi()); + + HInstruction* previous = user->GetPrevious(); + HParallelMove* move = nullptr; + if (previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() < user->GetLifetimePosition()) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(user->GetLifetimePosition()); + user->GetBlock()->InsertInstructionBefore(move, user); + } else { + move = previous->AsParallelMove(); + } + DCHECK_EQ(move->GetLifetimePosition(), user->GetLifetimePosition()); + AddMove(move, source, destination, nullptr, input->GetType()); +} + +static bool IsInstructionStart(size_t position) { + return (position & 1) == 0; +} + +static bool IsInstructionEnd(size_t position) { + return (position & 1) == 1; +} + +void RegisterAllocationResolver::InsertParallelMoveAt(size_t position, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + HInstruction* at = liveness_.GetInstructionFromPosition(position / 2); + HParallelMove* move; + if (at == nullptr) { + if (IsInstructionStart(position)) { + // Block boundary, don't do anything the connection of split siblings will handle it. + return; + } else { + // Move must happen before the first instruction of the block. + at = liveness_.GetInstructionFromPosition((position + 1) / 2); + // Note that parallel moves may have already been inserted, so we explicitly + // ask for the first instruction of the block: `GetInstructionFromPosition` does + // not contain the `HParallelMove` instructions. + at = at->GetBlock()->GetFirstInstruction(); + + if (at->GetLifetimePosition() < position) { + // We may insert moves for split siblings and phi spills at the beginning of the block. + // Since this is a different lifetime position, we need to go to the next instruction. + DCHECK(at->IsParallelMove()); + at = at->GetNext(); + } + + if (at->GetLifetimePosition() != position) { + DCHECK_GT(at->GetLifetimePosition(), position); + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at); + } else { + DCHECK(at->IsParallelMove()); + move = at->AsParallelMove(); + } + } + } else if (IsInstructionEnd(position)) { + // Move must happen after the instruction. + DCHECK(!at->IsControlFlow()); + move = at->GetNext()->AsParallelMove(); + // This is a parallel move for connecting siblings in a same block. We need to + // differentiate it with moves for connecting blocks, and input moves. + if (move == nullptr || move->GetLifetimePosition() > position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at->GetNext()); + } + } else { + // Move must happen before the instruction. 
+ HInstruction* previous = at->GetPrevious(); + if (previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() != position) { + // If the previous is a parallel move, then its position must be lower + // than the given `position`: it was added just after the non-parallel + // move instruction that precedes `instruction`. + DCHECK(previous == nullptr + || !previous->IsParallelMove() + || previous->GetLifetimePosition() < position); + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + at->GetBlock()->InsertInstructionBefore(move, at); + } else { + move = previous->AsParallelMove(); + } + } + DCHECK_EQ(move->GetLifetimePosition(), position); + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertParallelMoveAtExitOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + DCHECK_EQ(block->GetNormalSuccessors().size(), 1u); + HInstruction* last = block->GetLastInstruction(); + // We insert moves at exit for phi predecessors and connecting blocks. + // A block ending with an if or a packed switch cannot branch to a block + // with phis because we do not allow critical edges. It can also not connect + // a split interval between two blocks: the move has to happen in the successor. + DCHECK(!last->IsIf() && !last->IsPackedSwitch()); + HInstruction* previous = last->GetPrevious(); + HParallelMove* move; + // This is a parallel move for connecting blocks. We need to differentiate + // it with moves for connecting siblings in a same block, and output moves. + size_t position = last->GetLifetimePosition(); + if (previous == nullptr || !previous->IsParallelMove() + || previous->AsParallelMove()->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + block->InsertInstructionBefore(move, last); + } else { + move = previous->AsParallelMove(); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertParallelMoveAtEntryOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + HInstruction* first = block->GetFirstInstruction(); + HParallelMove* move = first->AsParallelMove(); + size_t position = block->GetLifetimeStart(); + // This is a parallel move for connecting blocks. We need to differentiate + // it with moves for connecting siblings in a same block, and input moves. 
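The IsInstructionStart/IsInstructionEnd helpers above rely on lifetime positions being spaced two apart per instruction; a minimal standalone check (same parity convention, otherwise assumed encoding):

#include <cassert>
#include <cstddef>

static bool IsInstructionStart(size_t position) { return (position & 1) == 0; }
static bool IsInstructionEnd(size_t position) { return (position & 1) == 1; }

int main() {
  size_t position = 14;                        // Even position: the instruction itself.
  assert(IsInstructionStart(position));
  assert(IsInstructionEnd(position + 1));      // Odd position: right after the instruction.
  assert(position / 2 == (position + 1) / 2);  // Both map back to the same instruction index.
  return 0;
}
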
+ if (move == nullptr || move->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + block->InsertInstructionBefore(move, first); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +void RegisterAllocationResolver::InsertMoveAfter(HInstruction* instruction, + Location source, + Location destination) const { + DCHECK(IsValidDestination(destination)) << destination; + if (source.Equals(destination)) return; + + if (instruction->IsPhi()) { + InsertParallelMoveAtEntryOf(instruction->GetBlock(), instruction, source, destination); + return; + } + + size_t position = instruction->GetLifetimePosition() + 1; + HParallelMove* move = instruction->GetNext()->AsParallelMove(); + // This is a parallel move for moving the output of an instruction. We need + // to differentiate with input moves, moves for connecting siblings in a + // and moves for connecting blocks. + if (move == nullptr || move->GetLifetimePosition() != position) { + move = new (allocator_) HParallelMove(allocator_); + move->SetLifetimePosition(position); + instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext()); + } + AddMove(move, source, destination, instruction, instruction->GetType()); +} + +} // namespace art diff --git a/compiler/optimizing/register_allocation_resolver.h b/compiler/optimizing/register_allocation_resolver.h new file mode 100644 index 0000000000..6ceb9bc955 --- /dev/null +++ b/compiler/optimizing/register_allocation_resolver.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ + +#include "base/arena_containers.h" +#include "base/value_object.h" +#include "primitive.h" + +namespace art { + +class ArenaAllocator; +class CodeGenerator; +class HBasicBlock; +class HInstruction; +class HParallelMove; +class LiveInterval; +class Location; +class SsaLivenessAnalysis; + +/** + * Reconciles the locations assigned to live intervals with the location + * summary of each instruction, and inserts moves to resolve split intervals, + * nonlinear control flow, and phi inputs. + */ +class RegisterAllocationResolver : ValueObject { + public: + RegisterAllocationResolver(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness); + + void Resolve(size_t max_safepoint_live_core_regs, + size_t max_safepoint_live_fp_regs, + size_t reserved_out_slots, // Includes slot(s) for the art method. + size_t int_spill_slots, + size_t long_spill_slots, + size_t float_spill_slots, + size_t double_spill_slots, + size_t catch_phi_spill_slots, + const ArenaVector<LiveInterval*>& temp_intervals); + + private: + // Connect adjacent siblings within blocks, and resolve inputs along the way. 
+ // Uses max_safepoint_live_regs to check that we did not underestimate the + // number of live registers at safepoints. + void ConnectSiblings(LiveInterval* interval, size_t max_safepoint_live_regs); + + // Connect siblings between block entries and exits. + void ConnectSplitSiblings(LiveInterval* interval, HBasicBlock* from, HBasicBlock* to) const; + + // Helper methods for inserting parallel moves in the graph. + void InsertParallelMoveAtExitOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const; + void InsertParallelMoveAtEntryOf(HBasicBlock* block, + HInstruction* instruction, + Location source, + Location destination) const; + void InsertMoveAfter(HInstruction* instruction, Location source, Location destination) const; + void AddInputMoveFor(HInstruction* input, + HInstruction* user, + Location source, + Location destination) const; + void InsertParallelMoveAt(size_t position, + HInstruction* instruction, + Location source, + Location destination) const; + void AddMove(HParallelMove* move, + Location source, + Location destination, + HInstruction* instruction, + Primitive::Type type) const; + + ArenaAllocator* const allocator_; + CodeGenerator* const codegen_; + const SsaLivenessAnalysis& liveness_; + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocationResolver); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATION_RESOLVER_H_ diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index 9d99668484..5b768d5d67 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 The Android Open Source Project + * Copyright (C) 2016 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,65 +21,33 @@ #include "base/bit_vector-inl.h" #include "code_generator.h" +#include "register_allocator_graph_color.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" -namespace art { - -static constexpr size_t kMaxLifetimePosition = -1; -static constexpr size_t kDefaultNumberOfSpillSlots = 4; -// For simplicity, we implement register pairs as (reg, reg + 1). -// Note that this is a requirement for double registers on ARM, since we -// allocate SRegister. 
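For illustration only (this sketch is not part of the change): the (reg, reg + 1) pair convention described in the removed comment above, reduced to a standalone program. The helper names mirror the ones deleted just below, and the register numbers are hypothetical.

#include <cassert>

// Sketch: a register pair is modeled as (low, low + 1), and the low half must
// be even so that, for example, ARM D0 maps onto the pair (S0, S1).
static int GetHighForLowRegister(int reg) { return reg + 1; }
static bool IsLowRegister(int reg) { return (reg & 1) == 0; }

int main() {
  assert(IsLowRegister(0) && GetHighForLowRegister(0) == 1);  // (S0, S1) is a valid pair.
  assert(!IsLowRegister(1));  // (S1, S2) would be an unaligned pair and is rejected.
  return 0;
}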
-static int GetHighForLowRegister(int reg) { return reg + 1; } -static bool IsLowRegister(int reg) { return (reg & 1) == 0; } -static bool IsLowOfUnalignedPairInterval(LiveInterval* low) { - return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister(); -} +namespace art { RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator, CodeGenerator* codegen, const SsaLivenessAnalysis& liveness) - : allocator_(allocator), - codegen_(codegen), - liveness_(liveness), - unhandled_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - unhandled_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - unhandled_(nullptr), - handled_(allocator->Adapter(kArenaAllocRegisterAllocator)), - active_(allocator->Adapter(kArenaAllocRegisterAllocator)), - inactive_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_core_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_fp_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - int_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - long_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - float_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - double_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), - catch_phi_spill_slots_(0), - safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), - processing_core_registers_(false), - number_of_registers_(-1), - registers_array_(nullptr), - blocked_core_registers_(codegen->GetBlockedCoreRegisters()), - blocked_fp_registers_(codegen->GetBlockedFloatingPointRegisters()), - reserved_out_slots_(0), - maximum_number_of_live_core_registers_(0), - maximum_number_of_live_fp_registers_(0) { - temp_intervals_.reserve(4); - int_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - long_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - float_spill_slots_.reserve(kDefaultNumberOfSpillSlots); - double_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + : allocator_(allocator), + codegen_(codegen), + liveness_(liveness) {} - codegen->SetupBlockedRegisters(); - physical_core_register_intervals_.resize(codegen->GetNumberOfCoreRegisters(), nullptr); - physical_fp_register_intervals_.resize(codegen->GetNumberOfFloatingPointRegisters(), nullptr); - // Always reserve for the current method and the graph's max out registers. - // TODO: compute it instead. - // ArtMethod* takes 2 vregs for 64 bits. 
- reserved_out_slots_ = InstructionSetPointerSize(codegen->GetInstructionSet()) / kVRegSize + - codegen->GetGraph()->GetMaximumNumberOfOutVRegs(); +RegisterAllocator* RegisterAllocator::Create(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + Strategy strategy) { + switch (strategy) { + case kRegisterAllocatorLinearScan: + return new (allocator) RegisterAllocatorLinearScan(allocator, codegen, analysis); + case kRegisterAllocatorGraphColor: + return new (allocator) RegisterAllocatorGraphColor(allocator, codegen, analysis); + default: + LOG(FATAL) << "Invalid register allocation strategy: " << strategy; + UNREACHABLE(); + } } bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UNUSED, @@ -93,328 +61,6 @@ bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UN || instruction_set == kX86_64; } -static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) { - if (interval == nullptr) return false; - bool is_core_register = (interval->GetType() != Primitive::kPrimDouble) - && (interval->GetType() != Primitive::kPrimFloat); - return processing_core_registers == is_core_register; -} - -void RegisterAllocator::AllocateRegisters() { - AllocateRegistersInternal(); - Resolve(); - - if (kIsDebugBuild) { - processing_core_registers_ = true; - ValidateInternal(true); - processing_core_registers_ = false; - ValidateInternal(true); - // Check that the linear order is still correct with regards to lifetime positions. - // Since only parallel moves have been inserted during the register allocation, - // these checks are mostly for making sure these moves have been added correctly. - size_t current_liveness = 0; - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - HInstruction* instruction = inst_it.Current(); - DCHECK_LE(current_liveness, instruction->GetLifetimePosition()); - current_liveness = instruction->GetLifetimePosition(); - } - for (HInstructionIterator inst_it(block->GetInstructions()); - !inst_it.Done(); - inst_it.Advance()) { - HInstruction* instruction = inst_it.Current(); - DCHECK_LE(current_liveness, instruction->GetLifetimePosition()) << instruction->DebugName(); - current_liveness = instruction->GetLifetimePosition(); - } - } - } -} - -void RegisterAllocator::BlockRegister(Location location, size_t start, size_t end) { - int reg = location.reg(); - DCHECK(location.IsRegister() || location.IsFpuRegister()); - LiveInterval* interval = location.IsRegister() - ? physical_core_register_intervals_[reg] - : physical_fp_register_intervals_[reg]; - Primitive::Type type = location.IsRegister() - ? 
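To show how the new RegisterAllocator::Create() factory added above is meant to be used, here is a hedged sketch of the same strategy dispatch in isolation. Only the enum value names are taken from the patch; the Allocator, LinearScan, and GraphColor types below are stand-ins rather than ART classes, and the real code returns arena-allocated allocators instead of unique_ptrs.

#include <cstdio>
#include <memory>

enum Strategy { kRegisterAllocatorLinearScan, kRegisterAllocatorGraphColor };

// Stand-in hierarchy for the sketch.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void AllocateRegisters() = 0;
};
struct LinearScan : Allocator {
  void AllocateRegisters() override { std::puts("linear scan"); }
};
struct GraphColor : Allocator {
  void AllocateRegisters() override { std::puts("graph coloring"); }
};

static std::unique_ptr<Allocator> Create(Strategy strategy) {
  switch (strategy) {
    case kRegisterAllocatorLinearScan: return std::make_unique<LinearScan>();
    case kRegisterAllocatorGraphColor: return std::make_unique<GraphColor>();
  }
  return nullptr;  // Unreachable for valid strategy values.
}

int main() {
  Create(kRegisterAllocatorLinearScan)->AllocateRegisters();
  return 0;
}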
Primitive::kPrimInt - : Primitive::kPrimFloat; - if (interval == nullptr) { - interval = LiveInterval::MakeFixedInterval(allocator_, reg, type); - if (location.IsRegister()) { - physical_core_register_intervals_[reg] = interval; - } else { - physical_fp_register_intervals_[reg] = interval; - } - } - DCHECK(interval->GetRegister() == reg); - interval->AddRange(start, end); -} - -void RegisterAllocator::BlockRegisters(size_t start, size_t end, bool caller_save_only) { - for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { - if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { - BlockRegister(Location::RegisterLocation(i), start, end); - } - } - for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { - if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { - BlockRegister(Location::FpuRegisterLocation(i), start, end); - } - } -} - -void RegisterAllocator::AllocateRegistersInternal() { - // Iterate post-order, to ensure the list is sorted, and the last added interval - // is the one with the lowest start position. - for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - for (HBackwardInstructionIterator back_it(block->GetInstructions()); !back_it.Done(); - back_it.Advance()) { - ProcessInstruction(back_it.Current()); - } - for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - ProcessInstruction(inst_it.Current()); - } - - if (block->IsCatchBlock() || - (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { - // By blocking all registers at the top of each catch block or irreducible loop, we force - // intervals belonging to the live-in set of the catch/header block to be spilled. - // TODO(ngeoffray): Phis in this block could be allocated in register. - size_t position = block->GetLifetimeStart(); - BlockRegisters(position, position + 1); - } - } - - number_of_registers_ = codegen_->GetNumberOfCoreRegisters(); - registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, - kArenaAllocRegisterAllocator); - processing_core_registers_ = true; - unhandled_ = &unhandled_core_intervals_; - for (LiveInterval* fixed : physical_core_register_intervals_) { - if (fixed != nullptr) { - // Fixed interval is added to inactive_ instead of unhandled_. - // It's also the only type of inactive interval whose start position - // can be after the current interval during linear scan. - // Fixed interval is never split and never moves to unhandled_. - inactive_.push_back(fixed); - } - } - LinearScan(); - - inactive_.clear(); - active_.clear(); - handled_.clear(); - - number_of_registers_ = codegen_->GetNumberOfFloatingPointRegisters(); - registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, - kArenaAllocRegisterAllocator); - processing_core_registers_ = false; - unhandled_ = &unhandled_fp_intervals_; - for (LiveInterval* fixed : physical_fp_register_intervals_) { - if (fixed != nullptr) { - // Fixed interval is added to inactive_ instead of unhandled_. - // It's also the only type of inactive interval whose start position - // can be after the current interval during linear scan. - // Fixed interval is never split and never moves to unhandled_. 
- inactive_.push_back(fixed); - } - } - LinearScan(); -} - -void RegisterAllocator::ProcessInstruction(HInstruction* instruction) { - LocationSummary* locations = instruction->GetLocations(); - size_t position = instruction->GetLifetimePosition(); - - if (locations == nullptr) return; - - // Create synthesized intervals for temporaries. - for (size_t i = 0; i < locations->GetTempCount(); ++i) { - Location temp = locations->GetTemp(i); - if (temp.IsRegister() || temp.IsFpuRegister()) { - BlockRegister(temp, position, position + 1); - // Ensure that an explicit temporary register is marked as being allocated. - codegen_->AddAllocatedRegister(temp); - } else { - DCHECK(temp.IsUnallocated()); - switch (temp.GetPolicy()) { - case Location::kRequiresRegister: { - LiveInterval* interval = - LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); - temp_intervals_.push_back(interval); - interval->AddTempUse(instruction, i); - unhandled_core_intervals_.push_back(interval); - break; - } - - case Location::kRequiresFpuRegister: { - LiveInterval* interval = - LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); - temp_intervals_.push_back(interval); - interval->AddTempUse(instruction, i); - if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { - interval->AddHighInterval(/* is_temp */ true); - LiveInterval* high = interval->GetHighInterval(); - temp_intervals_.push_back(high); - unhandled_fp_intervals_.push_back(high); - } - unhandled_fp_intervals_.push_back(interval); - break; - } - - default: - LOG(FATAL) << "Unexpected policy for temporary location " - << temp.GetPolicy(); - } - } - } - - bool core_register = (instruction->GetType() != Primitive::kPrimDouble) - && (instruction->GetType() != Primitive::kPrimFloat); - - if (locations->NeedsSafepoint()) { - if (codegen_->IsLeafMethod()) { - // TODO: We do this here because we do not want the suspend check to artificially - // create live registers. We should find another place, but this is currently the - // simplest. - DCHECK(instruction->IsSuspendCheckEntry()); - instruction->GetBlock()->RemoveInstruction(instruction); - return; - } - safepoints_.push_back(instruction); - if (locations->OnlyCallsOnSlowPath()) { - // We add a synthesized range at this position to record the live registers - // at this position. Ideally, we could just update the safepoints when locations - // are updated, but we currently need to know the full stack size before updating - // locations (because of parameters and the fact that we don't have a frame pointer). - // And knowing the full stack size requires to know the maximum number of live - // registers at calls in slow paths. - // By adding the following interval in the algorithm, we can compute this - // maximum before updating locations. 
- LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); - interval->AddRange(position, position + 1); - AddSorted(&unhandled_core_intervals_, interval); - AddSorted(&unhandled_fp_intervals_, interval); - } - } - - if (locations->WillCall()) { - BlockRegisters(position, position + 1, /* caller_save_only */ true); - } - - for (size_t i = 0; i < locations->GetInputCount(); ++i) { - Location input = locations->InAt(i); - if (input.IsRegister() || input.IsFpuRegister()) { - BlockRegister(input, position, position + 1); - } else if (input.IsPair()) { - BlockRegister(input.ToLow(), position, position + 1); - BlockRegister(input.ToHigh(), position, position + 1); - } - } - - LiveInterval* current = instruction->GetLiveInterval(); - if (current == nullptr) return; - - ArenaVector<LiveInterval*>& unhandled = core_register - ? unhandled_core_intervals_ - : unhandled_fp_intervals_; - - DCHECK(unhandled.empty() || current->StartsBeforeOrAt(unhandled.back())); - - if (codegen_->NeedsTwoRegisters(current->GetType())) { - current->AddHighInterval(); - } - - for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { - HInstruction* safepoint = safepoints_[safepoint_index - 1u]; - size_t safepoint_position = safepoint->GetLifetimePosition(); - - // Test that safepoints are ordered in the optimal way. - DCHECK(safepoint_index == safepoints_.size() || - safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); - - if (safepoint_position == current->GetStart()) { - // The safepoint is for this instruction, so the location of the instruction - // does not need to be saved. - DCHECK_EQ(safepoint_index, safepoints_.size()); - DCHECK_EQ(safepoint, instruction); - continue; - } else if (current->IsDeadAt(safepoint_position)) { - break; - } else if (!current->Covers(safepoint_position)) { - // Hole in the interval. - continue; - } - current->AddSafepoint(safepoint); - } - current->ResetSearchCache(); - - // Some instructions define their output in fixed register/stack slot. We need - // to ensure we know these locations before doing register allocation. For a - // given register, we create an interval that covers these locations. The register - // will be unavailable at these locations when trying to allocate one for an - // interval. - // - // The backwards walking ensures the ranges are ordered on increasing start positions. - Location output = locations->Out(); - if (output.IsUnallocated() && output.GetPolicy() == Location::kSameAsFirstInput) { - Location first = locations->InAt(0); - if (first.IsRegister() || first.IsFpuRegister()) { - current->SetFrom(position + 1); - current->SetRegister(first.reg()); - } else if (first.IsPair()) { - current->SetFrom(position + 1); - current->SetRegister(first.low()); - LiveInterval* high = current->GetHighInterval(); - high->SetRegister(first.high()); - high->SetFrom(position + 1); - } - } else if (output.IsRegister() || output.IsFpuRegister()) { - // Shift the interval's start by one to account for the blocked register. 
- current->SetFrom(position + 1); - current->SetRegister(output.reg()); - BlockRegister(output, position, position + 1); - } else if (output.IsPair()) { - current->SetFrom(position + 1); - current->SetRegister(output.low()); - LiveInterval* high = current->GetHighInterval(); - high->SetRegister(output.high()); - high->SetFrom(position + 1); - BlockRegister(output.ToLow(), position, position + 1); - BlockRegister(output.ToHigh(), position, position + 1); - } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) { - current->SetSpillSlot(output.GetStackIndex()); - } else { - DCHECK(output.IsUnallocated() || output.IsConstant()); - } - - if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { - AllocateSpillSlotForCatchPhi(instruction->AsPhi()); - } - - // If needed, add interval to the list of unhandled intervals. - if (current->HasSpillSlot() || instruction->IsConstant()) { - // Split just before first register use. - size_t first_register_use = current->FirstRegisterUse(); - if (first_register_use != kNoLifetime) { - LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); - // Don't add directly to `unhandled`, it needs to be sorted and the start - // of this new interval might be after intervals already in the list. - AddSorted(&unhandled, split); - } else { - // Nothing to do, we won't allocate a register for this value. - } - } else { - // Don't add directly to `unhandled`, temp or safepoint intervals - // for this instruction may have been added, and those can be - // processed first. - AddSorted(&unhandled, current); - } -} - class AllRangesIterator : public ValueObject { public: explicit AllRangesIterator(LiveInterval* interval) @@ -442,36 +88,6 @@ class AllRangesIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(AllRangesIterator); }; -bool RegisterAllocator::ValidateInternal(bool log_fatal_on_failure) const { - // To simplify unit testing, we eagerly create the array of intervals, and - // call the helper method. - ArenaVector<LiveInterval*> intervals(allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); - for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - if (ShouldProcess(processing_core_registers_, instruction->GetLiveInterval())) { - intervals.push_back(instruction->GetLiveInterval()); - } - } - - const ArenaVector<LiveInterval*>* physical_register_intervals = processing_core_registers_ - ? 
&physical_core_register_intervals_ - : &physical_fp_register_intervals_; - for (LiveInterval* fixed : *physical_register_intervals) { - if (fixed != nullptr) { - intervals.push_back(fixed); - } - } - - for (LiveInterval* temp : temp_intervals_) { - if (ShouldProcess(processing_core_registers_, temp)) { - intervals.push_back(temp); - } - } - - return ValidateIntervals(intervals, GetNumberOfSpillSlots(), reserved_out_slots_, *codegen_, - allocator_, processing_core_registers_, log_fatal_on_failure); -} - bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& intervals, size_t number_of_spill_slots, size_t number_of_out_slots, @@ -550,6 +166,19 @@ bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& inte } else { codegen.DumpFloatingPointRegister(message, current->GetRegister()); } + for (LiveInterval* interval : intervals) { + if (interval->HasRegister() + && interval->GetRegister() == current->GetRegister() + && interval->CoversSlow(j)) { + message << std::endl; + if (interval->GetDefinedBy() != nullptr) { + message << interval->GetDefinedBy()->GetKind() << " "; + } else { + message << "physical "; + } + interval->Dump(message); + } + } LOG(FATAL) << message.str(); } else { return false; @@ -564,638 +193,30 @@ bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& inte return true; } -void RegisterAllocator::DumpInterval(std::ostream& stream, LiveInterval* interval) const { - interval->Dump(stream); - stream << ": "; - if (interval->HasRegister()) { - if (interval->IsFloatingPoint()) { - codegen_->DumpFloatingPointRegister(stream, interval->GetRegister()); - } else { - codegen_->DumpCoreRegister(stream, interval->GetRegister()); - } - } else { - stream << "spilled"; - } - stream << std::endl; -} - -void RegisterAllocator::DumpAllIntervals(std::ostream& stream) const { - stream << "inactive: " << std::endl; - for (LiveInterval* inactive_interval : inactive_) { - DumpInterval(stream, inactive_interval); - } - stream << "active: " << std::endl; - for (LiveInterval* active_interval : active_) { - DumpInterval(stream, active_interval); - } - stream << "unhandled: " << std::endl; - auto unhandled = (unhandled_ != nullptr) ? - unhandled_ : &unhandled_core_intervals_; - for (LiveInterval* unhandled_interval : *unhandled) { - DumpInterval(stream, unhandled_interval); - } - stream << "handled: " << std::endl; - for (LiveInterval* handled_interval : handled_) { - DumpInterval(stream, handled_interval); - } -} - -// By the book implementation of a linear scan register allocator. -void RegisterAllocator::LinearScan() { - while (!unhandled_->empty()) { - // (1) Remove interval with the lowest start position from unhandled. - LiveInterval* current = unhandled_->back(); - unhandled_->pop_back(); - - // Make sure the interval is an expected state. - DCHECK(!current->IsFixed() && !current->HasSpillSlot()); - // Make sure we are going in the right order. - DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() >= current->GetStart()); - // Make sure a low interval is always with a high. - DCHECK(!current->IsLowInterval() || unhandled_->back()->IsHighInterval()); - // Make sure a high interval is always with a low. - DCHECK(current->IsLowInterval() || - unhandled_->empty() || - !unhandled_->back()->IsHighInterval()); - - size_t position = current->GetStart(); - - // Remember the inactive_ size here since the ones moved to inactive_ from - // active_ below shouldn't need to be re-checked. 
- size_t inactive_intervals_to_handle = inactive_.size(); - - // (2) Remove currently active intervals that are dead at this position. - // Move active intervals that have a lifetime hole at this position - // to inactive. - auto active_kept_end = std::remove_if( - active_.begin(), - active_.end(), - [this, position](LiveInterval* interval) { - if (interval->IsDeadAt(position)) { - handled_.push_back(interval); - return true; - } else if (!interval->Covers(position)) { - inactive_.push_back(interval); - return true; - } else { - return false; // Keep this interval. - } - }); - active_.erase(active_kept_end, active_.end()); - - // (3) Remove currently inactive intervals that are dead at this position. - // Move inactive intervals that cover this position to active. - auto inactive_to_handle_end = inactive_.begin() + inactive_intervals_to_handle; - auto inactive_kept_end = std::remove_if( - inactive_.begin(), - inactive_to_handle_end, - [this, position](LiveInterval* interval) { - DCHECK(interval->GetStart() < position || interval->IsFixed()); - if (interval->IsDeadAt(position)) { - handled_.push_back(interval); - return true; - } else if (interval->Covers(position)) { - active_.push_back(interval); - return true; - } else { - return false; // Keep this interval. - } - }); - inactive_.erase(inactive_kept_end, inactive_to_handle_end); - - if (current->IsSlowPathSafepoint()) { - // Synthesized interval to record the maximum number of live registers - // at safepoints. No need to allocate a register for it. - if (processing_core_registers_) { - maximum_number_of_live_core_registers_ = - std::max(maximum_number_of_live_core_registers_, active_.size()); - } else { - maximum_number_of_live_fp_registers_ = - std::max(maximum_number_of_live_fp_registers_, active_.size()); - } - DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() > current->GetStart()); - continue; - } - - if (current->IsHighInterval() && !current->GetLowInterval()->HasRegister()) { - DCHECK(!current->HasRegister()); - // Allocating the low part was unsucessful. The splitted interval for the high part - // will be handled next (it is in the `unhandled_` list). - continue; - } - - // (4) Try to find an available register. - bool success = TryAllocateFreeReg(current); - - // (5) If no register could be found, we need to spill. - if (!success) { - success = AllocateBlockedReg(current); - } - - // (6) If the interval had a register allocated, add it to the list of active - // intervals. - if (success) { - codegen_->AddAllocatedRegister(processing_core_registers_ - ? Location::RegisterLocation(current->GetRegister()) - : Location::FpuRegisterLocation(current->GetRegister())); - active_.push_back(current); - if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) { - current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister())); - } - } - } -} - -static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) { - DCHECK(!interval->IsHighInterval()); - // Note that the same instruction may occur multiple times in the input list, - // so `free_until` may have changed already. - // Since `position` is not the current scan position, we need to use CoversSlow. - if (interval->IsDeadAt(position)) { - // Set the register to be free. Note that inactive intervals might later - // update this. 
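The LinearScan() routine removed in the hunk above follows the textbook shape: pop the unhandled interval with the lowest start position, retire or deactivate intervals that no longer cover that position, then try a free register before falling back to spilling. Below is a deliberately simplified, self-contained sketch of that shape, not the removed implementation: it assumes a single register class and intervals without lifetime holes (so there is no inactive set, i.e. step (3) above is skipped), and it ignores pairs, fixed intervals, and safepoint bookkeeping.

#include <algorithm>
#include <cassert>
#include <vector>

struct Interval { int start; int end; int reg; bool spilled; };

void LinearScan(std::vector<Interval*>& intervals, int num_regs) {
  // (1) Process intervals in order of increasing start position.
  std::sort(intervals.begin(), intervals.end(),
            [](const Interval* a, const Interval* b) { return a->start < b->start; });
  std::vector<Interval*> active;
  std::vector<bool> used(num_regs, false);
  for (Interval* current : intervals) {
    // (2) Retire active intervals that end at or before the current position.
    for (auto it = active.begin(); it != active.end();) {
      if ((*it)->end <= current->start) {
        used[(*it)->reg] = false;
        it = active.erase(it);
      } else {
        ++it;
      }
    }
    // (4) Try to find a free register.
    int reg = -1;
    for (int i = 0; i < num_regs; ++i) {
      if (!used[i]) { reg = i; break; }
    }
    if (reg == -1) {
      // (5) No register left: spill whichever of `current` and the
      // furthest-ending active interval dies last.
      auto it = std::max_element(active.begin(), active.end(),
          [](const Interval* a, const Interval* b) { return a->end < b->end; });
      if ((*it)->end > current->end) {
        current->reg = (*it)->reg;
        (*it)->reg = -1;
        (*it)->spilled = true;
        *it = current;
      } else {
        current->spilled = true;
      }
      continue;
    }
    // (6) Register found: keep the interval in the active set.
    current->reg = reg;
    used[reg] = true;
    active.push_back(current);
  }
}

int main() {
  Interval a{0, 10, -1, false}, b{2, 4, -1, false}, c{3, 12, -1, false};
  std::vector<Interval*> worklist{&a, &b, &c};
  LinearScan(worklist, /*num_regs=*/2);
  assert(a.reg != -1 && b.reg != -1);  // Two registers are enough for a and b.
  assert(c.spilled);                   // c overlaps both and ends last, so it spills.
  return 0;
}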
- free_until[interval->GetRegister()] = kMaxLifetimePosition; - if (interval->HasHighInterval()) { - DCHECK(interval->GetHighInterval()->IsDeadAt(position)); - free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition; - } - } else if (!interval->CoversSlow(position)) { - // The interval becomes inactive at `defined_by`. We make its register - // available only until the next use strictly after `defined_by`. - free_until[interval->GetRegister()] = interval->FirstUseAfter(position); +LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) { + DCHECK_GE(position, interval->GetStart()); + DCHECK(!interval->IsDeadAt(position)); + if (position == interval->GetStart()) { + // Spill slot will be allocated when handling `interval` again. + interval->ClearRegister(); if (interval->HasHighInterval()) { - DCHECK(!interval->GetHighInterval()->CoversSlow(position)); - free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()]; - } - } -} - -// Find a free register. If multiple are found, pick the register that -// is free the longest. -bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) { - size_t* free_until = registers_array_; - - // First set all registers to be free. - for (size_t i = 0; i < number_of_registers_; ++i) { - free_until[i] = kMaxLifetimePosition; - } - - // For each active interval, set its register to not free. - for (LiveInterval* interval : active_) { - DCHECK(interval->HasRegister()); - free_until[interval->GetRegister()] = 0; - } - - // An interval that starts an instruction (that is, it is not split), may - // re-use the registers used by the inputs of that instruciton, based on the - // location summary. - HInstruction* defined_by = current->GetDefinedBy(); - if (defined_by != nullptr && !current->IsSplit()) { - LocationSummary* locations = defined_by->GetLocations(); - if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) { - HInputsRef inputs = defined_by->GetInputs(); - for (size_t i = 0; i < inputs.size(); ++i) { - // Take the last interval of the input. It is the location of that interval - // that will be used at `defined_by`. - LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); - // Note that interval may have not been processed yet. - // TODO: Handle non-split intervals last in the work list. - if (locations->InAt(i).IsValid() - && interval->HasRegister() - && interval->SameRegisterKind(*current)) { - // The input must be live until the end of `defined_by`, to comply to - // the linear scan algorithm. So we use `defined_by`'s end lifetime - // position to check whether the input is dead or is inactive after - // `defined_by`. - DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); - size_t position = defined_by->GetLifetimePosition() + 1; - FreeIfNotCoverAt(interval, position, free_until); - } - } - } - } - - // For each inactive interval, set its register to be free until - // the next intersection with `current`. - for (LiveInterval* inactive : inactive_) { - // Temp/Slow-path-safepoint interval has no holes. - DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. 
- DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - continue; - } - - DCHECK(inactive->HasRegister()); - if (free_until[inactive->GetRegister()] == 0) { - // Already used by some active interval. No need to intersect. - continue; - } - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - free_until[inactive->GetRegister()] = - std::min(free_until[inactive->GetRegister()], next_intersection); - } - } - - int reg = kNoRegister; - if (current->HasRegister()) { - // Some instructions have a fixed register output. - reg = current->GetRegister(); - if (free_until[reg] == 0) { - DCHECK(current->IsHighInterval()); - // AllocateBlockedReg will spill the holder of the register. - return false; - } - } else { - DCHECK(!current->IsHighInterval()); - int hint = current->FindFirstRegisterHint(free_until, liveness_); - if ((hint != kNoRegister) - // For simplicity, if the hint we are getting for a pair cannot be used, - // we are just going to allocate a new pair. - && !(current->IsLowInterval() && IsBlocked(GetHighForLowRegister(hint)))) { - DCHECK(!IsBlocked(hint)); - reg = hint; - } else if (current->IsLowInterval()) { - reg = FindAvailableRegisterPair(free_until, current->GetStart()); - } else { - reg = FindAvailableRegister(free_until, current); - } - } - - DCHECK_NE(reg, kNoRegister); - // If we could not find a register, we need to spill. - if (free_until[reg] == 0) { - return false; - } - - if (current->IsLowInterval()) { - // If the high register of this interval is not available, we need to spill. - int high_reg = current->GetHighInterval()->GetRegister(); - if (high_reg == kNoRegister) { - high_reg = GetHighForLowRegister(reg); - } - if (free_until[high_reg] == 0) { - return false; - } - } - - current->SetRegister(reg); - if (!current->IsDeadAt(free_until[reg])) { - // If the register is only available for a subset of live ranges - // covered by `current`, split `current` before the position where - // the register is not available anymore. - LiveInterval* split = SplitBetween(current, current->GetStart(), free_until[reg]); - DCHECK(split != nullptr); - AddSorted(unhandled_, split); - } - return true; -} - -bool RegisterAllocator::IsBlocked(int reg) const { - return processing_core_registers_ - ? blocked_core_registers_[reg] - : blocked_fp_registers_[reg]; -} - -int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const { - int reg = kNoRegister; - // Pick the register pair that is used the last. - for (size_t i = 0; i < number_of_registers_; ++i) { - if (IsBlocked(i)) continue; - if (!IsLowRegister(i)) continue; - int high_register = GetHighForLowRegister(i); - if (IsBlocked(high_register)) continue; - int existing_high_register = GetHighForLowRegister(reg); - if ((reg == kNoRegister) || (next_use[i] >= next_use[reg] - && next_use[high_register] >= next_use[existing_high_register])) { - reg = i; - if (next_use[i] == kMaxLifetimePosition - && next_use[high_register] == kMaxLifetimePosition) { - break; - } - } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) { - // If one of the current register is known to be unavailable, just unconditionally - // try a new one. - reg = i; - } - } - return reg; -} - -bool RegisterAllocator::IsCallerSaveRegister(int reg) const { - return processing_core_registers_ - ? 
!codegen_->IsCoreCalleeSaveRegister(reg) - : !codegen_->IsFloatingPointCalleeSaveRegister(reg); -} - -int RegisterAllocator::FindAvailableRegister(size_t* next_use, LiveInterval* current) const { - // We special case intervals that do not span a safepoint to try to find a caller-save - // register if one is available. We iterate from 0 to the number of registers, - // so if there are caller-save registers available at the end, we continue the iteration. - bool prefers_caller_save = !current->HasWillCallSafepoint(); - int reg = kNoRegister; - for (size_t i = 0; i < number_of_registers_; ++i) { - if (IsBlocked(i)) { - // Register cannot be used. Continue. - continue; - } - - // Best case: we found a register fully available. - if (next_use[i] == kMaxLifetimePosition) { - if (prefers_caller_save && !IsCallerSaveRegister(i)) { - // We can get shorter encodings on some platforms by using - // small register numbers. So only update the candidate if the previous - // one was not available for the whole method. - if (reg == kNoRegister || next_use[reg] != kMaxLifetimePosition) { - reg = i; - } - // Continue the iteration in the hope of finding a caller save register. - continue; - } else { - reg = i; - // We know the register is good enough. Return it. - break; - } - } - - // If we had no register before, take this one as a reference. - if (reg == kNoRegister) { - reg = i; - continue; - } - - // Pick the register that is used the last. - if (next_use[i] > next_use[reg]) { - reg = i; - continue; - } - } - return reg; -} - -// Remove interval and its other half if any. Return iterator to the following element. -static ArenaVector<LiveInterval*>::iterator RemoveIntervalAndPotentialOtherHalf( - ArenaVector<LiveInterval*>* intervals, ArenaVector<LiveInterval*>::iterator pos) { - DCHECK(intervals->begin() <= pos && pos < intervals->end()); - LiveInterval* interval = *pos; - if (interval->IsLowInterval()) { - DCHECK(pos + 1 < intervals->end()); - DCHECK_EQ(*(pos + 1), interval->GetHighInterval()); - return intervals->erase(pos, pos + 2); - } else if (interval->IsHighInterval()) { - DCHECK(intervals->begin() < pos); - DCHECK_EQ(*(pos - 1), interval->GetLowInterval()); - return intervals->erase(pos - 1, pos + 1); - } else { - return intervals->erase(pos); - } -} - -bool RegisterAllocator::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, - size_t first_register_use, - size_t* next_use) { - for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { - LiveInterval* active = *it; - DCHECK(active->HasRegister()); - if (active->IsFixed()) continue; - if (active->IsHighInterval()) continue; - if (first_register_use > next_use[active->GetRegister()]) continue; - - // Split the first interval found that is either: - // 1) A non-pair interval. - // 2) A pair interval whose high is not low + 1. - // 3) A pair interval whose low is not even. - if (!active->IsLowInterval() || - IsLowOfUnalignedPairInterval(active) || - !IsLowRegister(active->GetRegister())) { - LiveInterval* split = Split(active, position); - if (split != active) { - handled_.push_back(active); - } - RemoveIntervalAndPotentialOtherHalf(&active_, it); - AddSorted(unhandled_, split); - return true; - } - } - return false; -} - -// Find the register that is used the last, and spill the interval -// that holds it. If the first use of `current` is after that register -// we spill `current` instead. 
-bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) { - size_t first_register_use = current->FirstRegisterUse(); - if (current->HasRegister()) { - DCHECK(current->IsHighInterval()); - // The low interval has allocated the register for the high interval. In - // case the low interval had to split both intervals, we may end up in a - // situation where the high interval does not have a register use anymore. - // We must still proceed in order to split currently active and inactive - // uses of the high interval's register, and put the high interval in the - // active set. - DCHECK(first_register_use != kNoLifetime || (current->GetNextSibling() != nullptr)); - } else if (first_register_use == kNoLifetime) { - AllocateSpillSlotFor(current); - return false; - } - - // First set all registers as not being used. - size_t* next_use = registers_array_; - for (size_t i = 0; i < number_of_registers_; ++i) { - next_use[i] = kMaxLifetimePosition; - } - - // For each active interval, find the next use of its register after the - // start of current. - for (LiveInterval* active : active_) { - DCHECK(active->HasRegister()); - if (active->IsFixed()) { - next_use[active->GetRegister()] = current->GetStart(); - } else { - size_t use = active->FirstRegisterUseAfter(current->GetStart()); - if (use != kNoLifetime) { - next_use[active->GetRegister()] = use; - } - } - } - - // For each inactive interval, find the next use of its register after the - // start of current. - for (LiveInterval* inactive : inactive_) { - // Temp/Slow-path-safepoint interval has no holes. - DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. - DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - continue; - } - DCHECK(inactive->HasRegister()); - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - if (inactive->IsFixed()) { - next_use[inactive->GetRegister()] = - std::min(next_intersection, next_use[inactive->GetRegister()]); - } else { - size_t use = inactive->FirstUseAfter(current->GetStart()); - if (use != kNoLifetime) { - next_use[inactive->GetRegister()] = std::min(use, next_use[inactive->GetRegister()]); - } - } - } - } - - int reg = kNoRegister; - bool should_spill = false; - if (current->HasRegister()) { - DCHECK(current->IsHighInterval()); - reg = current->GetRegister(); - // When allocating the low part, we made sure the high register was available. - DCHECK_LT(first_register_use, next_use[reg]); - } else if (current->IsLowInterval()) { - reg = FindAvailableRegisterPair(next_use, first_register_use); - // We should spill if both registers are not available. 
- should_spill = (first_register_use >= next_use[reg]) - || (first_register_use >= next_use[GetHighForLowRegister(reg)]); - } else { - DCHECK(!current->IsHighInterval()); - reg = FindAvailableRegister(next_use, current); - should_spill = (first_register_use >= next_use[reg]); - } - - DCHECK_NE(reg, kNoRegister); - if (should_spill) { - DCHECK(!current->IsHighInterval()); - bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1)); - if (is_allocation_at_use_site) { - if (!current->IsLowInterval()) { - DumpInterval(std::cerr, current); - DumpAllIntervals(std::cerr); - // This situation has the potential to infinite loop, so we make it a non-debug CHECK. - HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2); - CHECK(false) << "There is not enough registers available for " - << current->GetParent()->GetDefinedBy()->DebugName() << " " - << current->GetParent()->GetDefinedBy()->GetId() - << " at " << first_register_use - 1 << " " - << (at == nullptr ? "" : at->DebugName()); - } - - // If we're allocating a register for `current` because the instruction at - // that position requires it, but we think we should spill, then there are - // non-pair intervals or unaligned pair intervals blocking the allocation. - // We split the first interval found, and put ourselves first in the - // `unhandled_` list. - bool success = TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(), - first_register_use, - next_use); - DCHECK(success); - LiveInterval* existing = unhandled_->back(); - DCHECK(existing->IsHighInterval()); - DCHECK_EQ(existing->GetLowInterval(), current); - unhandled_->push_back(current); - } else { - // If the first use of that instruction is after the last use of the found - // register, we split this interval just before its first register use. - AllocateSpillSlotFor(current); - LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); - DCHECK(current != split); - AddSorted(unhandled_, split); + interval->GetHighInterval()->ClearRegister(); + } else if (interval->HasLowInterval()) { + interval->GetLowInterval()->ClearRegister(); } - return false; + return interval; } else { - // Use this register and spill the active and inactives interval that - // have that register. - current->SetRegister(reg); - - for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { - LiveInterval* active = *it; - if (active->GetRegister() == reg) { - DCHECK(!active->IsFixed()); - LiveInterval* split = Split(active, current->GetStart()); - if (split != active) { - handled_.push_back(active); - } - RemoveIntervalAndPotentialOtherHalf(&active_, it); - AddSorted(unhandled_, split); - break; - } - } - - // NOTE: Retrieve end() on each iteration because we're removing elements in the loop body. - for (auto it = inactive_.begin(); it != inactive_.end(); ) { - LiveInterval* inactive = *it; - bool erased = false; - if (inactive->GetRegister() == reg) { - if (!current->IsSplit() && !inactive->IsFixed()) { - // Neither current nor inactive are fixed. - // Thanks to SSA, a non-split interval starting in a hole of an - // inactive interval should never intersect with that inactive interval. - // Only if it's not fixed though, because fixed intervals don't come from SSA. 
- DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); - } else { - size_t next_intersection = inactive->FirstIntersectionWith(current); - if (next_intersection != kNoLifetime) { - if (inactive->IsFixed()) { - LiveInterval* split = Split(current, next_intersection); - DCHECK_NE(split, current); - AddSorted(unhandled_, split); - } else { - // Split at the start of `current`, which will lead to splitting - // at the end of the lifetime hole of `inactive`. - LiveInterval* split = Split(inactive, current->GetStart()); - // If it's inactive, it must start before the current interval. - DCHECK_NE(split, inactive); - it = RemoveIntervalAndPotentialOtherHalf(&inactive_, it); - erased = true; - handled_.push_back(inactive); - AddSorted(unhandled_, split); - } - } - } - } - // If we have erased the element, `it` already points to the next element. - // Otherwise we need to move to the next element. - if (!erased) { - ++it; - } - } - - return true; - } -} - -void RegisterAllocator::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) { - DCHECK(!interval->IsFixed() && !interval->HasSpillSlot()); - size_t insert_at = 0; - for (size_t i = array->size(); i > 0; --i) { - LiveInterval* current = (*array)[i - 1u]; - // High intervals must be processed right after their low equivalent. - if (current->StartsAfter(interval) && !current->IsHighInterval()) { - insert_at = i; - break; - } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) { - // Ensure the slow path interval is the last to be processed at its location: we want the - // interval to know all live registers at this location. - DCHECK(i == 1 || (*array)[i - 2u]->StartsAfter(current)); - insert_at = i; - break; + LiveInterval* new_interval = interval->SplitAt(position); + if (interval->HasHighInterval()) { + LiveInterval* high = interval->GetHighInterval()->SplitAt(position); + new_interval->SetHighInterval(high); + high->SetLowInterval(new_interval); + } else if (interval->HasLowInterval()) { + LiveInterval* low = interval->GetLowInterval()->SplitAt(position); + new_interval->SetLowInterval(low); + low->SetHighInterval(new_interval); } - } - - // Insert the high interval before the low, to ensure the low is processed before. - auto insert_pos = array->begin() + insert_at; - if (interval->HasHighInterval()) { - array->insert(insert_pos, { interval->GetHighInterval(), interval }); - } else if (interval->HasLowInterval()) { - array->insert(insert_pos, { interval, interval->GetLowInterval() }); - } else { - array->insert(insert_pos, interval); + return new_interval; } } @@ -1258,748 +279,4 @@ LiveInterval* RegisterAllocator::SplitBetween(LiveInterval* interval, size_t fro return Split(interval, block_to->GetLifetimeStart()); } -LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) { - DCHECK_GE(position, interval->GetStart()); - DCHECK(!interval->IsDeadAt(position)); - if (position == interval->GetStart()) { - // Spill slot will be allocated when handling `interval` again. 
- interval->ClearRegister(); - if (interval->HasHighInterval()) { - interval->GetHighInterval()->ClearRegister(); - } else if (interval->HasLowInterval()) { - interval->GetLowInterval()->ClearRegister(); - } - return interval; - } else { - LiveInterval* new_interval = interval->SplitAt(position); - if (interval->HasHighInterval()) { - LiveInterval* high = interval->GetHighInterval()->SplitAt(position); - new_interval->SetHighInterval(high); - high->SetLowInterval(new_interval); - } else if (interval->HasLowInterval()) { - LiveInterval* low = interval->GetLowInterval()->SplitAt(position); - new_interval->SetLowInterval(low); - low->SetHighInterval(new_interval); - } - return new_interval; - } -} - -void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) { - if (interval->IsHighInterval()) { - // The low interval already took care of allocating the spill slot. - DCHECK(!interval->GetLowInterval()->HasRegister()); - DCHECK(interval->GetLowInterval()->GetParent()->HasSpillSlot()); - return; - } - - LiveInterval* parent = interval->GetParent(); - - // An instruction gets a spill slot for its entire lifetime. If the parent - // of this interval already has a spill slot, there is nothing to do. - if (parent->HasSpillSlot()) { - return; - } - - HInstruction* defined_by = parent->GetDefinedBy(); - DCHECK(!defined_by->IsPhi() || !defined_by->AsPhi()->IsCatchPhi()); - - if (defined_by->IsParameterValue()) { - // Parameters have their own stack slot. - parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); - return; - } - - if (defined_by->IsCurrentMethod()) { - parent->SetSpillSlot(0); - return; - } - - if (defined_by->IsConstant()) { - // Constants don't need a spill slot. - return; - } - - ArenaVector<size_t>* spill_slots = nullptr; - switch (interval->GetType()) { - case Primitive::kPrimDouble: - spill_slots = &double_spill_slots_; - break; - case Primitive::kPrimLong: - spill_slots = &long_spill_slots_; - break; - case Primitive::kPrimFloat: - spill_slots = &float_spill_slots_; - break; - case Primitive::kPrimNot: - case Primitive::kPrimInt: - case Primitive::kPrimChar: - case Primitive::kPrimByte: - case Primitive::kPrimBoolean: - case Primitive::kPrimShort: - spill_slots = &int_spill_slots_; - break; - case Primitive::kPrimVoid: - LOG(FATAL) << "Unexpected type for interval " << interval->GetType(); - } - - // Find an available spill slot. - size_t slot = 0; - for (size_t e = spill_slots->size(); slot < e; ++slot) { - if ((*spill_slots)[slot] <= parent->GetStart() - && (slot == (e - 1) || (*spill_slots)[slot + 1] <= parent->GetStart())) { - break; - } - } - - size_t end = interval->GetLastSibling()->GetEnd(); - if (parent->NeedsTwoSpillSlots()) { - if (slot + 2u > spill_slots->size()) { - // We need a new spill slot. - spill_slots->resize(slot + 2u, end); - } - (*spill_slots)[slot] = end; - (*spill_slots)[slot + 1] = end; - } else { - if (slot == spill_slots->size()) { - // We need a new spill slot. - spill_slots->push_back(end); - } else { - (*spill_slots)[slot] = end; - } - } - - // Note that the exact spill slot location will be computed when we resolve, - // that is when we know the number of spill slots for each type. 
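The slot search in AllocateSpillSlotFor() above reuses a spill slot once the interval that last occupied it has ended. The following is a minimal sketch of that reuse policy, tracking only the recorded end position per slot; it deliberately omits the adjacent-slot check the real code performs for values that need two slots. The AllocateSlot name is hypothetical.

#include <cassert>
#include <vector>

// Each entry records the end position of the interval that last used the slot;
// a slot can be recycled once that end is not after the new interval's start.
size_t AllocateSlot(std::vector<size_t>* slot_ends, size_t start, size_t end) {
  for (size_t slot = 0; slot < slot_ends->size(); ++slot) {
    if ((*slot_ends)[slot] <= start) {
      (*slot_ends)[slot] = end;
      return slot;
    }
  }
  slot_ends->push_back(end);  // No reusable slot: grow the pool.
  return slot_ends->size() - 1;
}

int main() {
  std::vector<size_t> int_slots;
  size_t s0 = AllocateSlot(&int_slots, /*start=*/0, /*end=*/10);
  size_t s1 = AllocateSlot(&int_slots, /*start=*/4, /*end=*/20);   // Still overlaps slot 0.
  size_t s2 = AllocateSlot(&int_slots, /*start=*/12, /*end=*/30);  // Slot 0 is free again.
  assert(s0 == 0 && s1 == 1 && s2 == 0);
  return 0;
}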
- parent->SetSpillSlot(slot); -} - -static bool IsValidDestination(Location destination) { - return destination.IsRegister() - || destination.IsRegisterPair() - || destination.IsFpuRegister() - || destination.IsFpuRegisterPair() - || destination.IsStackSlot() - || destination.IsDoubleStackSlot(); -} - -void RegisterAllocator::AllocateSpillSlotForCatchPhi(HPhi* phi) { - LiveInterval* interval = phi->GetLiveInterval(); - - HInstruction* previous_phi = phi->GetPrevious(); - DCHECK(previous_phi == nullptr || - previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) - << "Phis expected to be sorted by vreg number, so that equivalent phis are adjacent."; - - if (phi->IsVRegEquivalentOf(previous_phi)) { - // This is an equivalent of the previous phi. We need to assign the same - // catch phi slot. - DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); - interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); - } else { - // Allocate a new spill slot for this catch phi. - // TODO: Reuse spill slots when intervals of phis from different catch - // blocks do not overlap. - interval->SetSpillSlot(catch_phi_spill_slots_); - catch_phi_spill_slots_ += interval->NeedsTwoSpillSlots() ? 2 : 1; - } -} - -void RegisterAllocator::AddMove(HParallelMove* move, - Location source, - Location destination, - HInstruction* instruction, - Primitive::Type type) const { - if (type == Primitive::kPrimLong - && codegen_->ShouldSplitLongMoves() - // The parallel move resolver knows how to deal with long constants. - && !source.IsConstant()) { - move->AddMove(source.ToLow(), destination.ToLow(), Primitive::kPrimInt, instruction); - move->AddMove(source.ToHigh(), destination.ToHigh(), Primitive::kPrimInt, nullptr); - } else { - move->AddMove(source, destination, type, instruction); - } -} - -void RegisterAllocator::AddInputMoveFor(HInstruction* input, - HInstruction* user, - Location source, - Location destination) const { - if (source.Equals(destination)) return; - - DCHECK(!user->IsPhi()); - - HInstruction* previous = user->GetPrevious(); - HParallelMove* move = nullptr; - if (previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() < user->GetLifetimePosition()) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(user->GetLifetimePosition()); - user->GetBlock()->InsertInstructionBefore(move, user); - } else { - move = previous->AsParallelMove(); - } - DCHECK_EQ(move->GetLifetimePosition(), user->GetLifetimePosition()); - AddMove(move, source, destination, nullptr, input->GetType()); -} - -static bool IsInstructionStart(size_t position) { - return (position & 1) == 0; -} - -static bool IsInstructionEnd(size_t position) { - return (position & 1) == 1; -} - -void RegisterAllocator::InsertParallelMoveAt(size_t position, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - HInstruction* at = liveness_.GetInstructionFromPosition(position / 2); - HParallelMove* move; - if (at == nullptr) { - if (IsInstructionStart(position)) { - // Block boundary, don't do anything the connection of split siblings will handle it. - return; - } else { - // Move must happen before the first instruction of the block. 
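InsertParallelMoveAt() above keys off the parity of a lifetime position: even positions fall on the instruction itself (where input moves are placed) and odd positions just after it (where the output move is placed), which is also why GetInstructionFromPosition() divides the position by two. A small sketch of that encoding follows; the instruction index used is hypothetical.

#include <cassert>
#include <cstddef>

static bool IsInstructionStart(size_t position) { return (position & 1) == 0; }
static bool IsInstructionEnd(size_t position) { return (position & 1) == 1; }

int main() {
  size_t instruction_index = 3;          // Hypothetical index in linear order.
  size_t start = 2 * instruction_index;  // Even: the input side of the instruction.
  size_t end = start + 1;                // Odd: the output side of the instruction.
  assert(IsInstructionStart(start) && IsInstructionEnd(end));
  assert(start / 2 == instruction_index && end / 2 == instruction_index);
  return 0;
}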
- at = liveness_.GetInstructionFromPosition((position + 1) / 2); - // Note that parallel moves may have already been inserted, so we explicitly - // ask for the first instruction of the block: `GetInstructionFromPosition` does - // not contain the `HParallelMove` instructions. - at = at->GetBlock()->GetFirstInstruction(); - - if (at->GetLifetimePosition() < position) { - // We may insert moves for split siblings and phi spills at the beginning of the block. - // Since this is a different lifetime position, we need to go to the next instruction. - DCHECK(at->IsParallelMove()); - at = at->GetNext(); - } - - if (at->GetLifetimePosition() != position) { - DCHECK_GT(at->GetLifetimePosition(), position); - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at); - } else { - DCHECK(at->IsParallelMove()); - move = at->AsParallelMove(); - } - } - } else if (IsInstructionEnd(position)) { - // Move must happen after the instruction. - DCHECK(!at->IsControlFlow()); - move = at->GetNext()->AsParallelMove(); - // This is a parallel move for connecting siblings in a same block. We need to - // differentiate it with moves for connecting blocks, and input moves. - if (move == nullptr || move->GetLifetimePosition() > position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at->GetNext()); - } - } else { - // Move must happen before the instruction. - HInstruction* previous = at->GetPrevious(); - if (previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() != position) { - // If the previous is a parallel move, then its position must be lower - // than the given `position`: it was added just after the non-parallel - // move instruction that precedes `instruction`. - DCHECK(previous == nullptr - || !previous->IsParallelMove() - || previous->GetLifetimePosition() < position); - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - at->GetBlock()->InsertInstructionBefore(move, at); - } else { - move = previous->AsParallelMove(); - } - } - DCHECK_EQ(move->GetLifetimePosition(), position); - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertParallelMoveAtExitOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - DCHECK_EQ(block->GetNormalSuccessors().size(), 1u); - HInstruction* last = block->GetLastInstruction(); - // We insert moves at exit for phi predecessors and connecting blocks. - // A block ending with an if or a packed switch cannot branch to a block - // with phis because we do not allow critical edges. It can also not connect - // a split interval between two blocks: the move has to happen in the successor. - DCHECK(!last->IsIf() && !last->IsPackedSwitch()); - HInstruction* previous = last->GetPrevious(); - HParallelMove* move; - // This is a parallel move for connecting blocks. We need to differentiate - // it with moves for connecting siblings in a same block, and output moves. 
- size_t position = last->GetLifetimePosition(); - if (previous == nullptr || !previous->IsParallelMove() - || previous->AsParallelMove()->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - block->InsertInstructionBefore(move, last); - } else { - move = previous->AsParallelMove(); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertParallelMoveAtEntryOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - HInstruction* first = block->GetFirstInstruction(); - HParallelMove* move = first->AsParallelMove(); - size_t position = block->GetLifetimeStart(); - // This is a parallel move for connecting blocks. We need to differentiate - // it with moves for connecting siblings in a same block, and input moves. - if (move == nullptr || move->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - block->InsertInstructionBefore(move, first); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::InsertMoveAfter(HInstruction* instruction, - Location source, - Location destination) const { - DCHECK(IsValidDestination(destination)) << destination; - if (source.Equals(destination)) return; - - if (instruction->IsPhi()) { - InsertParallelMoveAtEntryOf(instruction->GetBlock(), instruction, source, destination); - return; - } - - size_t position = instruction->GetLifetimePosition() + 1; - HParallelMove* move = instruction->GetNext()->AsParallelMove(); - // This is a parallel move for moving the output of an instruction. We need - // to differentiate with input moves, moves for connecting siblings in a - // and moves for connecting blocks. - if (move == nullptr || move->GetLifetimePosition() != position) { - move = new (allocator_) HParallelMove(allocator_); - move->SetLifetimePosition(position); - instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext()); - } - AddMove(move, source, destination, instruction, instruction->GetType()); -} - -void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { - LiveInterval* current = interval; - if (current->HasSpillSlot() - && current->HasRegister() - // Currently, we spill unconditionnally the current method in the code generators. - && !interval->GetDefinedBy()->IsCurrentMethod()) { - // We spill eagerly, so move must be at definition. - InsertMoveAfter(interval->GetDefinedBy(), - interval->ToLocation(), - interval->NeedsTwoSpillSlots() - ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) - : Location::StackSlot(interval->GetParent()->GetSpillSlot())); - } - UsePosition* use = current->GetFirstUse(); - UsePosition* env_use = current->GetFirstEnvironmentUse(); - - // Walk over all siblings, updating locations of use positions, and - // connecting them when they are adjacent. - do { - Location source = current->ToLocation(); - - // Walk over all uses covered by this interval, and update the location - // information. 
- - LiveRange* range = current->GetFirstRange(); - while (range != nullptr) { - while (use != nullptr && use->GetPosition() < range->GetStart()) { - DCHECK(use->IsSynthesized()); - use = use->GetNext(); - } - while (use != nullptr && use->GetPosition() <= range->GetEnd()) { - DCHECK(!use->GetIsEnvironment()); - DCHECK(current->CoversSlow(use->GetPosition()) || (use->GetPosition() == range->GetEnd())); - if (!use->IsSynthesized()) { - LocationSummary* locations = use->GetUser()->GetLocations(); - Location expected_location = locations->InAt(use->GetInputIndex()); - // The expected (actual) location may be invalid in case the input is unused. Currently - // this only happens for intrinsics. - if (expected_location.IsValid()) { - if (expected_location.IsUnallocated()) { - locations->SetInAt(use->GetInputIndex(), source); - } else if (!expected_location.IsConstant()) { - AddInputMoveFor(interval->GetDefinedBy(), use->GetUser(), source, expected_location); - } - } else { - DCHECK(use->GetUser()->IsInvoke()); - DCHECK(use->GetUser()->AsInvoke()->GetIntrinsic() != Intrinsics::kNone); - } - } - use = use->GetNext(); - } - - // Walk over the environment uses, and update their locations. - while (env_use != nullptr && env_use->GetPosition() < range->GetStart()) { - env_use = env_use->GetNext(); - } - - while (env_use != nullptr && env_use->GetPosition() <= range->GetEnd()) { - DCHECK(current->CoversSlow(env_use->GetPosition()) - || (env_use->GetPosition() == range->GetEnd())); - HEnvironment* environment = env_use->GetEnvironment(); - environment->SetLocationAt(env_use->GetInputIndex(), source); - env_use = env_use->GetNext(); - } - - range = range->GetNext(); - } - - // If the next interval starts just after this one, and has a register, - // insert a move. 
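The use-updating loop above is a merge of two sorted sequences, the interval's live ranges and its use positions: uses that fall in a hole before the current range are skipped, and uses covered by the range receive the interval's current location. A simplified standalone sketch of that two-pointer walk, with illustrative types only, not the ART classes:

#include <cstddef>
#include <cstdio>
#include <vector>

struct Range { size_t start; size_t end; };      // Live range, roughly [start, end].
struct Use   { size_t position; int location; }; // Location is filled in during the walk.

// Walk uses (sorted by position) against ranges (sorted by start) and record
// `current_location` for every use covered by some range, mirroring the way
// sibling connection rewrites the use locations above.
void UpdateUseLocations(const std::vector<Range>& ranges,
                        std::vector<Use>* uses,
                        int current_location) {
  size_t u = 0;
  for (const Range& range : ranges) {
    // Skip uses that fall in a lifetime hole before this range.
    while (u < uses->size() && (*uses)[u].position < range.start) {
      ++u;
    }
    // Uses covered by the range (or exactly at its end) get the current location.
    while (u < uses->size() && (*uses)[u].position <= range.end) {
      (*uses)[u].location = current_location;
      ++u;
    }
  }
}

int main() {
  std::vector<Range> ranges = {{2, 6}, {10, 14}};
  std::vector<Use> uses = {{4, -1}, {8, -1}, {12, -1}};
  UpdateUseLocations(ranges, &uses, /*current_location=*/3);
  for (const Use& use : uses) {
    std::printf("use@%zu -> %d\n", use.position, use.location);  // 3, -1 (in a hole), 3.
  }
  return 0;
}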
- LiveInterval* next_sibling = current->GetNextSibling(); - if (next_sibling != nullptr - && next_sibling->HasRegister() - && current->GetEnd() == next_sibling->GetStart()) { - Location destination = next_sibling->ToLocation(); - InsertParallelMoveAt(current->GetEnd(), interval->GetDefinedBy(), source, destination); - } - - for (SafepointPosition* safepoint_position = current->GetFirstSafepoint(); - safepoint_position != nullptr; - safepoint_position = safepoint_position->GetNext()) { - DCHECK(current->CoversSlow(safepoint_position->GetPosition())); - - LocationSummary* locations = safepoint_position->GetLocations(); - if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) { - DCHECK(interval->GetDefinedBy()->IsActualObject()) - << interval->GetDefinedBy()->DebugName() - << "@" << safepoint_position->GetInstruction()->DebugName(); - locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize); - } - - switch (source.GetKind()) { - case Location::kRegister: { - locations->AddLiveRegister(source); - if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) { - DCHECK_LE(locations->GetNumberOfLiveRegisters(), - maximum_number_of_live_core_registers_ + - maximum_number_of_live_fp_registers_); - } - if (current->GetType() == Primitive::kPrimNot) { - DCHECK(interval->GetDefinedBy()->IsActualObject()) - << interval->GetDefinedBy()->DebugName() - << "@" << safepoint_position->GetInstruction()->DebugName(); - locations->SetRegisterBit(source.reg()); - } - break; - } - case Location::kFpuRegister: { - locations->AddLiveRegister(source); - break; - } - - case Location::kRegisterPair: - case Location::kFpuRegisterPair: { - locations->AddLiveRegister(source.ToLow()); - locations->AddLiveRegister(source.ToHigh()); - break; - } - case Location::kStackSlot: // Fall-through - case Location::kDoubleStackSlot: // Fall-through - case Location::kConstant: { - // Nothing to do. - break; - } - default: { - LOG(FATAL) << "Unexpected location for object"; - } - } - } - current = next_sibling; - } while (current != nullptr); - - if (kIsDebugBuild) { - // Following uses can only be synthesized uses. - while (use != nullptr) { - DCHECK(use->IsSynthesized()); - use = use->GetNext(); - } - } -} - -static bool IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop( - HInstruction* instruction) { - return instruction->GetBlock()->GetGraph()->HasIrreducibleLoops() && - (instruction->IsConstant() || instruction->IsCurrentMethod()); -} - -void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval, - HBasicBlock* from, - HBasicBlock* to) const { - if (interval->GetNextSibling() == nullptr) { - // Nothing to connect. The whole range was allocated to the same location. - return; - } - - // Find the intervals that cover `from` and `to`. - size_t destination_position = to->GetLifetimeStart(); - size_t source_position = from->GetLifetimeEnd() - 1; - LiveInterval* destination = interval->GetSiblingAt(destination_position); - LiveInterval* source = interval->GetSiblingAt(source_position); - - if (destination == source) { - // Interval was not split. - return; - } - - LiveInterval* parent = interval->GetParent(); - HInstruction* defined_by = parent->GetDefinedBy(); - if (codegen_->GetGraph()->HasIrreducibleLoops() && - (destination == nullptr || !destination->CoversSlow(destination_position))) { - // Our live_in fixed point calculation has found that the instruction is live - // in the `to` block because it will eventually enter an irreducible loop. 
Our - // live interval computation however does not compute a fixed point, and - // therefore will not have a location for that instruction for `to`. - // Because the instruction is a constant or the ArtMethod, we don't need to - // do anything: it will be materialized in the irreducible loop. - DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)) - << defined_by->DebugName() << ":" << defined_by->GetId() - << " " << from->GetBlockId() << " -> " << to->GetBlockId(); - return; - } - - if (!destination->HasRegister()) { - // Values are eagerly spilled. Spill slot already contains appropriate value. - return; - } - - Location location_source; - // `GetSiblingAt` returns the interval whose start and end cover `position`, - // but does not check whether the interval is inactive at that position. - // The only situation where the interval is inactive at that position is in the - // presence of irreducible loops for constants and ArtMethod. - if (codegen_->GetGraph()->HasIrreducibleLoops() && - (source == nullptr || !source->CoversSlow(source_position))) { - DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)); - if (defined_by->IsConstant()) { - location_source = defined_by->GetLocations()->Out(); - } else { - DCHECK(defined_by->IsCurrentMethod()); - location_source = parent->NeedsTwoSpillSlots() - ? Location::DoubleStackSlot(parent->GetSpillSlot()) - : Location::StackSlot(parent->GetSpillSlot()); - } - } else { - DCHECK(source != nullptr); - DCHECK(source->CoversSlow(source_position)); - DCHECK(destination->CoversSlow(destination_position)); - location_source = source->ToLocation(); - } - - // If `from` has only one successor, we can put the moves at the exit of it. Otherwise - // we need to put the moves at the entry of `to`. - if (from->GetNormalSuccessors().size() == 1) { - InsertParallelMoveAtExitOf(from, - defined_by, - location_source, - destination->ToLocation()); - } else { - DCHECK_EQ(to->GetPredecessors().size(), 1u); - InsertParallelMoveAtEntryOf(to, - defined_by, - location_source, - destination->ToLocation()); - } -} - -void RegisterAllocator::Resolve() { - codegen_->InitializeCodeGeneration(GetNumberOfSpillSlots(), - maximum_number_of_live_core_registers_, - maximum_number_of_live_fp_registers_, - reserved_out_slots_, - codegen_->GetGraph()->GetLinearOrder()); - - // Adjust the Out Location of instructions. - // TODO: Use pointers of Location inside LiveInterval to avoid doing another iteration. - for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - LiveInterval* current = instruction->GetLiveInterval(); - LocationSummary* locations = instruction->GetLocations(); - Location location = locations->Out(); - if (instruction->IsParameterValue()) { - // Now that we know the frame size, adjust the parameter's location. 
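As the last comment above says, parameter locations are rebased once the frame size is known; the removed code that follows performs exactly that adjustment for single and double stack slots. A tiny standalone sketch of the rebasing, with a made-up slot type:

#include <cstdio>

// Toy stand-in for a stack-slot location: parameters start out addressed
// relative to the caller's frame and must be rebased by the callee frame size
// once that size is known.
struct StackSlot {
  int byte_offset;
};

StackSlot AdjustParameterSlot(StackSlot slot, int frame_size) {
  // The same rebasing applies whether the value occupies one or two slots.
  return StackSlot{slot.byte_offset + frame_size};
}

int main() {
  constexpr int kFrameSize = 64;
  StackSlot param{8};  // 8 bytes into the caller's argument area.
  StackSlot adjusted = AdjustParameterSlot(param, kFrameSize);
  std::printf("parameter slot: %d -> %d\n", param.byte_offset, adjusted.byte_offset);
  return 0;
}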
- if (location.IsStackSlot()) { - location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); - current->SetSpillSlot(location.GetStackIndex()); - locations->UpdateOut(location); - } else if (location.IsDoubleStackSlot()) { - location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize()); - current->SetSpillSlot(location.GetStackIndex()); - locations->UpdateOut(location); - } else if (current->HasSpillSlot()) { - current->SetSpillSlot(current->GetSpillSlot() + codegen_->GetFrameSize()); - } - } else if (instruction->IsCurrentMethod()) { - // The current method is always at offset 0. - DCHECK(!current->HasSpillSlot() || (current->GetSpillSlot() == 0)); - } else if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { - DCHECK(current->HasSpillSlot()); - size_t slot = current->GetSpillSlot() - + GetNumberOfSpillSlots() - + reserved_out_slots_ - - catch_phi_spill_slots_; - current->SetSpillSlot(slot * kVRegSize); - } else if (current->HasSpillSlot()) { - // Adjust the stack slot, now that we know the number of them for each type. - // The way this implementation lays out the stack is the following: - // [parameter slots ] - // [catch phi spill slots ] - // [double spill slots ] - // [long spill slots ] - // [float spill slots ] - // [int/ref values ] - // [maximum out values ] (number of arguments for calls) - // [art method ]. - size_t slot = current->GetSpillSlot(); - switch (current->GetType()) { - case Primitive::kPrimDouble: - slot += long_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimLong: - slot += float_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimFloat: - slot += int_spill_slots_.size(); - FALLTHROUGH_INTENDED; - case Primitive::kPrimNot: - case Primitive::kPrimInt: - case Primitive::kPrimChar: - case Primitive::kPrimByte: - case Primitive::kPrimBoolean: - case Primitive::kPrimShort: - slot += reserved_out_slots_; - break; - case Primitive::kPrimVoid: - LOG(FATAL) << "Unexpected type for interval " << current->GetType(); - } - current->SetSpillSlot(slot * kVRegSize); - } - - Location source = current->ToLocation(); - - if (location.IsUnallocated()) { - if (location.GetPolicy() == Location::kSameAsFirstInput) { - if (locations->InAt(0).IsUnallocated()) { - locations->SetInAt(0, source); - } else { - DCHECK(locations->InAt(0).Equals(source)); - } - } - locations->UpdateOut(source); - } else { - DCHECK(source.Equals(location)); - } - } - - // Connect siblings. - for (size_t i = 0, e = liveness_.GetNumberOfSsaValues(); i < e; ++i) { - HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); - ConnectSiblings(instruction->GetLiveInterval()); - } - - // Resolve non-linear control flow across branches. Order does not matter. - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* block = it.Current(); - if (block->IsCatchBlock() || - (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { - // Instructions live at the top of catch blocks or irreducible loop header - // were forced to spill. - if (kIsDebugBuild) { - BitVector* live = liveness_.GetLiveInSet(*block); - for (uint32_t idx : live->Indexes()) { - LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval(); - LiveInterval* sibling = interval->GetSiblingAt(block->GetLifetimeStart()); - // `GetSiblingAt` returns the sibling that contains a position, but there could be - // a lifetime hole in it. 
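The spill-slot adjustment above turns a per-type slot index into a final stack offset by accumulating the sizes of the areas laid out below it, with the reserved outgoing-argument slots at the bottom. A self-contained sketch of the same fall-through arithmetic, using invented slot counts to make the layout concrete (not the ART code itself):

#include <cstddef>
#include <cstdio>

// Slot counts for each area, stacked bottom to top following the layout
// comment above: out-arguments, then int/ref, float, long, double spill slots
// (catch phi slots and parameters sit above these).
struct SpillAreas {
  size_t reserved_out_slots;
  size_t int_slots;
  size_t float_slots;
  size_t long_slots;
  size_t double_slots;
};

enum class SlotKind { kIntOrRef, kFloat, kLong, kDouble };

// Mirrors the fall-through accumulation: each kind skips over every area
// allocated below its own, then over the reserved outgoing-argument slots.
size_t FinalSlotIndex(const SpillAreas& areas, SlotKind kind, size_t slot_in_area) {
  size_t slot = slot_in_area;
  switch (kind) {
    case SlotKind::kDouble: slot += areas.long_slots;   [[fallthrough]];
    case SlotKind::kLong:   slot += areas.float_slots;  [[fallthrough]];
    case SlotKind::kFloat:  slot += areas.int_slots;    [[fallthrough]];
    case SlotKind::kIntOrRef:
      slot += areas.reserved_out_slots;
      break;
  }
  return slot;  // Multiply by the vreg size to get a byte offset.
}

int main() {
  SpillAreas areas{/*out*/ 4, /*int*/ 3, /*float*/ 2, /*long*/ 1, /*double*/ 2};
  constexpr size_t kVRegSize = 4;
  // The first double slot sits above the long, float, int/ref and out areas:
  // (0 + 1 + 2 + 3 + 4) * 4 = 40.
  std::printf("first double slot byte offset: %zu\n",
              FinalSlotIndex(areas, SlotKind::kDouble, 0) * kVRegSize);
  return 0;
}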
`CoversSlow` returns whether the interval is live at that - // position. - if ((sibling != nullptr) && sibling->CoversSlow(block->GetLifetimeStart())) { - DCHECK(!sibling->HasRegister()); - } - } - } - } else { - BitVector* live = liveness_.GetLiveInSet(*block); - for (uint32_t idx : live->Indexes()) { - LiveInterval* interval = liveness_.GetInstructionFromSsaIndex(idx)->GetLiveInterval(); - for (HBasicBlock* predecessor : block->GetPredecessors()) { - ConnectSplitSiblings(interval, predecessor, block); - } - } - } - } - - // Resolve phi inputs. Order does not matter. - for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { - HBasicBlock* current = it.Current(); - if (current->IsCatchBlock()) { - // Catch phi values are set at runtime by the exception delivery mechanism. - } else { - for (HInstructionIterator inst_it(current->GetPhis()); !inst_it.Done(); inst_it.Advance()) { - HInstruction* phi = inst_it.Current(); - for (size_t i = 0, e = current->GetPredecessors().size(); i < e; ++i) { - HBasicBlock* predecessor = current->GetPredecessors()[i]; - DCHECK_EQ(predecessor->GetNormalSuccessors().size(), 1u); - HInstruction* input = phi->InputAt(i); - Location source = input->GetLiveInterval()->GetLocationAt( - predecessor->GetLifetimeEnd() - 1); - Location destination = phi->GetLiveInterval()->ToLocation(); - InsertParallelMoveAtExitOf(predecessor, phi, source, destination); - } - } - } - } - - // Assign temp locations. - for (LiveInterval* temp : temp_intervals_) { - if (temp->IsHighInterval()) { - // High intervals can be skipped, they are already handled by the low interval. - continue; - } - HInstruction* at = liveness_.GetTempUser(temp); - size_t temp_index = liveness_.GetTempIndex(temp); - LocationSummary* locations = at->GetLocations(); - switch (temp->GetType()) { - case Primitive::kPrimInt: - locations->SetTempAt(temp_index, Location::RegisterLocation(temp->GetRegister())); - break; - - case Primitive::kPrimDouble: - if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { - Location location = Location::FpuRegisterPairLocation( - temp->GetRegister(), temp->GetHighInterval()->GetRegister()); - locations->SetTempAt(temp_index, location); - } else { - locations->SetTempAt(temp_index, Location::FpuRegisterLocation(temp->GetRegister())); - } - break; - - default: - LOG(FATAL) << "Unexpected type for temporary location " - << temp->GetType(); - } - } -} - } // namespace art diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h index 58600b789b..7e1fff8e2b 100644 --- a/compiler/optimizing/register_allocator.h +++ b/compiler/optimizing/register_allocator.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 The Android Open Source Project + * Copyright (C) 2016 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "arch/instruction_set.h" #include "base/arena_containers.h" +#include "base/arena_object.h" #include "base/macros.h" #include "primitive.h" @@ -29,36 +30,41 @@ class HBasicBlock; class HGraph; class HInstruction; class HParallelMove; -class HPhi; class LiveInterval; class Location; class SsaLivenessAnalysis; /** - * An implementation of a linear scan register allocator on an `HGraph` with SSA form. + * Base class for any register allocator. 
*/ -class RegisterAllocator { +class RegisterAllocator : public ArenaObject<kArenaAllocRegisterAllocator> { public: - RegisterAllocator(ArenaAllocator* allocator, - CodeGenerator* codegen, - const SsaLivenessAnalysis& analysis); + enum Strategy { + kRegisterAllocatorLinearScan, + kRegisterAllocatorGraphColor + }; + + static constexpr Strategy kRegisterAllocatorDefault = kRegisterAllocatorLinearScan; + + static RegisterAllocator* Create(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + Strategy strategy = kRegisterAllocatorDefault); + + virtual ~RegisterAllocator() = default; // Main entry point for the register allocator. Given the liveness analysis, // allocates registers to live intervals. - void AllocateRegisters(); + virtual void AllocateRegisters() = 0; // Validate that the register allocator did not allocate the same register to - // intervals that intersect each other. Returns false if it did not. - bool Validate(bool log_fatal_on_failure) { - processing_core_registers_ = true; - if (!ValidateInternal(log_fatal_on_failure)) { - return false; - } - processing_core_registers_ = false; - return ValidateInternal(log_fatal_on_failure); - } - - // Helper method for validation. Used by unit testing. + // intervals that intersect each other. Returns false if it failed. + virtual bool Validate(bool log_fatal_on_failure) = 0; + + static bool CanAllocateRegistersFor(const HGraph& graph, + InstructionSet instruction_set); + + // Verifies that live intervals do not conflict. Used by unit testing. static bool ValidateIntervals(const ArenaVector<LiveInterval*>& intervals, size_t number_of_spill_slots, size_t number_of_out_slots, @@ -67,178 +73,25 @@ class RegisterAllocator { bool processing_core_registers, bool log_fatal_on_failure); - static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set); - - size_t GetNumberOfSpillSlots() const { - return int_spill_slots_.size() - + long_spill_slots_.size() - + float_spill_slots_.size() - + double_spill_slots_.size() - + catch_phi_spill_slots_; - } - static constexpr const char* kRegisterAllocatorPassName = "register"; - private: - // Main methods of the allocator. - void LinearScan(); - bool TryAllocateFreeReg(LiveInterval* interval); - bool AllocateBlockedReg(LiveInterval* interval); - void Resolve(); - - // Add `interval` in the given sorted list. - static void AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval); + protected: + RegisterAllocator(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis); // Split `interval` at the position `position`. The new interval starts at `position`. - LiveInterval* Split(LiveInterval* interval, size_t position); + // If `position` is at the start of `interval`, returns `interval` with its + // register location(s) cleared. + static LiveInterval* Split(LiveInterval* interval, size_t position); // Split `interval` at a position between `from` and `to`. The method will try // to find an optimal split position. LiveInterval* SplitBetween(LiveInterval* interval, size_t from, size_t to); - // Returns whether `reg` is blocked by the code generator. - bool IsBlocked(int reg) const; - - // Update the interval for the register in `location` to cover [start, end). - void BlockRegister(Location location, size_t start, size_t end); - void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); - - // Allocate a spill slot for the given interval. 
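The new header reshapes RegisterAllocator into an abstract base class with a Create() factory keyed by a Strategy enum, so linear scan and graph coloring share one entry point. A toy standalone sketch of that base-class-plus-factory shape (the class names below are illustrative only, not the ART declarations):

#include <cstdio>
#include <memory>

// Two allocation strategies behind one abstract interface, selected by an enum.
class Allocator {
 public:
  enum class Strategy { kLinearScan, kGraphColor };
  virtual ~Allocator() = default;
  virtual void AllocateRegisters() = 0;

  static std::unique_ptr<Allocator> Create(Strategy strategy);
};

class LinearScanAllocator : public Allocator {
 public:
  void AllocateRegisters() override { std::puts("linear scan"); }
};

class GraphColorAllocator : public Allocator {
 public:
  void AllocateRegisters() override { std::puts("graph color"); }
};

std::unique_ptr<Allocator> Allocator::Create(Strategy strategy) {
  switch (strategy) {
    case Strategy::kLinearScan: return std::make_unique<LinearScanAllocator>();
    case Strategy::kGraphColor: return std::make_unique<GraphColorAllocator>();
  }
  return nullptr;
}

int main() {
  Allocator::Create(Allocator::Strategy::kGraphColor)->AllocateRegisters();
  return 0;
}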
Should be called in linear - // order of interval starting positions. - void AllocateSpillSlotFor(LiveInterval* interval); - - // Allocate a spill slot for the given catch phi. Will allocate the same slot - // for phis which share the same vreg. Must be called in reverse linear order - // of lifetime positions and ascending vreg numbers for correctness. - void AllocateSpillSlotForCatchPhi(HPhi* phi); - - // Connect adjacent siblings within blocks. - void ConnectSiblings(LiveInterval* interval); - - // Connect siblings between block entries and exits. - void ConnectSplitSiblings(LiveInterval* interval, HBasicBlock* from, HBasicBlock* to) const; - - // Helper methods to insert parallel moves in the graph. - void InsertParallelMoveAtExitOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const; - void InsertParallelMoveAtEntryOf(HBasicBlock* block, - HInstruction* instruction, - Location source, - Location destination) const; - void InsertMoveAfter(HInstruction* instruction, Location source, Location destination) const; - void AddInputMoveFor(HInstruction* input, - HInstruction* user, - Location source, - Location destination) const; - void InsertParallelMoveAt(size_t position, - HInstruction* instruction, - Location source, - Location destination) const; - - void AddMove(HParallelMove* move, - Location source, - Location destination, - HInstruction* instruction, - Primitive::Type type) const; - - // Helper methods. - void AllocateRegistersInternal(); - void ProcessInstruction(HInstruction* instruction); - bool ValidateInternal(bool log_fatal_on_failure) const; - void DumpInterval(std::ostream& stream, LiveInterval* interval) const; - void DumpAllIntervals(std::ostream& stream) const; - int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const; - int FindAvailableRegister(size_t* next_use, LiveInterval* current) const; - bool IsCallerSaveRegister(int reg) const; - - // Try splitting an active non-pair or unaligned pair interval at the given `position`. - // Returns whether it was successful at finding such an interval. - bool TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, - size_t first_register_use, - size_t* next_use); - ArenaAllocator* const allocator_; CodeGenerator* const codegen_; const SsaLivenessAnalysis& liveness_; - - // List of intervals for core registers that must be processed, ordered by start - // position. Last entry is the interval that has the lowest start position. - // This list is initially populated before doing the linear scan. - ArenaVector<LiveInterval*> unhandled_core_intervals_; - - // List of intervals for floating-point registers. Same comments as above. - ArenaVector<LiveInterval*> unhandled_fp_intervals_; - - // Currently processed list of unhandled intervals. Either `unhandled_core_intervals_` - // or `unhandled_fp_intervals_`. - ArenaVector<LiveInterval*>* unhandled_; - - // List of intervals that have been processed. - ArenaVector<LiveInterval*> handled_; - - // List of intervals that are currently active when processing a new live interval. - // That is, they have a live range that spans the start of the new interval. - ArenaVector<LiveInterval*> active_; - - // List of intervals that are currently inactive when processing a new live interval. - // That is, they have a lifetime hole that spans the start of the new interval. - ArenaVector<LiveInterval*> inactive_; - - // Fixed intervals for physical registers. 
Such intervals cover the positions - // where an instruction requires a specific register. - ArenaVector<LiveInterval*> physical_core_register_intervals_; - ArenaVector<LiveInterval*> physical_fp_register_intervals_; - - // Intervals for temporaries. Such intervals cover the positions - // where an instruction requires a temporary. - ArenaVector<LiveInterval*> temp_intervals_; - - // The spill slots allocated for live intervals. We ensure spill slots - // are typed to avoid (1) doing moves and swaps between two different kinds - // of registers, and (2) swapping between a single stack slot and a double - // stack slot. This simplifies the parallel move resolver. - ArenaVector<size_t> int_spill_slots_; - ArenaVector<size_t> long_spill_slots_; - ArenaVector<size_t> float_spill_slots_; - ArenaVector<size_t> double_spill_slots_; - - // Spill slots allocated to catch phis. This category is special-cased because - // (1) slots are allocated prior to linear scan and in reverse linear order, - // (2) equivalent phis need to share slots despite having different types. - size_t catch_phi_spill_slots_; - - // Instructions that need a safepoint. - ArenaVector<HInstruction*> safepoints_; - - // True if processing core registers. False if processing floating - // point registers. - bool processing_core_registers_; - - // Number of registers for the current register kind (core or floating point). - size_t number_of_registers_; - - // Temporary array, allocated ahead of time for simplicity. - size_t* registers_array_; - - // Blocked registers, as decided by the code generator. - bool* const blocked_core_registers_; - bool* const blocked_fp_registers_; - - // Slots reserved for out arguments. - size_t reserved_out_slots_; - - // The maximum live core registers at safepoints. - size_t maximum_number_of_live_core_registers_; - - // The maximum live FP registers at safepoints. - size_t maximum_number_of_live_fp_registers_; - - ART_FRIEND_TEST(RegisterAllocatorTest, FreeUntil); - ART_FRIEND_TEST(RegisterAllocatorTest, SpillInactive); - - DISALLOW_COPY_AND_ASSIGN(RegisterAllocator); }; } // namespace art diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc new file mode 100644 index 0000000000..a21595fe03 --- /dev/null +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -0,0 +1,2105 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocator_graph_color.h" + +#include "code_generator.h" +#include "register_allocation_resolver.h" +#include "ssa_liveness_analysis.h" +#include "thread-inl.h" + +namespace art { + +// Highest number of registers that we support for any platform. This can be used for std::bitset, +// for example, which needs to know its size at compile time. +static constexpr size_t kMaxNumRegs = 32; + +// The maximum number of graph coloring attempts before triggering a DCHECK. 
+// This is meant to catch changes to the graph coloring algorithm that undermine its forward +// progress guarantees. Forward progress for the algorithm means splitting live intervals on +// every graph coloring attempt so that eventually the interference graph will be sparse enough +// to color. The main threat to forward progress is trying to split short intervals which cannot be +// split further; this could cause infinite looping because the interference graph would never +// change. This is avoided by prioritizing short intervals before long ones, so that long +// intervals are split when coloring fails. +static constexpr size_t kMaxGraphColoringAttemptsDebug = 100; + +// We always want to avoid spilling inside loops. +static constexpr size_t kLoopSpillWeightMultiplier = 10; + +// If we avoid moves in single jump blocks, we can avoid jumps to jumps. +static constexpr size_t kSingleJumpBlockWeightMultiplier = 2; + +// We avoid moves in blocks that dominate the exit block, since these blocks will +// be executed on every path through the method. +static constexpr size_t kDominatesExitBlockWeightMultiplier = 2; + +enum class CoalesceKind { + kAdjacentSibling, // Prevents moves at interval split points. + kFixedOutputSibling, // Prevents moves from a fixed output location. + kFixedInput, // Prevents moves into a fixed input location. + kNonlinearControlFlow, // Prevents moves between blocks. + kPhi, // Prevents phi resolution moves. + kFirstInput, // Prevents a single input move. + kAnyInput, // May lead to better instruction selection / smaller encodings. +}; + +std::ostream& operator<<(std::ostream& os, const CoalesceKind& kind) { + return os << static_cast<typename std::underlying_type<CoalesceKind>::type>(kind); +} + +static size_t LoopDepthAt(HBasicBlock* block) { + HLoopInformation* loop_info = block->GetLoopInformation(); + size_t depth = 0; + while (loop_info != nullptr) { + ++depth; + loop_info = loop_info->GetPreHeader()->GetLoopInformation(); + } + return depth; +} + +// Return the runtime cost of inserting a move instruction at the specified location. +static size_t CostForMoveAt(size_t position, const SsaLivenessAnalysis& liveness) { + HBasicBlock* block = liveness.GetBlockFromPosition(position / 2); + DCHECK(block != nullptr); + size_t cost = 1; + if (block->IsSingleJump()) { + cost *= kSingleJumpBlockWeightMultiplier; + } + if (block->Dominates(block->GetGraph()->GetExitBlock())) { + cost *= kDominatesExitBlockWeightMultiplier; + } + for (size_t loop_depth = LoopDepthAt(block); loop_depth > 0; --loop_depth) { + cost *= kLoopSpillWeightMultiplier; + } + return cost; +} + +// In general, we estimate coalesce priority by whether it will definitely avoid a move, +// and by how likely it is to create an interference graph that's harder to color. +static size_t ComputeCoalescePriority(CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) { + if (kind == CoalesceKind::kAnyInput) { + // This type of coalescing can affect instruction selection, but not moves, so we + // give it the lowest priority. + return 0; + } else { + return CostForMoveAt(position, liveness); + } +} + +enum class CoalesceStage { + kWorklist, // Currently in the iterative coalescing worklist. + kActive, // Not in a worklist, but could be considered again during iterative coalescing. + kInactive, // No longer considered until last-chance coalescing. + kDefunct, // Either the two nodes interfere, or have already been coalesced. 
+}; + +std::ostream& operator<<(std::ostream& os, const CoalesceStage& stage) { + return os << static_cast<typename std::underlying_type<CoalesceStage>::type>(stage); +} + +// Represents a coalesce opportunity between two nodes. +struct CoalesceOpportunity : public ArenaObject<kArenaAllocRegisterAllocator> { + CoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) + : node_a(a), + node_b(b), + stage(CoalesceStage::kWorklist), + priority(ComputeCoalescePriority(kind, position, liveness)) {} + + // Compare two coalesce opportunities based on their priority. + // Return true if lhs has a lower priority than that of rhs. + static bool CmpPriority(const CoalesceOpportunity* lhs, + const CoalesceOpportunity* rhs) { + return lhs->priority < rhs->priority; + } + + InterferenceNode* const node_a; + InterferenceNode* const node_b; + + // The current stage of this coalesce opportunity, indicating whether it is in a worklist, + // and whether it should still be considered. + CoalesceStage stage; + + // The priority of this coalesce opportunity, based on heuristics. + const size_t priority; +}; + +enum class NodeStage { + kInitial, // Uninitialized. + kPrecolored, // Marks fixed nodes. + kSafepoint, // Marks safepoint nodes. + kPrunable, // Marks uncolored nodes in the interference graph. + kSimplifyWorklist, // Marks non-move-related nodes with degree less than the number of registers. + kFreezeWorklist, // Marks move-related nodes with degree less than the number of registers. + kSpillWorklist, // Marks nodes with degree greater or equal to the number of registers. + kPruned // Marks nodes already pruned from the interference graph. +}; + +std::ostream& operator<<(std::ostream& os, const NodeStage& stage) { + return os << static_cast<typename std::underlying_type<NodeStage>::type>(stage); +} + +// Returns the estimated cost of spilling a particular live interval. +static float ComputeSpillWeight(LiveInterval* interval, const SsaLivenessAnalysis& liveness) { + if (interval->HasRegister()) { + // Intervals with a fixed register cannot be spilled. + return std::numeric_limits<float>::min(); + } + + size_t length = interval->GetLength(); + if (length == 1) { + // Tiny intervals should have maximum priority, since they cannot be split any further. + return std::numeric_limits<float>::max(); + } + + size_t use_weight = 0; + if (interval->GetDefinedBy() != nullptr && interval->DefinitionRequiresRegister()) { + // Cost for spilling at a register definition point. + use_weight += CostForMoveAt(interval->GetStart() + 1, liveness); + } + + UsePosition* use = interval->GetFirstUse(); + while (use != nullptr && use->GetPosition() <= interval->GetStart()) { + // Skip uses before the start of this live interval. + use = use->GetNext(); + } + + while (use != nullptr && use->GetPosition() <= interval->GetEnd()) { + if (use->GetUser() != nullptr && use->RequiresRegister()) { + // Cost for spilling at a register use point. + use_weight += CostForMoveAt(use->GetUser()->GetLifetimePosition() - 1, liveness); + } + use = use->GetNext(); + } + + // We divide by the length of the interval because we want to prioritize + // short intervals; we do not benefit much if we split them further. + return static_cast<float>(use_weight) / static_cast<float>(length); +} + +// Interference nodes make up the interference graph, which is the primary data structure in +// graph coloring register allocation. 
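ComputeSpillWeight above is essentially a use-density metric: sum the loop-depth-scaled cost of every register use in the interval, then divide by the interval's length, so that short, use-dense intervals are prioritized for registers while long sparse ones become the preferred spill candidates. A standalone numerical sketch of that formula, with simplified costs rather than the exact ART heuristics:

#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

struct UseSite {
  size_t loop_depth;          // Nesting depth of the block containing the use.
  bool requires_register;
};

constexpr size_t kLoopSpillWeightMultiplier = 10;

// Cost of a move at a given loop depth: multiplied by 10 per loop level.
size_t MoveCost(size_t loop_depth) {
  size_t cost = 1;
  for (size_t d = 0; d < loop_depth; ++d) cost *= kLoopSpillWeightMultiplier;
  return cost;
}

// Spill weight ~ (sum of costs of register uses) / (interval length).
float SpillWeight(const std::vector<UseSite>& uses, size_t length, bool has_fixed_register) {
  if (has_fixed_register) return std::numeric_limits<float>::min();  // Never spilled.
  if (length == 1) return std::numeric_limits<float>::max();         // Cannot be split further.
  size_t use_weight = 0;
  for (const UseSite& use : uses) {
    if (use.requires_register) use_weight += MoveCost(use.loop_depth);
  }
  return static_cast<float>(use_weight) / static_cast<float>(length);
}

int main() {
  // Two register uses inside a doubly nested loop, over a short interval...
  std::printf("dense:  %.2f\n", SpillWeight({{2, true}, {2, true}}, /*length=*/8, false));
  // ...versus one shallow use spread over a long interval.
  std::printf("sparse: %.2f\n", SpillWeight({{0, true}}, /*length=*/200, false));
  return 0;
}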
Each node represents a single live interval, and contains +// a set of adjacent nodes corresponding to intervals overlapping with its own. To save memory, +// pre-colored nodes never contain outgoing edges (only incoming ones). +// +// As nodes are pruned from the interference graph, incoming edges of the pruned node are removed, +// but outgoing edges remain in order to later color the node based on the colors of its neighbors. +// +// Note that a pair interval is represented by a single node in the interference graph, which +// essentially requires two colors. One consequence of this is that the degree of a node is not +// necessarily equal to the number of adjacent nodes--instead, the degree reflects the maximum +// number of colors with which a node could interfere. We model this by giving edges different +// weights (1 or 2) to control how much it increases the degree of adjacent nodes. +// For example, the edge between two single nodes will have weight 1. On the other hand, +// the edge between a single node and a pair node will have weight 2. This is because the pair +// node could block up to two colors for the single node, and because the single node could +// block an entire two-register aligned slot for the pair node. +// The degree is defined this way because we use it to decide whether a node is guaranteed a color, +// and thus whether it is safe to prune it from the interference graph early on. +class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> { + public: + InterferenceNode(ArenaAllocator* allocator, + LiveInterval* interval, + const SsaLivenessAnalysis& liveness) + : stage(NodeStage::kInitial), + interval_(interval), + adjacent_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + coalesce_opportunities_(allocator->Adapter(kArenaAllocRegisterAllocator)), + out_degree_(interval->HasRegister() ? 
std::numeric_limits<size_t>::max() : 0), + alias_(this), + spill_weight_(ComputeSpillWeight(interval, liveness)), + requires_color_(interval->RequiresRegister()), + needs_spill_slot_(false) { + DCHECK(!interval->IsHighInterval()) << "Pair nodes should be represented by the low interval"; + } + + void AddInterference(InterferenceNode* other, bool guaranteed_not_interfering_yet) { + DCHECK(!IsPrecolored()) << "To save memory, fixed nodes should not have outgoing interferences"; + DCHECK_NE(this, other) << "Should not create self loops in the interference graph"; + DCHECK_EQ(this, alias_) << "Should not add interferences to a node that aliases another"; + DCHECK_NE(stage, NodeStage::kPruned); + DCHECK_NE(other->stage, NodeStage::kPruned); + if (guaranteed_not_interfering_yet) { + DCHECK(std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other) + == adjacent_nodes_.end()); + adjacent_nodes_.push_back(other); + out_degree_ += EdgeWeightWith(other); + } else { + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it == adjacent_nodes_.end()) { + adjacent_nodes_.push_back(other); + out_degree_ += EdgeWeightWith(other); + } + } + } + + void RemoveInterference(InterferenceNode* other) { + DCHECK_EQ(this, alias_) << "Should not remove interferences from a coalesced node"; + DCHECK_EQ(other->stage, NodeStage::kPruned) << "Should only remove interferences when pruning"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it != adjacent_nodes_.end()) { + adjacent_nodes_.erase(it); + out_degree_ -= EdgeWeightWith(other); + } + } + + bool ContainsInterference(InterferenceNode* other) const { + DCHECK(!IsPrecolored()) << "Should not query fixed nodes for interferences"; + DCHECK_EQ(this, alias_) << "Should not query a coalesced node for interferences"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + return it != adjacent_nodes_.end(); + } + + LiveInterval* GetInterval() const { + return interval_; + } + + const ArenaVector<InterferenceNode*>& GetAdjacentNodes() const { + return adjacent_nodes_; + } + + size_t GetOutDegree() const { + // Pre-colored nodes have infinite degree. + DCHECK(!IsPrecolored() || out_degree_ == std::numeric_limits<size_t>::max()); + return out_degree_; + } + + void AddCoalesceOpportunity(CoalesceOpportunity* opportunity) { + coalesce_opportunities_.push_back(opportunity); + } + + void ClearCoalesceOpportunities() { + coalesce_opportunities_.clear(); + } + + bool IsMoveRelated() const { + for (CoalesceOpportunity* opportunity : coalesce_opportunities_) { + if (opportunity->stage == CoalesceStage::kWorklist || + opportunity->stage == CoalesceStage::kActive) { + return true; + } + } + return false; + } + + // Return whether this node already has a color. + // Used to find fixed nodes in the interference graph before coloring. + bool IsPrecolored() const { + return interval_->HasRegister(); + } + + bool IsPair() const { + return interval_->HasHighInterval(); + } + + void SetAlias(InterferenceNode* rep) { + DCHECK_NE(rep->stage, NodeStage::kPruned); + DCHECK_EQ(this, alias_) << "Should only set a node's alias once"; + alias_ = rep; + } + + InterferenceNode* GetAlias() { + if (alias_ != this) { + // Recurse in order to flatten tree of alias pointers. 
+ alias_ = alias_->GetAlias(); + } + return alias_; + } + + const ArenaVector<CoalesceOpportunity*>& GetCoalesceOpportunities() const { + return coalesce_opportunities_; + } + + float GetSpillWeight() const { + return spill_weight_; + } + + bool RequiresColor() const { + return requires_color_; + } + + // We give extra weight to edges adjacent to pair nodes. See the general comment on the + // interference graph above. + size_t EdgeWeightWith(const InterferenceNode* other) const { + return (IsPair() || other->IsPair()) ? 2 : 1; + } + + bool NeedsSpillSlot() const { + return needs_spill_slot_; + } + + void SetNeedsSpillSlot() { + needs_spill_slot_ = true; + } + + // The current stage of this node, indicating which worklist it belongs to. + NodeStage stage; + + private: + // The live interval that this node represents. + LiveInterval* const interval_; + + // All nodes interfering with this one. + // We use an unsorted vector as a set, since a tree or hash set is too heavy for the + // set sizes that we encounter. Using a vector leads to much better performance. + ArenaVector<InterferenceNode*> adjacent_nodes_; + + // Interference nodes that this node should be coalesced with to reduce moves. + ArenaVector<CoalesceOpportunity*> coalesce_opportunities_; + + // The maximum number of colors with which this node could interfere. This could be more than + // the number of adjacent nodes if this is a pair node, or if some adjacent nodes are pair nodes. + // We use "out" degree because incoming edges come from nodes already pruned from the graph, + // and do not affect the coloring of this node. + // Pre-colored nodes are treated as having infinite degree. + size_t out_degree_; + + // The node representing this node in the interference graph. + // Initially set to `this`, and only changed if this node is coalesced into another. + InterferenceNode* alias_; + + // The cost of splitting and spilling this interval to the stack. + // Nodes with a higher spill weight should be prioritized when assigning registers. + // This is essentially based on use density and location; short intervals with many uses inside + // deeply nested loops have a high spill weight. + const float spill_weight_; + + const bool requires_color_; + + bool needs_spill_slot_; + + DISALLOW_COPY_AND_ASSIGN(InterferenceNode); +}; + +// The order in which we color nodes is important. To guarantee forward progress, +// we prioritize intervals that require registers, and after that we prioritize +// short intervals. That way, if we fail to color a node, it either won't require a +// register, or it will be a long interval that can be split in order to make the +// interference graph sparser. +// To improve code quality, we prioritize intervals used frequently in deeply nested loops. +// (This metric is secondary to the forward progress requirements above.) +// TODO: May also want to consider: +// - Constants (since they can be rematerialized) +// - Allocated spill slots +static bool HasGreaterNodePriority(const InterferenceNode* lhs, + const InterferenceNode* rhs) { + // (1) Prioritize the node that requires a color. + if (lhs->RequiresColor() != rhs->RequiresColor()) { + return lhs->RequiresColor(); + } + + // (2) Prioritize the interval that has a higher spill weight. + return lhs->GetSpillWeight() > rhs->GetSpillWeight(); +} + +// A ColoringIteration holds the many data structures needed for a single graph coloring attempt, +// and provides methods for each phase of the attempt. 
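HasGreaterNodePriority above encodes a two-level ordering: nodes that must have a register outrank those that merely prefer one, and ties are broken by spill weight so that hot intervals are colored first. A small standalone sketch of that comparator applied to toy candidates (not the ART types):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Candidate {
  const char* name;
  bool requires_color;
  float spill_weight;
};

// (1) Nodes that require a color first; (2) then higher spill weight first.
bool HasGreaterPriority(const Candidate& lhs, const Candidate& rhs) {
  if (lhs.requires_color != rhs.requires_color) {
    return lhs.requires_color;
  }
  return lhs.spill_weight > rhs.spill_weight;
}

int main() {
  std::vector<Candidate> nodes = {
      {"cold, optional", false, 0.5f},
      {"hot, optional", false, 9.0f},
      {"cold, required", true, 0.1f},
  };
  std::sort(nodes.begin(), nodes.end(), HasGreaterPriority);
  for (const Candidate& node : nodes) {
    std::printf("%s\n", node.name);  // "cold, required" sorts ahead of both optional nodes.
  }
  return 0;
}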
+class ColoringIteration {
+ public:
+  ColoringIteration(RegisterAllocatorGraphColor* register_allocator,
+                    ArenaAllocator* allocator,
+                    bool processing_core_regs,
+                    size_t num_regs)
+        : register_allocator_(register_allocator),
+          allocator_(allocator),
+          processing_core_regs_(processing_core_regs),
+          num_regs_(num_regs),
+          interval_node_map_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          prunable_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          pruned_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          simplify_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          freeze_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)),
+          spill_worklist_(HasGreaterNodePriority, allocator->Adapter(kArenaAllocRegisterAllocator)),
+          coalesce_worklist_(CoalesceOpportunity::CmpPriority,
+                             allocator->Adapter(kArenaAllocRegisterAllocator)) {}
+
+  // Use the intervals collected from instructions to construct an
+  // interference graph mapping intervals to adjacency lists.
+  // Also, collect synthesized safepoint nodes, used to keep
+  // track of live intervals across safepoints.
+  // TODO: Should build safepoints elsewhere.
+  void BuildInterferenceGraph(const ArenaVector<LiveInterval*>& intervals,
+                              const ArenaVector<InterferenceNode*>& physical_nodes,
+                              ArenaVector<InterferenceNode*>* safepoints);
+
+  // Add coalesce opportunities to interference nodes.
+  void FindCoalesceOpportunities();
+
+  // Prune nodes from the interference graph to be colored later. Build
+  // a stack (pruned_nodes) containing these intervals in an order determined
+  // by various heuristics.
+  void PruneInterferenceGraph();
+
+  // Process pruned_nodes_ to color the interference graph, spilling when
+  // necessary. Returns true if successful. Else, some intervals have been
+  // split, and the interference graph should be rebuilt for another attempt.
+  bool ColorInterferenceGraph();
+
+  // Return prunable nodes.
+  // The register allocator will need to access prunable nodes after coloring
+  // in order to tell the code generator which registers have been assigned.
+  const ArenaVector<InterferenceNode*>& GetPrunableNodes() const {
+    return prunable_nodes_;
+  }
+
+ private:
+  // Create a coalesce opportunity between two nodes.
+  void CreateCoalesceOpportunity(InterferenceNode* a,
+                                 InterferenceNode* b,
+                                 CoalesceKind kind,
+                                 size_t position);
+
+  // Add an edge in the interference graph, if valid.
+  // Note that `guaranteed_not_interfering_yet` is used to optimize adjacency set insertion
+  // when possible.
+  void AddPotentialInterference(InterferenceNode* from,
+                                InterferenceNode* to,
+                                bool guaranteed_not_interfering_yet,
+                                bool both_directions = true);
+
+  // Invalidate all coalesce opportunities this node has, so that it (and possibly its neighbors)
+  // may be pruned from the interference graph.
+  void FreezeMoves(InterferenceNode* node);
+
+  // Prune a node from the interference graph, updating worklists if necessary.
+  void PruneNode(InterferenceNode* node);
+
+  // Add coalesce opportunities associated with this node to the coalesce worklist.
+  void EnableCoalesceOpportunities(InterferenceNode* node);
+
+  // If needed, move `node` from the freeze worklist to the simplify worklist.
+  void CheckTransitionFromFreezeWorklist(InterferenceNode* node);
+
+  // Return true if `into` is colored, and `from` can be coalesced with `into` conservatively.
+  bool PrecoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  // Return true if `from` and `into` are uncolored, and can be coalesced conservatively.
+  bool UncoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  void Coalesce(CoalesceOpportunity* opportunity);
+
+  // Merge `from` into `into` in the interference graph.
+  void Combine(InterferenceNode* from, InterferenceNode* into);
+
+  // A reference to the register allocator instance,
+  // needed to split intervals and assign spill slots.
+  RegisterAllocatorGraphColor* register_allocator_;
+
+  // An arena allocator used for a single graph coloring attempt.
+  ArenaAllocator* allocator_;
+
+  const bool processing_core_regs_;
+
+  const size_t num_regs_;
+
+  // A map from live intervals to interference nodes.
+  ArenaHashMap<LiveInterval*, InterferenceNode*> interval_node_map_;
+
+  // Uncolored nodes that should be pruned from the interference graph.
+  ArenaVector<InterferenceNode*> prunable_nodes_;
+
+  // A stack of nodes pruned from the interference graph, waiting to be colored.
+  ArenaStdStack<InterferenceNode*> pruned_nodes_;
+
+  // A queue containing low degree, non-move-related nodes that can be pruned immediately.
+  ArenaDeque<InterferenceNode*> simplify_worklist_;
+
+  // A queue containing low degree, move-related nodes.
+  ArenaDeque<InterferenceNode*> freeze_worklist_;
+
+  // A queue containing high degree nodes.
+  // If we have to prune from the spill worklist, we cannot guarantee
+  // the pruned node a color, so we order the worklist by priority.
+  ArenaPriorityQueue<InterferenceNode*, decltype(&HasGreaterNodePriority)> spill_worklist_;
+
+  // A queue containing coalesce opportunities.
+  // We order the coalesce worklist by priority, since some coalesce opportunities (e.g., those
+  // inside of loops) are more important than others.
+ ArenaPriorityQueue<CoalesceOpportunity*, + decltype(&CoalesceOpportunity::CmpPriority)> coalesce_worklist_; + + DISALLOW_COPY_AND_ASSIGN(ColoringIteration); +}; + +static bool IsCoreInterval(LiveInterval* interval) { + return !Primitive::IsFloatingPointType(interval->GetType()); +} + +static size_t ComputeReservedArtMethodSlots(const CodeGenerator& codegen) { + return static_cast<size_t>(InstructionSetPointerSize(codegen.GetInstructionSet())) / kVRegSize; +} + +RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness, + bool iterative_move_coalescing) + : RegisterAllocator(allocator, codegen, liveness), + iterative_move_coalescing_(iterative_move_coalescing), + core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_core_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_fp_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + num_int_spill_slots_(0), + num_double_spill_slots_(0), + num_float_spill_slots_(0), + num_long_spill_slots_(0), + catch_phi_spill_slot_counter_(0), + reserved_art_method_slots_(ComputeReservedArtMethodSlots(*codegen)), + reserved_out_slots_(codegen->GetGraph()->GetMaximumNumberOfOutVRegs()), + number_of_globally_blocked_core_regs_(0), + number_of_globally_blocked_fp_regs_(0), + max_safepoint_live_core_regs_(0), + max_safepoint_live_fp_regs_(0) { + // Before we ask for blocked registers, set them up in the code generator. + codegen->SetupBlockedRegisters(); + + // Initialize physical core register live intervals and blocked registers. + // This includes globally blocked registers, such as the stack pointer. + physical_core_nodes_.resize(codegen_->GetNumberOfCoreRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimInt); + physical_core_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_core_nodes_[i]->stage = NodeStage::kPrecolored; + core_intervals_.push_back(interval); + if (codegen_->IsBlockedCoreRegister(i)) { + ++number_of_globally_blocked_core_regs_; + interval->AddRange(0, liveness.GetMaxLifetimePosition()); + } + } + // Initialize physical floating point register live intervals and blocked registers. + physical_fp_nodes_.resize(codegen_->GetNumberOfFloatingPointRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimFloat); + physical_fp_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_fp_nodes_[i]->stage = NodeStage::kPrecolored; + fp_intervals_.push_back(interval); + if (codegen_->IsBlockedFloatingPointRegister(i)) { + ++number_of_globally_blocked_fp_regs_; + interval->AddRange(0, liveness.GetMaxLifetimePosition()); + } + } +} + +void RegisterAllocatorGraphColor::AllocateRegisters() { + // (1) Collect and prepare live intervals. + ProcessInstructions(); + + for (bool processing_core_regs : {true, false}) { + ArenaVector<LiveInterval*>& intervals = processing_core_regs + ? core_intervals_ + : fp_intervals_; + size_t num_registers = processing_core_regs + ? 
codegen_->GetNumberOfCoreRegisters() + : codegen_->GetNumberOfFloatingPointRegisters(); + + size_t attempt = 0; + while (true) { + ++attempt; + DCHECK(attempt <= kMaxGraphColoringAttemptsDebug) + << "Exceeded debug max graph coloring register allocation attempts. " + << "This could indicate that the register allocator is not making forward progress, " + << "which could be caused by prioritizing the wrong live intervals. (Short intervals " + << "should be prioritized over long ones, because they cannot be split further.)"; + + // Many data structures are cleared between graph coloring attempts, so we reduce + // total memory usage by using a new arena allocator for each attempt. + ArenaAllocator coloring_attempt_allocator(allocator_->GetArenaPool()); + ColoringIteration iteration(this, + &coloring_attempt_allocator, + processing_core_regs, + num_registers); + + // (2) Build the interference graph. Also gather safepoints. + ArenaVector<InterferenceNode*> safepoints( + coloring_attempt_allocator.Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + iteration.BuildInterferenceGraph(intervals, physical_nodes, &safepoints); + + // (3) Add coalesce opportunities. + // If we have tried coloring the graph a suspiciously high number of times, give + // up on move coalescing, just in case the coalescing heuristics are not conservative. + // (This situation will be caught if DCHECKs are turned on.) + if (iterative_move_coalescing_ && attempt <= kMaxGraphColoringAttemptsDebug) { + iteration.FindCoalesceOpportunities(); + } + + // (4) Prune all uncolored nodes from interference graph. + iteration.PruneInterferenceGraph(); + + // (5) Color pruned nodes based on interferences. + bool successful = iteration.ColorInterferenceGraph(); + + // We manually clear coalesce opportunities for physical nodes, + // since they persist across coloring attempts. + for (InterferenceNode* node : physical_core_nodes_) { + node->ClearCoalesceOpportunities(); + } + for (InterferenceNode* node : physical_fp_nodes_) { + node->ClearCoalesceOpportunities(); + } + + if (successful) { + // Assign spill slots. + AllocateSpillSlots(iteration.GetPrunableNodes()); + + // Compute the maximum number of live registers across safepoints. + // Notice that we do not count globally blocked registers, such as the stack pointer. + if (safepoints.size() > 0) { + size_t max_safepoint_live_regs = ComputeMaxSafepointLiveRegisters(safepoints); + if (processing_core_regs) { + max_safepoint_live_core_regs_ = + max_safepoint_live_regs - number_of_globally_blocked_core_regs_; + } else { + max_safepoint_live_fp_regs_= + max_safepoint_live_regs - number_of_globally_blocked_fp_regs_; + } + } + + // Tell the code generator which registers were allocated. + // We only look at prunable_nodes because we already told the code generator about + // fixed intervals while processing instructions. We also ignore the fixed intervals + // placed at the top of catch blocks. + for (InterferenceNode* node : iteration.GetPrunableNodes()) { + LiveInterval* interval = node->GetInterval(); + if (interval->HasRegister()) { + Location low_reg = processing_core_regs + ? 
Location::RegisterLocation(interval->GetRegister()) + : Location::FpuRegisterLocation(interval->GetRegister()); + codegen_->AddAllocatedRegister(low_reg); + if (interval->HasHighInterval()) { + LiveInterval* high = interval->GetHighInterval(); + DCHECK(high->HasRegister()); + Location high_reg = processing_core_regs + ? Location::RegisterLocation(high->GetRegister()) + : Location::FpuRegisterLocation(high->GetRegister()); + codegen_->AddAllocatedRegister(high_reg); + } + } else { + DCHECK(!interval->HasHighInterval() || !interval->GetHighInterval()->HasRegister()); + } + } + + break; + } + } // while unsuccessful + } // for processing_core_instructions + + // (6) Resolve locations and deconstruct SSA form. + RegisterAllocationResolver(allocator_, codegen_, liveness_) + .Resolve(max_safepoint_live_core_regs_, + max_safepoint_live_fp_regs_, + reserved_art_method_slots_ + reserved_out_slots_, + num_int_spill_slots_, + num_long_spill_slots_, + num_float_spill_slots_, + num_double_spill_slots_, + catch_phi_spill_slot_counter_, + temp_intervals_); + + if (kIsDebugBuild) { + Validate(/*log_fatal_on_failure*/ true); + } +} + +bool RegisterAllocatorGraphColor::Validate(bool log_fatal_on_failure) { + for (bool processing_core_regs : {true, false}) { + ArenaVector<LiveInterval*> intervals( + allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); + for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + LiveInterval* interval = instruction->GetLiveInterval(); + if (interval != nullptr && IsCoreInterval(interval) == processing_core_regs) { + intervals.push_back(instruction->GetLiveInterval()); + } + } + + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + for (InterferenceNode* fixed : physical_nodes) { + LiveInterval* interval = fixed->GetInterval(); + if (interval->GetFirstRange() != nullptr) { + // Ideally we would check fixed ranges as well, but currently there are times when + // two fixed intervals for the same register will overlap. For example, a fixed input + // and a fixed output may sometimes share the same register, in which there will be two + // fixed intervals for the same place. + } + } + + for (LiveInterval* temp : temp_intervals_) { + if (IsCoreInterval(temp) == processing_core_regs) { + intervals.push_back(temp); + } + } + + size_t spill_slots = num_int_spill_slots_ + + num_long_spill_slots_ + + num_float_spill_slots_ + + num_double_spill_slots_ + + catch_phi_spill_slot_counter_; + bool ok = ValidateIntervals(intervals, + spill_slots, + reserved_art_method_slots_ + reserved_out_slots_, + *codegen_, + allocator_, + processing_core_regs, + log_fatal_on_failure); + if (!ok) { + return false; + } + } // for processing_core_regs + + return true; +} + +void RegisterAllocatorGraphColor::ProcessInstructions() { + for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + + // Note that we currently depend on this ordering, since some helper + // code is designed for linear scan register allocation. 
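AllocateRegisters above drives graph coloring as a retry loop: build the interference graph, optionally coalesce, prune, and color; on failure, intervals have been split, so the graph is rebuilt, and an attempt counter guards forward progress. A toy standalone skeleton of that loop, where ColorOnce() is a made-up stand-in for one ColoringIteration:

#include <cassert>
#include <cstddef>
#include <cstdio>

constexpr size_t kMaxAttemptsDebug = 100;

// Stand-in for one coloring attempt; assumed to split some live intervals
// whenever it fails, which is what guarantees forward progress.
bool ColorOnce(size_t attempt) {
  // Pretend the interference graph only becomes sparse enough on attempt 3.
  return attempt >= 3;
}

void AllocateWithRetries() {
  size_t attempt = 0;
  while (true) {
    ++attempt;
    assert(attempt <= kMaxAttemptsDebug && "allocator is not making forward progress");
    // In the real allocator a fresh arena is created here so every attempt
    // starts from clean per-iteration data structures.
    if (ColorOnce(attempt)) {
      std::printf("colored after %zu attempt(s)\n", attempt);
      return;
    }
    // Failure path: intervals were split; rebuild the graph and try again.
  }
}

int main() {
  AllocateWithRetries();
  return 0;
}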
+ for (HBackwardInstructionIterator instr_it(block->GetInstructions()); + !instr_it.Done(); + instr_it.Advance()) { + ProcessInstruction(instr_it.Current()); + } + + for (HInstructionIterator phi_it(block->GetPhis()); !phi_it.Done(); phi_it.Advance()) { + ProcessInstruction(phi_it.Current()); + } + + if (block->IsCatchBlock() + || (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // By blocking all registers at the top of each catch block or irreducible loop, we force + // intervals belonging to the live-in set of the catch/header block to be spilled. + // TODO(ngeoffray): Phis in this block could be allocated in register. + size_t position = block->GetLifetimeStart(); + BlockRegisters(position, position + 1); + } + } +} + +void RegisterAllocatorGraphColor::ProcessInstruction(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + if (locations == nullptr) { + return; + } + if (locations->NeedsSafepoint() && codegen_->IsLeafMethod()) { + // We do this here because we do not want the suspend check to artificially + // create live registers. + DCHECK(instruction->IsSuspendCheckEntry()); + DCHECK_EQ(locations->GetTempCount(), 0u); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + + CheckForTempLiveIntervals(instruction); + CheckForSafepoint(instruction); + if (instruction->GetLocations()->WillCall()) { + // If a call will happen, create fixed intervals for caller-save registers. + // TODO: Note that it may be beneficial to later split intervals at this point, + // so that we allow last-minute moves from a caller-save register + // to a callee-save register. + BlockRegisters(instruction->GetLifetimePosition(), + instruction->GetLifetimePosition() + 1, + /*caller_save_only*/ true); + } + CheckForFixedInputs(instruction); + + LiveInterval* interval = instruction->GetLiveInterval(); + if (interval == nullptr) { + // Instructions lacking a valid output location do not have a live interval. + DCHECK(!locations->Out().IsValid()); + return; + } + + // Low intervals act as representatives for their corresponding high interval. + DCHECK(!interval->IsHighInterval()); + if (codegen_->NeedsTwoRegisters(interval->GetType())) { + interval->AddHighInterval(); + } + AddSafepointsFor(instruction); + CheckForFixedOutput(instruction); + AllocateSpillSlotForCatchPhi(instruction); + + ArenaVector<LiveInterval*>& intervals = IsCoreInterval(interval) + ? core_intervals_ + : fp_intervals_; + if (interval->HasSpillSlot() || instruction->IsConstant()) { + // Note that if an interval already has a spill slot, then its value currently resides + // in the stack (e.g., parameters). Thus we do not have to allocate a register until its first + // register use. This is also true for constants, which can be materialized at any point. + size_t first_register_use = interval->FirstRegisterUse(); + if (first_register_use != kNoLifetime) { + LiveInterval* split = SplitBetween(interval, interval->GetStart(), first_register_use - 1); + intervals.push_back(split); + } else { + // We won't allocate a register for this value. + } + } else { + intervals.push_back(interval); + } +} + +void RegisterAllocatorGraphColor::CheckForFixedInputs(HInstruction* instruction) { + // We simply block physical registers where necessary. 
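As the comment above notes, fixed inputs are handled by blocking the required physical register for just the one position where it is needed, rather than coalescing with it. A standalone sketch of that blocked-range bookkeeping (a toy table, not the ART BlockRegister implementation):

#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// For each physical register, remember the lifetime ranges during which it is
// unavailable because some instruction requires it as a fixed input.
class BlockedRegisterTable {
 public:
  void BlockRegister(int reg, size_t start, size_t end) {
    blocked_[reg].emplace_back(start, end);
  }

  bool IsBlockedAt(int reg, size_t position) const {
    auto it = blocked_.find(reg);
    if (it == blocked_.end()) return false;
    for (const auto& range : it->second) {
      if (range.first <= position && position < range.second) return true;
    }
    return false;
  }

 private:
  std::map<int, std::vector<std::pair<size_t, size_t>>> blocked_;
};

int main() {
  BlockedRegisterTable table;
  // An instruction at lifetime position 14 requires register 2 as a fixed input:
  // block it for exactly [position, position + 1), as in the code above.
  size_t position = 14;
  table.BlockRegister(/*reg=*/2, position, position + 1);
  std::printf("r2 blocked at 14: %d, at 16: %d\n",
              table.IsBlockedAt(2, 14), table.IsBlockedAt(2, 16));
  return 0;
}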
+ // TODO: Ideally we would coalesce the physical register with the register + // allocated to the input value, but this can be tricky if, e.g., there + // could be multiple physical register uses of the same value at the + // same instruction. Furthermore, there's currently no distinction between + // fixed inputs to a call (which will be clobbered) and other fixed inputs (which + // may not be clobbered). + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + for (size_t i = 0; i < locations->GetInputCount(); ++i) { + Location input = locations->InAt(i); + if (input.IsRegister() || input.IsFpuRegister()) { + BlockRegister(input, position, position + 1); + codegen_->AddAllocatedRegister(input); + } else if (input.IsPair()) { + BlockRegister(input.ToLow(), position, position + 1); + BlockRegister(input.ToHigh(), position, position + 1); + codegen_->AddAllocatedRegister(input.ToLow()); + codegen_->AddAllocatedRegister(input.ToHigh()); + } + } +} + +void RegisterAllocatorGraphColor::CheckForFixedOutput(HInstruction* instruction) { + // If an instruction has a fixed output location, we give the live interval a register and then + // proactively split it just after the definition point to avoid creating too many interferences + // with a fixed node. + LiveInterval* interval = instruction->GetLiveInterval(); + Location out = interval->GetDefinedBy()->GetLocations()->Out(); + size_t position = instruction->GetLifetimePosition(); + DCHECK_GE(interval->GetEnd() - position, 2u); + + if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) { + out = instruction->GetLocations()->InAt(0); + } + + if (out.IsRegister() || out.IsFpuRegister()) { + interval->SetRegister(out.reg()); + codegen_->AddAllocatedRegister(out); + Split(interval, position + 1); + } else if (out.IsPair()) { + interval->SetRegister(out.low()); + interval->GetHighInterval()->SetRegister(out.high()); + codegen_->AddAllocatedRegister(out.ToLow()); + codegen_->AddAllocatedRegister(out.ToHigh()); + Split(interval, position + 1); + } else if (out.IsStackSlot() || out.IsDoubleStackSlot()) { + interval->SetSpillSlot(out.GetStackIndex()); + } else { + DCHECK(out.IsUnallocated() || out.IsConstant()); + } +} + +void RegisterAllocatorGraphColor::AddSafepointsFor(HInstruction* instruction) { + LiveInterval* interval = instruction->GetLiveInterval(); + for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { + HInstruction* safepoint = safepoints_[safepoint_index - 1u]; + size_t safepoint_position = safepoint->GetLifetimePosition(); + + // Test that safepoints_ are ordered in the optimal way. + DCHECK(safepoint_index == safepoints_.size() || + safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); + + if (safepoint_position == interval->GetStart()) { + // The safepoint is for this instruction, so the location of the instruction + // does not need to be saved. + DCHECK_EQ(safepoint_index, safepoints_.size()); + DCHECK_EQ(safepoint, instruction); + continue; + } else if (interval->IsDeadAt(safepoint_position)) { + break; + } else if (!interval->Covers(safepoint_position)) { + // Hole in the interval. 
+ continue; + } + interval->AddSafepoint(safepoint); + } +} + +void RegisterAllocatorGraphColor::CheckForTempLiveIntervals(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + for (size_t i = 0; i < locations->GetTempCount(); ++i) { + Location temp = locations->GetTemp(i); + if (temp.IsRegister() || temp.IsFpuRegister()) { + BlockRegister(temp, position, position + 1); + codegen_->AddAllocatedRegister(temp); + } else { + DCHECK(temp.IsUnallocated()); + switch (temp.GetPolicy()) { + case Location::kRequiresRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); + interval->AddTempUse(instruction, i); + core_intervals_.push_back(interval); + temp_intervals_.push_back(interval); + break; + } + + case Location::kRequiresFpuRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); + interval->AddTempUse(instruction, i); + fp_intervals_.push_back(interval); + temp_intervals_.push_back(interval); + if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { + interval->AddHighInterval(/*is_temp*/ true); + temp_intervals_.push_back(interval->GetHighInterval()); + } + break; + } + + default: + LOG(FATAL) << "Unexpected policy for temporary location " + << temp.GetPolicy(); + } + } + } +} + +void RegisterAllocatorGraphColor::CheckForSafepoint(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + + if (locations->NeedsSafepoint()) { + safepoints_.push_back(instruction); + if (locations->OnlyCallsOnSlowPath()) { + // We add a synthesized range at this position to record the live registers + // at this position. Ideally, we could just update the safepoints when locations + // are updated, but we currently need to know the full stack size before updating + // locations (because of parameters and the fact that we don't have a frame pointer). + // And knowing the full stack size requires to know the maximum number of live + // registers at calls in slow paths. + // By adding the following interval in the algorithm, we can compute this + // maximum before updating locations. + LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); + interval->AddRange(position, position + 1); + core_intervals_.push_back(interval); + fp_intervals_.push_back(interval); + } + } +} + +LiveInterval* RegisterAllocatorGraphColor::TrySplit(LiveInterval* interval, size_t position) { + if (interval->GetStart() < position && position < interval->GetEnd()) { + return Split(interval, position); + } else { + return interval; + } +} + +void RegisterAllocatorGraphColor::SplitAtRegisterUses(LiveInterval* interval) { + DCHECK(!interval->IsHighInterval()); + + // Split just after a register definition. + if (interval->IsParent() && interval->DefinitionRequiresRegister()) { + interval = TrySplit(interval, interval->GetStart() + 1); + } + + UsePosition* use = interval->GetFirstUse(); + while (use != nullptr && use->GetPosition() < interval->GetStart()) { + use = use->GetNext(); + } + + // Split around register uses. 
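The loop that follows carves the interval up around its register uses, so that only short slices carry register requirements and everything in between can live on the stack. A simplified standalone sketch of that carving, over a plain list of use positions instead of ART's UsePosition chain (ART splits just before each use and special-cases uses at block boundaries; here each use simply gets its own unit-length piece):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // A piece of a carved-up interval; each piece is [start, end).
    struct Piece {
      size_t start;
      size_t end;
    };

    // Splits [start, end) so that each register use sits in its own short piece.
    std::vector<Piece> SplitAroundRegisterUses(size_t start,
                                               size_t end,
                                               std::vector<size_t> register_uses) {
      std::sort(register_uses.begin(), register_uses.end());
      std::vector<Piece> pieces;
      size_t current = start;
      for (size_t use : register_uses) {
        if (use < current || use >= end) {
          continue;  // Already covered by an earlier piece, or outside the range.
        }
        if (use > current) {
          pieces.push_back({current, use});  // Stretch with no register needs.
        }
        pieces.push_back({use, use + 1});    // Short piece covering the register use.
        current = use + 1;
      }
      if (current < end) {
        pieces.push_back({current, end});
      }
      return pieces;
    }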
+ size_t end = interval->GetEnd(); + while (use != nullptr && use->GetPosition() <= end) { + if (use->RequiresRegister()) { + size_t position = use->GetPosition(); + interval = TrySplit(interval, position - 1); + if (liveness_.GetInstructionFromPosition(position / 2)->IsControlFlow()) { + // If we are at the very end of a basic block, we cannot split right + // at the use. Split just after instead. + interval = TrySplit(interval, position + 1); + } else { + interval = TrySplit(interval, position); + } + } + use = use->GetNext(); + } +} + +void RegisterAllocatorGraphColor::AllocateSpillSlotForCatchPhi(HInstruction* instruction) { + if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + HPhi* phi = instruction->AsPhi(); + LiveInterval* interval = phi->GetLiveInterval(); + + HInstruction* previous_phi = phi->GetPrevious(); + DCHECK(previous_phi == nullptr || + previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) + << "Phis expected to be sorted by vreg number, " + << "so that equivalent phis are adjacent."; + + if (phi->IsVRegEquivalentOf(previous_phi)) { + // Assign the same spill slot. + DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); + interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); + } else { + interval->SetSpillSlot(catch_phi_spill_slot_counter_); + catch_phi_spill_slot_counter_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + } + } +} + +void RegisterAllocatorGraphColor::BlockRegister(Location location, + size_t start, + size_t end) { + DCHECK(location.IsRegister() || location.IsFpuRegister()); + int reg = location.reg(); + LiveInterval* interval = location.IsRegister() + ? physical_core_nodes_[reg]->GetInterval() + : physical_fp_nodes_[reg]->GetInterval(); + DCHECK(interval->GetRegister() == reg); + bool blocked_by_codegen = location.IsRegister() + ? codegen_->IsBlockedCoreRegister(reg) + : codegen_->IsBlockedFloatingPointRegister(reg); + if (blocked_by_codegen) { + // We've already blocked this register for the entire method. (And adding a + // range inside another range violates the preconditions of AddRange). + } else { + interval->AddRange(start, end); + } +} + +void RegisterAllocatorGraphColor::BlockRegisters(size_t start, size_t end, bool caller_save_only) { + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { + BlockRegister(Location::RegisterLocation(i), start, end); + } + } + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { + BlockRegister(Location::FpuRegisterLocation(i), start, end); + } + } +} + +void ColoringIteration::AddPotentialInterference(InterferenceNode* from, + InterferenceNode* to, + bool guaranteed_not_interfering_yet, + bool both_directions) { + if (from->IsPrecolored()) { + // We save space by ignoring outgoing edges from fixed nodes. + } else if (to->GetInterval()->IsSlowPathSafepoint()) { + // Safepoint intervals are only there to count max live registers, + // so no need to give them incoming interference edges. + // This is also necessary for correctness, because we don't want nodes + // to remove themselves from safepoint adjacency sets when they're pruned. + } else if (to->IsPrecolored()) { + // It is important that only a single node represents a given fixed register in the + // interference graph. We retrieve that node here. 
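BlockRegister and BlockRegisters above express fixed constraints by giving each physical register's interval a one-position range. The same bookkeeping in a self-contained form, with a vector of blocked ranges per register standing in for the fixed intervals, and a caller_save_only flag as used at call sites:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // One list of blocked [start, end) ranges per physical register.
    using BlockedRanges = std::vector<std::vector<std::pair<size_t, size_t>>>;

    // Blocks registers for the single position `call_position`..`call_position + 1`.
    // If `caller_save_only` is true, callee-save registers stay usable across the
    // call, mirroring BlockRegisters(position, position + 1, /*caller_save_only*/ true).
    void BlockAtCall(BlockedRanges* blocked,
                     const std::vector<bool>& is_callee_save,
                     size_t call_position,
                     bool caller_save_only) {
      for (size_t reg = 0; reg < blocked->size(); ++reg) {
        if (caller_save_only && is_callee_save[reg]) {
          continue;  // Callee-save registers survive the call.
        }
        (*blocked)[reg].push_back({call_position, call_position + 1});
      }
    }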
+ const ArenaVector<InterferenceNode*>& physical_nodes = to->GetInterval()->IsFloatingPoint() + ? register_allocator_->physical_fp_nodes_ + : register_allocator_->physical_core_nodes_; + InterferenceNode* physical_node = physical_nodes[to->GetInterval()->GetRegister()]; + from->AddInterference(physical_node, /*guaranteed_not_interfering_yet*/ false); + DCHECK_EQ(to->GetInterval()->GetRegister(), physical_node->GetInterval()->GetRegister()); + DCHECK_EQ(to->GetAlias(), physical_node) << "Fixed nodes should alias the canonical fixed node"; + + // If a node interferes with a fixed pair node, the weight of the edge may + // be inaccurate after using the alias of the pair node, because the alias of the pair node + // is a singular node. + // We could make special pair fixed nodes, but that ends up being too conservative because + // a node could then interfere with both {r1} and {r1,r2}, leading to a degree of + // three rather than two. + // Instead, we explicitly add an interference with the high node of the fixed pair node. + // TODO: This is too conservative at time for pair nodes, but the fact that fixed pair intervals + // can be unaligned on x86 complicates things. + if (to->IsPair()) { + InterferenceNode* high_node = + physical_nodes[to->GetInterval()->GetHighInterval()->GetRegister()]; + DCHECK_EQ(to->GetInterval()->GetHighInterval()->GetRegister(), + high_node->GetInterval()->GetRegister()); + from->AddInterference(high_node, /*guaranteed_not_interfering_yet*/ false); + } + } else { + // Standard interference between two uncolored nodes. + from->AddInterference(to, guaranteed_not_interfering_yet); + } + + if (both_directions) { + AddPotentialInterference(to, from, guaranteed_not_interfering_yet, /*both_directions*/ false); + } +} + +// Returns true if `in_node` represents an input interval of `out_node`, and the output interval +// is allowed to have the same register as the input interval. +// TODO: Ideally we should just produce correct intervals in liveness analysis. +// We would need to refactor the current live interval layout to do so, which is +// no small task. +static bool CheckInputOutputCanOverlap(InterferenceNode* in_node, InterferenceNode* out_node) { + LiveInterval* output_interval = out_node->GetInterval(); + HInstruction* defined_by = output_interval->GetDefinedBy(); + if (defined_by == nullptr) { + // This must not be a definition point. + return false; + } + + LocationSummary* locations = defined_by->GetLocations(); + if (locations->OutputCanOverlapWithInputs()) { + // This instruction does not allow the output to reuse a register from an input. + return false; + } + + LiveInterval* input_interval = in_node->GetInterval(); + LiveInterval* next_sibling = input_interval->GetNextSibling(); + size_t def_position = defined_by->GetLifetimePosition(); + size_t use_position = def_position + 1; + if (next_sibling != nullptr && next_sibling->GetStart() == use_position) { + // The next sibling starts at the use position, so reusing the input register in the output + // would clobber the input before it's moved into the sibling interval location. + return false; + } + + if (!input_interval->IsDeadAt(use_position) && input_interval->CoversSlow(use_position)) { + // The input interval is live after the use position. 
+ return false; + } + + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->GetLiveInterval()->GetSiblingAt(def_position) == input_interval) { + DCHECK(input_interval->SameRegisterKind(*output_interval)); + return true; + } + } + + // The input interval was not an input for this instruction. + return false; +} + +void ColoringIteration::BuildInterferenceGraph( + const ArenaVector<LiveInterval*>& intervals, + const ArenaVector<InterferenceNode*>& physical_nodes, + ArenaVector<InterferenceNode*>* safepoints) { + DCHECK(interval_node_map_.Empty() && prunable_nodes_.empty()); + // Build the interference graph efficiently by ordering range endpoints + // by position and doing a linear sweep to find interferences. (That is, we + // jump from endpoint to endpoint, maintaining a set of intervals live at each + // point. If two nodes are ever in the live set at the same time, then they + // interfere with each other.) + // + // We order by both position and (secondarily) by whether the endpoint + // begins or ends a range; we want to process range endings before range + // beginnings at the same position because they should not conflict. + // + // For simplicity, we create a tuple for each endpoint, and then sort the tuples. + // Tuple contents: (position, is_range_beginning, node). + ArenaVector<std::tuple<size_t, bool, InterferenceNode*>> range_endpoints( + allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // We reserve plenty of space to avoid excessive copying. + range_endpoints.reserve(4 * prunable_nodes_.size()); + + for (LiveInterval* parent : intervals) { + for (LiveInterval* sibling = parent; sibling != nullptr; sibling = sibling->GetNextSibling()) { + LiveRange* range = sibling->GetFirstRange(); + if (range != nullptr) { + InterferenceNode* node = new (allocator_) InterferenceNode( + allocator_, sibling, register_allocator_->liveness_); + interval_node_map_.Insert(std::make_pair(sibling, node)); + + if (sibling->HasRegister()) { + // Fixed nodes should alias the canonical node for the corresponding register. + node->stage = NodeStage::kPrecolored; + InterferenceNode* physical_node = physical_nodes[sibling->GetRegister()]; + node->SetAlias(physical_node); + DCHECK_EQ(node->GetInterval()->GetRegister(), + physical_node->GetInterval()->GetRegister()); + } else if (sibling->IsSlowPathSafepoint()) { + // Safepoint intervals are synthesized to count max live registers. + // They will be processed separately after coloring. + node->stage = NodeStage::kSafepoint; + safepoints->push_back(node); + } else { + node->stage = NodeStage::kPrunable; + prunable_nodes_.push_back(node); + } + + while (range != nullptr) { + range_endpoints.push_back(std::make_tuple(range->GetStart(), true, node)); + range_endpoints.push_back(std::make_tuple(range->GetEnd(), false, node)); + range = range->GetNext(); + } + } + } + } + + // Sort the endpoints. + // We explicitly ignore the third entry of each tuple (the node pointer) in order + // to maintain determinism. + std::sort(range_endpoints.begin(), range_endpoints.end(), + [] (const std::tuple<size_t, bool, InterferenceNode*>& lhs, + const std::tuple<size_t, bool, InterferenceNode*>& rhs) { + return std::tie(std::get<0>(lhs), std::get<1>(lhs)) + < std::tie(std::get<0>(rhs), std::get<1>(rhs)); + }); + + // Nodes live at the current position in the linear sweep. + ArenaVector<InterferenceNode*> live( + allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // Linear sweep. 
When we encounter the beginning of a range, we add the corresponding node to the + // live set. When we encounter the end of a range, we remove the corresponding node + // from the live set. Nodes interfere if they are in the live set at the same time. + for (auto it = range_endpoints.begin(); it != range_endpoints.end(); ++it) { + bool is_range_beginning; + InterferenceNode* node; + size_t position; + // Extract information from the tuple, including the node this tuple represents. + std::tie(position, is_range_beginning, node) = *it; + + if (is_range_beginning) { + bool guaranteed_not_interfering_yet = position == node->GetInterval()->GetStart(); + for (InterferenceNode* conflicting : live) { + DCHECK_NE(node, conflicting); + if (CheckInputOutputCanOverlap(conflicting, node)) { + // We do not add an interference, because the instruction represented by `node` allows + // its output to share a register with an input, represented here by `conflicting`. + } else { + AddPotentialInterference(node, conflicting, guaranteed_not_interfering_yet); + } + } + DCHECK(std::find(live.begin(), live.end(), node) == live.end()); + live.push_back(node); + } else { + // End of range. + auto live_it = std::find(live.begin(), live.end(), node); + DCHECK(live_it != live.end()); + live.erase(live_it); + } + } + DCHECK(live.empty()); +} + +void ColoringIteration::CreateCoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position) { + DCHECK_EQ(a->IsPair(), b->IsPair()) + << "Nodes of different memory widths should never be coalesced"; + CoalesceOpportunity* opportunity = + new (allocator_) CoalesceOpportunity(a, b, kind, position, register_allocator_->liveness_); + a->AddCoalesceOpportunity(opportunity); + b->AddCoalesceOpportunity(opportunity); + coalesce_worklist_.push(opportunity); +} + +// When looking for coalesce opportunities, we use the interval_node_map_ to find the node +// corresponding to an interval. Note that not all intervals are in this map, notably the parents +// of constants and stack arguments. (However, these interval should not be involved in coalesce +// opportunities anyway, because they're not going to be in registers.) +void ColoringIteration::FindCoalesceOpportunities() { + DCHECK(coalesce_worklist_.empty()); + + for (InterferenceNode* node : prunable_nodes_) { + LiveInterval* interval = node->GetInterval(); + + // Coalesce siblings. + LiveInterval* next_sibling = interval->GetNextSibling(); + if (next_sibling != nullptr && interval->GetEnd() == next_sibling->GetStart()) { + auto it = interval_node_map_.Find(next_sibling); + if (it != interval_node_map_.end()) { + InterferenceNode* sibling_node = it->second; + CreateCoalesceOpportunity(node, + sibling_node, + CoalesceKind::kAdjacentSibling, + interval->GetEnd()); + } + } + + // Coalesce fixed outputs with this interval if this interval is an adjacent sibling. + LiveInterval* parent = interval->GetParent(); + if (parent->HasRegister() + && parent->GetNextSibling() == interval + && parent->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(parent); + if (it != interval_node_map_.end()) { + InterferenceNode* parent_node = it->second; + CreateCoalesceOpportunity(node, + parent_node, + CoalesceKind::kFixedOutputSibling, + parent->GetEnd()); + } + } + + // Try to prevent moves across blocks. + // Note that this does not lead to many succeeding coalesce attempts, so could be removed + // if found to add to compile time. 
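The sweep just described is easy to reproduce in isolation. The sketch below uses integer node ids in place of InterferenceNode and omits ART's refinements (edge weights, fixed-node aliasing, the input/output overlap exception); it sorts (position, is_range_beginning) endpoints so that endings are processed before beginnings at the same position, and records an interference whenever a range begins while others are live:

    #include <algorithm>
    #include <cstddef>
    #include <set>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct Range {
      size_t start;  // inclusive
      size_t end;    // exclusive, strictly greater than start
      int node;      // id of the owning node
    };

    // Returns the interfering node pairs (smaller id first).
    std::set<std::pair<int, int>> BuildInterferences(const std::vector<Range>& ranges) {
      // (position, is_range_beginning, node); false sorts before true, so a range
      // ending at position p does not interfere with a range beginning at p.
      std::vector<std::tuple<size_t, bool, int>> endpoints;
      for (const Range& r : ranges) {
        endpoints.push_back({r.start, true, r.node});
        endpoints.push_back({r.end, false, r.node});
      }
      std::sort(endpoints.begin(), endpoints.end());

      std::set<std::pair<int, int>> interferences;
      std::vector<int> live;  // Nodes whose ranges cover the current position.
      for (const auto& endpoint : endpoints) {
        bool is_beginning = std::get<1>(endpoint);
        int node = std::get<2>(endpoint);
        if (is_beginning) {
          for (int other : live) {
            interferences.insert({std::min(node, other), std::max(node, other)});
          }
          live.push_back(node);
        } else {
          live.erase(std::find(live.begin(), live.end(), node));
        }
      }
      return interferences;
    }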
+ const SsaLivenessAnalysis& liveness = register_allocator_->liveness_; + if (interval->IsSplit() && liveness.IsAtBlockBoundary(interval->GetStart() / 2)) { + // If the start of this interval is at a block boundary, we look at the + // location of the interval in blocks preceding the block this interval + // starts at. This can avoid a move between the two blocks. + HBasicBlock* block = liveness.GetBlockFromPosition(interval->GetStart() / 2); + for (HBasicBlock* predecessor : block->GetPredecessors()) { + size_t position = predecessor->GetLifetimeEnd() - 1; + LiveInterval* existing = interval->GetParent()->GetSiblingAt(position); + if (existing != nullptr) { + auto it = interval_node_map_.Find(existing); + if (it != interval_node_map_.end()) { + InterferenceNode* existing_node = it->second; + CreateCoalesceOpportunity(node, + existing_node, + CoalesceKind::kNonlinearControlFlow, + position); + } + } + } + } + + // Coalesce phi inputs with the corresponding output. + HInstruction* defined_by = interval->GetDefinedBy(); + if (defined_by != nullptr && defined_by->IsPhi()) { + const ArenaVector<HBasicBlock*>& predecessors = defined_by->GetBlock()->GetPredecessors(); + HInputsRef inputs = defined_by->GetInputs(); + + for (size_t i = 0, e = inputs.size(); i < e; ++i) { + // We want the sibling at the end of the appropriate predecessor block. + size_t position = predecessors[i]->GetLifetimeEnd() - 1; + LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(position); + + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, input_node, CoalesceKind::kPhi, position); + } + } + } + + // Coalesce output with first input when policy is kSameAsFirstInput. + if (defined_by != nullptr) { + Location out = defined_by->GetLocations()->Out(); + if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) { + LiveInterval* input_interval + = defined_by->InputAt(0)->GetLiveInterval()->GetSiblingAt(interval->GetStart() - 1); + // TODO: Could we consider lifetime holes here? + if (input_interval->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kFirstInput, + interval->GetStart()); + } + } + } + } + + // An interval that starts an instruction (that is, it is not split), may + // re-use the registers used by the inputs of that instruction, based on the + // location summary. + if (defined_by != nullptr) { + DCHECK(!interval->IsSplit()); + LocationSummary* locations = defined_by->GetLocations(); + if (!locations->OutputCanOverlapWithInputs()) { + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + size_t def_point = defined_by->GetLifetimePosition(); + // TODO: Getting the sibling at the def_point might not be quite what we want + // for fixed inputs, since the use will be *at* the def_point rather than after. 
+ LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(def_point); + if (input_interval != nullptr && + input_interval->HasHighInterval() == interval->HasHighInterval()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kAnyInput, + interval->GetStart()); + } + } + } + } + } + + // Try to prevent moves into fixed input locations. + UsePosition* use = interval->GetFirstUse(); + for (; use != nullptr && use->GetPosition() <= interval->GetStart(); use = use->GetNext()) { + // Skip past uses before the start of this interval. + } + for (; use != nullptr && use->GetPosition() <= interval->GetEnd(); use = use->GetNext()) { + HInstruction* user = use->GetUser(); + if (user == nullptr) { + // User may be null for certain intervals, such as temp intervals. + continue; + } + LocationSummary* locations = user->GetLocations(); + Location input = locations->InAt(use->GetInputIndex()); + if (input.IsRegister() || input.IsFpuRegister()) { + // TODO: Could try to handle pair interval too, but coalescing with fixed pair nodes + // is currently not supported. + InterferenceNode* fixed_node = input.IsRegister() + ? register_allocator_->physical_core_nodes_[input.reg()] + : register_allocator_->physical_fp_nodes_[input.reg()]; + CreateCoalesceOpportunity(node, + fixed_node, + CoalesceKind::kFixedInput, + user->GetLifetimePosition()); + } + } + } // for node in prunable_nodes +} + +static bool IsLowDegreeNode(InterferenceNode* node, size_t num_regs) { + return node->GetOutDegree() < num_regs; +} + +static bool IsHighDegreeNode(InterferenceNode* node, size_t num_regs) { + return !IsLowDegreeNode(node, num_regs); +} + +void ColoringIteration::PruneInterferenceGraph() { + DCHECK(pruned_nodes_.empty() + && simplify_worklist_.empty() + && freeze_worklist_.empty() + && spill_worklist_.empty()); + // When pruning the graph, we refer to nodes with degree less than num_regs as low degree nodes, + // and all others as high degree nodes. The distinction is important: low degree nodes are + // guaranteed a color, while high degree nodes are not. + + // Build worklists. Note that the coalesce worklist has already been + // filled by FindCoalesceOpportunities(). + for (InterferenceNode* node : prunable_nodes_) { + DCHECK(!node->IsPrecolored()) << "Fixed nodes should never be pruned"; + DCHECK(!node->GetInterval()->IsSlowPathSafepoint()) << "Safepoint nodes should never be pruned"; + if (IsLowDegreeNode(node, num_regs_)) { + if (node->GetCoalesceOpportunities().empty()) { + // Simplify Worklist. + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); + } else { + // Freeze Worklist. + node->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(node); + } + } else { + // Spill worklist. + node->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(node); + } + } + + // Prune graph. + // Note that we do not remove a node from its current worklist if it moves to another, so it may + // be in multiple worklists at once; the node's `phase` says which worklist it is really in. + while (true) { + if (!simplify_worklist_.empty()) { + // Prune low-degree nodes. + // TODO: pop_back() should work as well, but it didn't; we get a + // failed check while pruning. We should look into this. 
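As background for this worklist-driven pruning: stripped of coalescing, freezing and spill-cost ordering, the phase reduces to the classic Chaitin-Briggs simplify step. A compact standalone version over an adjacency-set graph, kept only to show where the stack of pruned nodes comes from:

    #include <cstddef>
    #include <set>
    #include <vector>

    // Repeatedly prune a node with fewer than `k` neighbors, recording the prune
    // order (coloring later happens in reverse of this order). Nodes that remain
    // unpruned would be spill candidates in a full allocator.
    std::vector<int> SimplifyPhase(std::vector<std::set<int>>& graph, size_t k) {
      std::vector<int> prune_order;
      std::vector<bool> pruned(graph.size(), false);
      bool changed = true;
      while (changed) {
        changed = false;
        for (int node = 0; node < static_cast<int>(graph.size()); ++node) {
          if (!pruned[node] && graph[node].size() < k) {
            // Remove the node from the graph and remember when it was pruned.
            for (int neighbor : graph[node]) {
              graph[neighbor].erase(node);
            }
            graph[node].clear();
            pruned[node] = true;
            prune_order.push_back(node);
            changed = true;
          }
        }
      }
      return prune_order;
    }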
+ InterferenceNode* node = simplify_worklist_.front(); + simplify_worklist_.pop_front(); + DCHECK_EQ(node->stage, NodeStage::kSimplifyWorklist) << "Cannot move from simplify list"; + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in simplify list should be low degree"; + DCHECK(!node->IsMoveRelated()) << "Nodes in simplify list should not be move related"; + PruneNode(node); + } else if (!coalesce_worklist_.empty()) { + // Coalesce. + CoalesceOpportunity* opportunity = coalesce_worklist_.top(); + coalesce_worklist_.pop(); + if (opportunity->stage == CoalesceStage::kWorklist) { + Coalesce(opportunity); + } + } else if (!freeze_worklist_.empty()) { + // Freeze moves and prune a low-degree move-related node. + InterferenceNode* node = freeze_worklist_.front(); + freeze_worklist_.pop_front(); + if (node->stage == NodeStage::kFreezeWorklist) { + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in freeze list should be low degree"; + DCHECK(node->IsMoveRelated()) << "Nodes in freeze list should be move related"; + FreezeMoves(node); + PruneNode(node); + } + } else if (!spill_worklist_.empty()) { + // We spill the lowest-priority node, because pruning a node earlier + // gives it a higher chance of being spilled. + InterferenceNode* node = spill_worklist_.top(); + spill_worklist_.pop(); + if (node->stage == NodeStage::kSpillWorklist) { + DCHECK_GE(node->GetOutDegree(), num_regs_) << "Nodes in spill list should be high degree"; + FreezeMoves(node); + PruneNode(node); + } + } else { + // Pruning complete. + break; + } + } + DCHECK_EQ(prunable_nodes_.size(), pruned_nodes_.size()); +} + +void ColoringIteration::EnableCoalesceOpportunities(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kActive) { + opportunity->stage = CoalesceStage::kWorklist; + coalesce_worklist_.push(opportunity); + } + } +} + +void ColoringIteration::PruneNode(InterferenceNode* node) { + DCHECK_NE(node->stage, NodeStage::kPruned); + DCHECK(!node->IsPrecolored()); + node->stage = NodeStage::kPruned; + pruned_nodes_.push(node); + + for (InterferenceNode* adj : node->GetAdjacentNodes()) { + DCHECK(!adj->GetInterval()->IsSlowPathSafepoint()) + << "Nodes should never interfere with synthesized safepoint nodes"; + DCHECK_NE(adj->stage, NodeStage::kPruned) << "Should be no interferences with pruned nodes"; + + if (adj->IsPrecolored()) { + // No effect on pre-colored nodes; they're never pruned. + } else { + // Remove the interference. + bool was_high_degree = IsHighDegreeNode(adj, num_regs_); + DCHECK(adj->ContainsInterference(node)) + << "Missing reflexive interference from non-fixed node"; + adj->RemoveInterference(node); + + // Handle transitions from high degree to low degree. 
+ if (was_high_degree && IsLowDegreeNode(adj, num_regs_)) { + EnableCoalesceOpportunities(adj); + for (InterferenceNode* adj_adj : adj->GetAdjacentNodes()) { + EnableCoalesceOpportunities(adj_adj); + } + + DCHECK_EQ(adj->stage, NodeStage::kSpillWorklist); + if (adj->IsMoveRelated()) { + adj->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(adj); + } else { + adj->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(adj); + } + } + } + } +} + +void ColoringIteration::CheckTransitionFromFreezeWorklist(InterferenceNode* node) { + if (IsLowDegreeNode(node, num_regs_) && !node->IsMoveRelated()) { + DCHECK_EQ(node->stage, NodeStage::kFreezeWorklist); + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); + } +} + +void ColoringIteration::FreezeMoves(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + // Constrained moves should remain constrained, since they will not be considered + // during last-chance coalescing. + } else { + opportunity->stage = CoalesceStage::kInactive; + } + InterferenceNode* other = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias() + : opportunity->node_a->GetAlias(); + if (other != node && other->stage == NodeStage::kFreezeWorklist) { + DCHECK(IsLowDegreeNode(node, num_regs_)); + CheckTransitionFromFreezeWorklist(other); + } + } +} + +bool ColoringIteration::PrecoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (!into->IsPrecolored()) { + // The uncolored heuristic will cover this case. + return false; + } + if (from->IsPair() || into->IsPair()) { + // TODO: Merging from a pair node is currently not supported, since fixed pair nodes + // are currently represented as two single fixed nodes in the graph, and `into` is + // only one of them. (We may lose the implicit connections to the second one in a merge.) + return false; + } + + // If all adjacent nodes of `from` are "ok", then we can conservatively merge with `into`. + // Reasons an adjacent node `adj` can be "ok": + // (1) If `adj` is low degree, interference with `into` will not affect its existing + // colorable guarantee. (Notice that coalescing cannot increase its degree.) + // (2) If `adj` is pre-colored, it already interferes with `into`. See (3). + // (3) If there's already an interference with `into`, coalescing will not add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsLowDegreeNode(adj, num_regs_) || adj->IsPrecolored() || adj->ContainsInterference(into)) { + // Ok. + } else { + return false; + } + } + return true; +} + +bool ColoringIteration::UncoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (into->IsPrecolored()) { + // The pre-colored heuristic will handle this case. + return false; + } + + // Arbitrary cap to improve compile time. Tests show that this has negligible affect + // on generated code. + if (from->GetOutDegree() + into->GetOutDegree() > 2 * num_regs_) { + return false; + } + + // It's safe to coalesce two nodes if the resulting node has fewer than `num_regs` neighbors + // of high degree. (Low degree neighbors can be ignored, because they will eventually be + // pruned from the interference graph in the simplify stage.) 
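The counting below implements a weighted form of the Briggs test. For reference, the unweighted textbook version on a plain adjacency-set graph is shown here: merging a and b is considered safe when the combined node would have fewer than k neighbors of significant (>= k) degree.

    #include <cstddef>
    #include <set>
    #include <vector>

    // Briggs conservative coalescing test, without ART's edge weights or the
    // already-counted-neighbor refinement used in the surrounding code.
    bool BriggsCanCoalesce(const std::vector<std::set<int>>& graph, int a, int b, size_t k) {
      std::set<int> combined_neighbors(graph[a].begin(), graph[a].end());
      combined_neighbors.insert(graph[b].begin(), graph[b].end());
      combined_neighbors.erase(a);
      combined_neighbors.erase(b);

      size_t significant_degree_neighbors = 0;
      for (int neighbor : combined_neighbors) {
        if (graph[neighbor].size() >= k) {
          ++significant_degree_neighbors;
        }
      }
      return significant_degree_neighbors < k;
    }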
+ size_t high_degree_interferences = 0; + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + high_degree_interferences += from->EdgeWeightWith(adj); + } + } + for (InterferenceNode* adj : into->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + if (from->ContainsInterference(adj)) { + // We've already counted this adjacent node. + // Furthermore, its degree will decrease if coalescing succeeds. Thus, it's possible that + // we should not have counted it at all. (This extends the textbook Briggs coalescing test, + // but remains conservative.) + if (adj->GetOutDegree() - into->EdgeWeightWith(adj) < num_regs_) { + high_degree_interferences -= from->EdgeWeightWith(adj); + } + } else { + high_degree_interferences += into->EdgeWeightWith(adj); + } + } + } + + return high_degree_interferences < num_regs_; +} + +void ColoringIteration::Combine(InterferenceNode* from, + InterferenceNode* into) { + from->SetAlias(into); + + // Add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + bool was_low_degree = IsLowDegreeNode(adj, num_regs_); + AddPotentialInterference(adj, into, /*guaranteed_not_interfering_yet*/ false); + if (was_low_degree && IsHighDegreeNode(adj, num_regs_)) { + // This is a (temporary) transition to a high degree node. Its degree will decrease again + // when we prune `from`, but it's best to be consistent about the current worklist. + adj->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(adj); + } + } + + // Add coalesce opportunities. + for (CoalesceOpportunity* opportunity : from->GetCoalesceOpportunities()) { + if (opportunity->stage != CoalesceStage::kDefunct) { + into->AddCoalesceOpportunity(opportunity); + } + } + EnableCoalesceOpportunities(from); + + // Prune and update worklists. + PruneNode(from); + if (IsLowDegreeNode(into, num_regs_)) { + // Coalesce(...) takes care of checking for a transition to the simplify worklist. + DCHECK_EQ(into->stage, NodeStage::kFreezeWorklist); + } else if (into->stage == NodeStage::kFreezeWorklist) { + // This is a transition to a high degree node. + into->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(into); + } else { + DCHECK(into->stage == NodeStage::kSpillWorklist || into->stage == NodeStage::kPrecolored); + } +} + +void ColoringIteration::Coalesce(CoalesceOpportunity* opportunity) { + InterferenceNode* from = opportunity->node_a->GetAlias(); + InterferenceNode* into = opportunity->node_b->GetAlias(); + DCHECK_NE(from->stage, NodeStage::kPruned); + DCHECK_NE(into->stage, NodeStage::kPruned); + + if (from->IsPrecolored()) { + // If we have one pre-colored node, make sure it's the `into` node. + std::swap(from, into); + } + + if (from == into) { + // These nodes have already been coalesced. + opportunity->stage = CoalesceStage::kDefunct; + CheckTransitionFromFreezeWorklist(from); + } else if (from->IsPrecolored() || from->ContainsInterference(into)) { + // These nodes interfere. + opportunity->stage = CoalesceStage::kDefunct; + CheckTransitionFromFreezeWorklist(from); + CheckTransitionFromFreezeWorklist(into); + } else if (PrecoloredHeuristic(from, into) + || UncoloredHeuristic(from, into)) { + // We can coalesce these nodes. + opportunity->stage = CoalesceStage::kDefunct; + Combine(from, into); + CheckTransitionFromFreezeWorklist(into); + } else { + // We cannot coalesce, but we may be able to later. 
+      opportunity->stage = CoalesceStage::kActive;
+  }
+}
+
+// Build a mask with a bit set for each register assigned to some
+// interval in `intervals`.
+template <typename Container>
+static std::bitset<kMaxNumRegs> BuildConflictMask(Container& intervals) {
+  std::bitset<kMaxNumRegs> conflict_mask;
+  for (InterferenceNode* adjacent : intervals) {
+    LiveInterval* conflicting = adjacent->GetInterval();
+    if (conflicting->HasRegister()) {
+      conflict_mask.set(conflicting->GetRegister());
+      if (conflicting->HasHighInterval()) {
+        DCHECK(conflicting->GetHighInterval()->HasRegister());
+        conflict_mask.set(conflicting->GetHighInterval()->GetRegister());
+      }
+    } else {
+      DCHECK(!conflicting->HasHighInterval()
+          || !conflicting->GetHighInterval()->HasRegister());
+    }
+  }
+  return conflict_mask;
+}
+
+bool RegisterAllocatorGraphColor::IsCallerSave(size_t reg, bool processing_core_regs) {
+  return processing_core_regs
+      ? !codegen_->IsCoreCalleeSaveRegister(reg)
+      : !codegen_->IsFloatingPointCalleeSaveRegister(reg);
+}
+
+static bool RegisterIsAligned(size_t reg) {
+  return reg % 2 == 0;
+}
+
+static size_t FindFirstZeroInConflictMask(std::bitset<kMaxNumRegs> conflict_mask) {
+  // We use CTZ (count trailing zeros) to quickly find the lowest 0 bit.
+  // Note that CTZ is undefined if all bits are 0, so we special-case it.
+  return conflict_mask.all() ? conflict_mask.size() : CTZ(~conflict_mask.to_ulong());
+}
+
+bool ColoringIteration::ColorInterferenceGraph() {
+  DCHECK_LE(num_regs_, kMaxNumRegs) << "kMaxNumRegs is too small";
+  ArenaVector<LiveInterval*> colored_intervals(
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
+  bool successful = true;
+
+  while (!pruned_nodes_.empty()) {
+    InterferenceNode* node = pruned_nodes_.top();
+    pruned_nodes_.pop();
+    LiveInterval* interval = node->GetInterval();
+    size_t reg = 0;
+
+    InterferenceNode* alias = node->GetAlias();
+    if (alias != node) {
+      // This node was coalesced with another.
+      LiveInterval* alias_interval = alias->GetInterval();
+      if (alias_interval->HasRegister()) {
+        reg = alias_interval->GetRegister();
+        DCHECK(!BuildConflictMask(node->GetAdjacentNodes())[reg])
+            << "This node conflicts with the register it was coalesced with";
+      } else {
+        DCHECK(false) << node->GetOutDegree() << " " << alias->GetOutDegree() << " "
+            << "Move coalescing was not conservative, causing a node to be coalesced "
+            << "with another node that could not be colored";
+        if (interval->RequiresRegister()) {
+          successful = false;
+        }
+      }
+    } else {
+      // Search for free register(s).
+      std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(node->GetAdjacentNodes());
+      if (interval->HasHighInterval()) {
+        // Note that the graph coloring allocator assumes that pair intervals are aligned here,
+        // excluding pre-colored pair intervals (which can currently be unaligned on x86). If we
+        // change the alignment requirements here, we will have to update the algorithm (e.g.,
+        // be more conservative about the weight of edges adjacent to pair nodes.)
+        while (reg < num_regs_ - 1 && (conflict_mask[reg] || conflict_mask[reg + 1])) {
+          reg += 2;
+        }
+
+        // Try to use a caller-save register first.
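Before the caller-save preference pass below, it may help to see FindFirstZeroInConflictMask in a standalone form. The sketch assumes a GCC/Clang-style __builtin_ctzl for the count-trailing-zeros step and a 32-register mask:

    #include <bitset>
    #include <cstddef>

    constexpr size_t kMaxRegs = 32;

    // Returns the lowest register whose bit is clear in `conflict_mask`, or
    // kMaxRegs if every register conflicts. Count-trailing-zeros is undefined on
    // zero, which can only happen when the mask is all ones, so handle that first.
    size_t FirstFreeRegister(const std::bitset<kMaxRegs>& conflict_mask) {
      if (conflict_mask.all()) {
        return kMaxRegs;
      }
      return static_cast<size_t>(__builtin_ctzl(~conflict_mask.to_ulong()));
    }

    // Example: with registers 0, 1 and 3 taken, FirstFreeRegister returns 2.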
+ for (size_t i = 0; i < num_regs_ - 1; i += 2) { + bool low_caller_save = register_allocator_->IsCallerSave(i, processing_core_regs_); + bool high_caller_save = register_allocator_->IsCallerSave(i + 1, processing_core_regs_); + if (!conflict_mask[i] && !conflict_mask[i + 1]) { + if (low_caller_save && high_caller_save) { + reg = i; + break; + } else if (low_caller_save || high_caller_save) { + reg = i; + // Keep looking to try to get both parts in caller-save registers. + } + } + } + } else { + // Not a pair interval. + reg = FindFirstZeroInConflictMask(conflict_mask); + + // Try to use caller-save registers first. + for (size_t i = 0; i < num_regs_; ++i) { + if (!conflict_mask[i] && register_allocator_->IsCallerSave(i, processing_core_regs_)) { + reg = i; + break; + } + } + } + + // Last-chance coalescing. + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + continue; + } + LiveInterval* other_interval = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias()->GetInterval() + : opportunity->node_a->GetAlias()->GetInterval(); + if (other_interval->HasRegister()) { + size_t coalesce_register = other_interval->GetRegister(); + if (interval->HasHighInterval()) { + if (!conflict_mask[coalesce_register] && + !conflict_mask[coalesce_register + 1] && + RegisterIsAligned(coalesce_register)) { + reg = coalesce_register; + break; + } + } else if (!conflict_mask[coalesce_register]) { + reg = coalesce_register; + break; + } + } + } + } + + if (reg < (interval->HasHighInterval() ? num_regs_ - 1 : num_regs_)) { + // Assign register. + DCHECK(!interval->HasRegister()); + interval->SetRegister(reg); + colored_intervals.push_back(interval); + if (interval->HasHighInterval()) { + DCHECK(!interval->GetHighInterval()->HasRegister()); + interval->GetHighInterval()->SetRegister(reg + 1); + colored_intervals.push_back(interval->GetHighInterval()); + } + } else if (interval->RequiresRegister()) { + // The interference graph is too dense to color. Make it sparser by + // splitting this live interval. + successful = false; + register_allocator_->SplitAtRegisterUses(interval); + // We continue coloring, because there may be additional intervals that cannot + // be colored, and that we should split. + } else { + // Spill. + node->SetNeedsSpillSlot(); + } + } + + // If unsuccessful, reset all register assignments. + if (!successful) { + for (LiveInterval* interval : colored_intervals) { + interval->ClearRegister(); + } + } + + return successful; +} + +size_t RegisterAllocatorGraphColor::ComputeMaxSafepointLiveRegisters( + const ArenaVector<InterferenceNode*>& safepoints) { + size_t max_safepoint_live_regs = 0; + for (InterferenceNode* safepoint : safepoints) { + DCHECK(safepoint->GetInterval()->IsSlowPathSafepoint()); + std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(safepoint->GetAdjacentNodes()); + size_t live_regs = conflict_mask.count(); + max_safepoint_live_regs = std::max(max_safepoint_live_regs, live_regs); + } + return max_safepoint_live_regs; +} + +void RegisterAllocatorGraphColor::AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes) { + // The register allocation resolver will organize the stack based on value type, + // so we assign stack slots for each value type separately. 
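ComputeMaxSafepointLiveRegisters, shown a few lines up, boils down to a population count per safepoint. A sketch with plain bitsets in place of the safepoint nodes' adjacency sets:

    #include <algorithm>
    #include <bitset>
    #include <cstddef>
    #include <vector>

    constexpr size_t kMaxRegs = 32;

    // Each entry is the mask of registers holding live values at one safepoint;
    // the result bounds the register save area needed by slow paths.
    size_t MaxLiveRegistersAtSafepoints(const std::vector<std::bitset<kMaxRegs>>& safepoint_masks) {
      size_t max_live = 0;
      for (const std::bitset<kMaxRegs>& mask : safepoint_masks) {
        max_live = std::max(max_live, mask.count());
      }
      return max_live;
    }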
+ ArenaVector<LiveInterval*> double_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> long_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> float_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<LiveInterval*> int_intervals(allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // The set of parent intervals already handled. + ArenaSet<LiveInterval*> seen(allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // Find nodes that need spill slots. + for (InterferenceNode* node : nodes) { + if (!node->NeedsSpillSlot()) { + continue; + } + + LiveInterval* parent = node->GetInterval()->GetParent(); + if (seen.find(parent) != seen.end()) { + // We've already handled this interval. + // This can happen if multiple siblings of the same interval request a stack slot. + continue; + } + seen.insert(parent); + + HInstruction* defined_by = parent->GetDefinedBy(); + if (parent->HasSpillSlot()) { + // We already have a spill slot for this value that we can reuse. + } else if (defined_by->IsParameterValue()) { + // Parameters already have a stack slot. + parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); + } else if (defined_by->IsCurrentMethod()) { + // The current method is always at stack slot 0. + parent->SetSpillSlot(0); + } else if (defined_by->IsConstant()) { + // Constants don't need a spill slot. + } else { + // We need to find a spill slot for this interval. Place it in the correct + // worklist to be processed later. + switch (node->GetInterval()->GetType()) { + case Primitive::kPrimDouble: + double_intervals.push_back(parent); + break; + case Primitive::kPrimLong: + long_intervals.push_back(parent); + break; + case Primitive::kPrimFloat: + float_intervals.push_back(parent); + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + int_intervals.push_back(parent); + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << node->GetInterval()->GetType(); + UNREACHABLE(); + } + } + } + + // Color spill slots for each value type. + ColorSpillSlots(&double_intervals, &num_double_spill_slots_); + ColorSpillSlots(&long_intervals, &num_long_spill_slots_); + ColorSpillSlots(&float_intervals, &num_float_spill_slots_); + ColorSpillSlots(&int_intervals, &num_int_spill_slots_); +} + +void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* intervals, + size_t* num_stack_slots_used) { + // We cannot use the original interference graph here because spill slots are assigned to + // all of the siblings of an interval, whereas an interference node represents only a single + // sibling. So, we assign spill slots linear-scan-style by sorting all the interval endpoints + // by position, and assigning the lowest spill slot available when we encounter an interval + // beginning. We ignore lifetime holes for simplicity. 
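The slot assignment just described can be sketched independently of LiveInterval. The version below sorts (position, is_beginning) endpoints, hands the lowest free slot to each value at its start and releases it at its end; the pair-slot sizing and alignment handled by the real code are left out:

    #include <algorithm>
    #include <cstddef>
    #include <tuple>
    #include <vector>

    struct SpillInterval {
      size_t start;  // inclusive, strictly less than end
      size_t end;    // exclusive, end of the last sibling
      size_t slot;   // output: assigned spill slot
    };

    // Assigns each interval the lowest spill slot free over its whole lifetime.
    // Returns the number of slots used.
    size_t AssignSpillSlots(std::vector<SpillInterval>* intervals) {
      std::vector<std::tuple<size_t, bool, size_t>> endpoints;  // (position, is_beginning, index)
      for (size_t i = 0; i < intervals->size(); ++i) {
        endpoints.push_back({(*intervals)[i].start, true, i});
        endpoints.push_back({(*intervals)[i].end, false, i});
      }
      // Ends sort before beginnings at the same position, so back-to-back
      // lifetimes can share a slot.
      std::sort(endpoints.begin(), endpoints.end());

      std::vector<bool> taken;
      size_t slots_used = 0;
      for (const auto& endpoint : endpoints) {
        bool is_beginning = std::get<1>(endpoint);
        SpillInterval& interval = (*intervals)[std::get<2>(endpoint)];
        if (is_beginning) {
          size_t slot = 0;
          while (slot < taken.size() && taken[slot]) {
            ++slot;  // Skip taken slots.
          }
          if (slot == taken.size()) {
            taken.push_back(false);
          }
          taken[slot] = true;
          interval.slot = slot;
          slots_used = std::max(slots_used, slot + 1);
        } else {
          taken[interval.slot] = false;  // The lifetime ended; free its slot.
        }
      }
      return slots_used;
    }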
+  ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints(
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
+
+  for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) {
+    LiveInterval* parent_interval = *it;
+    DCHECK(parent_interval->IsParent());
+    DCHECK(!parent_interval->HasSpillSlot());
+    size_t start = parent_interval->GetStart();
+    size_t end = parent_interval->GetLastSibling()->GetEnd();
+    DCHECK_LT(start, end);
+    interval_endpoints.push_back(std::make_tuple(start, true, parent_interval));
+    interval_endpoints.push_back(std::make_tuple(end, false, parent_interval));
+  }
+
+  // Sort by position.
+  // We explicitly ignore the third entry of each tuple (the interval pointer) in order
+  // to maintain determinism.
+  std::sort(interval_endpoints.begin(), interval_endpoints.end(),
+            [] (const std::tuple<size_t, bool, LiveInterval*>& lhs,
+                const std::tuple<size_t, bool, LiveInterval*>& rhs) {
+    return std::tie(std::get<0>(lhs), std::get<1>(lhs))
+         < std::tie(std::get<0>(rhs), std::get<1>(rhs));
+  });
+
+  ArenaBitVector taken(allocator_, 0, true);
+  for (auto it = interval_endpoints.begin(), end = interval_endpoints.end(); it != end; ++it) {
+    // Extract information from the current tuple.
+    LiveInterval* parent_interval;
+    bool is_interval_beginning;
+    size_t position;
+    std::tie(position, is_interval_beginning, parent_interval) = *it;
+
+    bool needs_two_slots = parent_interval->NeedsTwoSpillSlots();
+
+    if (is_interval_beginning) {
+      DCHECK(!parent_interval->HasSpillSlot());
+      DCHECK_EQ(position, parent_interval->GetStart());
+
+      // Find a free stack slot.
+      size_t slot = 0;
+      for (; taken.IsBitSet(slot) || (needs_two_slots && taken.IsBitSet(slot + 1)); ++slot) {
+        // Skip taken slots.
+      }
+      parent_interval->SetSpillSlot(slot);
+
+      *num_stack_slots_used = std::max(*num_stack_slots_used,
+                                       needs_two_slots ? slot + 2 : slot + 1);
+      if (needs_two_slots && *num_stack_slots_used % 2 != 0) {
+        // The parallel move resolver requires that there be an even number of spill slots
+        // allocated for pair value types.
+        ++(*num_stack_slots_used);
+      }
+
+      taken.SetBit(slot);
+      if (needs_two_slots) {
+        taken.SetBit(slot + 1);
+      }
+    } else {
+      DCHECK_EQ(position, parent_interval->GetLastSibling()->GetEnd());
+      DCHECK(parent_interval->HasSpillSlot());
+
+      // Free up the stack slot used by this interval.
+      size_t slot = parent_interval->GetSpillSlot();
+      DCHECK(taken.IsBitSet(slot));
+      DCHECK(!needs_two_slots || taken.IsBitSet(slot + 1));
+      taken.ClearBit(slot);
+      if (needs_two_slots) {
+        taken.ClearBit(slot + 1);
+      }
+    }
+  }
+  DCHECK_EQ(taken.NumSetBits(), 0u);
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/register_allocator_graph_color.h b/compiler/optimizing/register_allocator_graph_color.h
new file mode 100644
index 0000000000..ed12561d2c
--- /dev/null
+++ b/compiler/optimizing/register_allocator_graph_color.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ + +#include "arch/instruction_set.h" +#include "base/arena_containers.h" +#include "base/arena_object.h" +#include "base/macros.h" +#include "primitive.h" +#include "register_allocator.h" + +namespace art { + +class CodeGenerator; +class HBasicBlock; +class HGraph; +class HInstruction; +class HParallelMove; +class Location; +class SsaLivenessAnalysis; +class InterferenceNode; +struct CoalesceOpportunity; +enum class CoalesceKind; + +/** + * A graph coloring register allocator. + * + * The algorithm proceeds as follows: + * (1) Build an interference graph, where nodes represent live intervals, and edges represent + * interferences between two intervals. Coloring this graph with k colors is isomorphic to + * finding a valid register assignment with k registers. + * (2) To color the graph, first prune all nodes with degree less than k, since these nodes are + * guaranteed a color. (No matter how we color their adjacent nodes, we can give them a + * different color.) As we prune nodes from the graph, more nodes may drop below degree k, + * enabling further pruning. The key is to maintain the pruning order in a stack, so that we + * can color the nodes in the reverse order. + * When there are no more nodes with degree less than k, we start pruning alternate nodes based + * on heuristics. Since these nodes are not guaranteed a color, we are careful to + * prioritize nodes that require a register. We also prioritize short intervals, because + * short intervals cannot be split very much if coloring fails (see below). "Prioritizing" + * a node amounts to pruning it later, since it will have fewer interferences if we prune other + * nodes first. + * (3) We color nodes in the reverse order in which we pruned them. If we cannot assign + * a node a color, we do one of two things: + * - If the node requires a register, we consider the current coloring attempt a failure. + * However, we split the node's live interval in order to make the interference graph + * sparser, so that future coloring attempts may succeed. + * - If the node does not require a register, we simply assign it a location on the stack. + * + * If iterative move coalescing is enabled, the algorithm also attempts to conservatively + * combine nodes in the graph that would prefer to have the same color. (For example, the output + * of a phi instruction would prefer to have the same register as at least one of its inputs.) + * There are several additional steps involved with this: + * - We look for coalesce opportunities by examining each live interval, a step similar to that + * used by linear scan when looking for register hints. + * - When pruning the graph, we maintain a worklist of coalesce opportunities, as well as a worklist + * of low degree nodes that have associated coalesce opportunities. Only when we run out of + * coalesce opportunities do we start pruning coalesce-associated nodes. + * - When pruning a node, if any nodes transition from high degree to low degree, we add + * associated coalesce opportunities to the worklist, since these opportunities may now succeed. + * - Whether two nodes can be combined is decided by two different heuristics--one used when + * coalescing uncolored nodes, and one used for coalescing an uncolored node with a colored node. + * It is vital that we only combine two nodes if the node that remains is guaranteed to receive + * a color. 
This is because additionally spilling is more costly than failing to coalesce. + * - Even if nodes are not coalesced while pruning, we keep the coalesce opportunities around + * to be used as last-chance register hints when coloring. If nothing else, we try to use + * caller-save registers before callee-save registers. + * + * A good reference for graph coloring register allocation is + * "Modern Compiler Implementation in Java" (Andrew W. Appel, 2nd Edition). + */ +class RegisterAllocatorGraphColor : public RegisterAllocator { + public: + RegisterAllocatorGraphColor(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis, + bool iterative_move_coalescing = true); + ~RegisterAllocatorGraphColor() OVERRIDE {} + + void AllocateRegisters() OVERRIDE; + + bool Validate(bool log_fatal_on_failure); + + private: + // Collect all intervals and prepare for register allocation. + void ProcessInstructions(); + void ProcessInstruction(HInstruction* instruction); + + // If any inputs require specific registers, block those registers + // at the position of this instruction. + void CheckForFixedInputs(HInstruction* instruction); + + // If the output of an instruction requires a specific register, split + // the interval and assign the register to the first part. + void CheckForFixedOutput(HInstruction* instruction); + + // Add all applicable safepoints to a live interval. + // Currently depends on instruction processing order. + void AddSafepointsFor(HInstruction* instruction); + + // Collect all live intervals associated with the temporary locations + // needed by an instruction. + void CheckForTempLiveIntervals(HInstruction* instruction); + + // If a safe point is needed, add a synthesized interval to later record + // the number of live registers at this point. + void CheckForSafepoint(HInstruction* instruction); + + // Split an interval, but only if `position` is inside of `interval`. + // Return either the new interval, or the original interval if not split. + static LiveInterval* TrySplit(LiveInterval* interval, size_t position); + + // To ensure every graph can be colored, split live intervals + // at their register defs and uses. This creates short intervals with low + // degree in the interference graph, which are prioritized during graph + // coloring. + void SplitAtRegisterUses(LiveInterval* interval); + + // If the given instruction is a catch phi, give it a spill slot. + void AllocateSpillSlotForCatchPhi(HInstruction* instruction); + + // Ensure that the given register cannot be allocated for a given range. + void BlockRegister(Location location, size_t start, size_t end); + void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); + + bool IsCallerSave(size_t reg, bool processing_core_regs); + + // Return the maximum number of registers live at safepoints, + // based on the outgoing interference edges of safepoint nodes. + size_t ComputeMaxSafepointLiveRegisters(const ArenaVector<InterferenceNode*>& safepoints); + + // Assigns stack slots to a list of intervals, ensuring that interfering intervals are not + // assigned the same stack slot. + void ColorSpillSlots(ArenaVector<LiveInterval*>* nodes, + size_t* num_stack_slots_used); + + // Provide stack slots to nodes that need them. + void AllocateSpillSlots(const ArenaVector<InterferenceNode*>& nodes); + + // Whether iterative move coalescing should be performed. Iterative move coalescing + // improves code quality, but increases compile time. 
+ const bool iterative_move_coalescing_; + + // Live intervals, split by kind (core and floating point). + // These should not contain high intervals, as those are represented by + // the corresponding low interval throughout register allocation. + ArenaVector<LiveInterval*> core_intervals_; + ArenaVector<LiveInterval*> fp_intervals_; + + // Intervals for temporaries, saved for special handling in the resolution phase. + ArenaVector<LiveInterval*> temp_intervals_; + + // Safepoints, saved for special handling while processing instructions. + ArenaVector<HInstruction*> safepoints_; + + // Interference nodes representing specific registers. These are "pre-colored" nodes + // in the interference graph. + ArenaVector<InterferenceNode*> physical_core_nodes_; + ArenaVector<InterferenceNode*> physical_fp_nodes_; + + // Allocated stack slot counters. + size_t num_int_spill_slots_; + size_t num_double_spill_slots_; + size_t num_float_spill_slots_; + size_t num_long_spill_slots_; + size_t catch_phi_spill_slot_counter_; + + // Number of stack slots needed for the pointer to the current method. + // This is 1 for 32-bit architectures, and 2 for 64-bit architectures. + const size_t reserved_art_method_slots_; + + // Number of stack slots needed for outgoing arguments. + const size_t reserved_out_slots_; + + // The number of globally blocked core and floating point registers, such as the stack pointer. + size_t number_of_globally_blocked_core_regs_; + size_t number_of_globally_blocked_fp_regs_; + + // The maximum number of registers live at safe points. Needed by the code generator. + size_t max_safepoint_live_core_regs_; + size_t max_safepoint_live_fp_regs_; + + friend class ColoringIteration; + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorGraphColor); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_GRAPH_COLOR_H_ diff --git a/compiler/optimizing/register_allocator_linear_scan.cc b/compiler/optimizing/register_allocator_linear_scan.cc new file mode 100644 index 0000000000..768ed2d26a --- /dev/null +++ b/compiler/optimizing/register_allocator_linear_scan.cc @@ -0,0 +1,1225 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "register_allocator_linear_scan.h" + +#include <iostream> +#include <sstream> + +#include "base/bit_vector-inl.h" +#include "base/enums.h" +#include "code_generator.h" +#include "register_allocation_resolver.h" +#include "ssa_liveness_analysis.h" + +namespace art { + +static constexpr size_t kMaxLifetimePosition = -1; +static constexpr size_t kDefaultNumberOfSpillSlots = 4; + +// For simplicity, we implement register pairs as (reg, reg + 1). +// Note that this is a requirement for double registers on ARM, since we +// allocate SRegister. 
+static int GetHighForLowRegister(int reg) { return reg + 1; } +static bool IsLowRegister(int reg) { return (reg & 1) == 0; } +static bool IsLowOfUnalignedPairInterval(LiveInterval* low) { + return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister(); +} + +RegisterAllocatorLinearScan::RegisterAllocatorLinearScan(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& liveness) + : RegisterAllocator(allocator, codegen, liveness), + unhandled_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + unhandled_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + unhandled_(nullptr), + handled_(allocator->Adapter(kArenaAllocRegisterAllocator)), + active_(allocator->Adapter(kArenaAllocRegisterAllocator)), + inactive_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_core_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_fp_register_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + int_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + long_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + float_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + double_spill_slots_(allocator->Adapter(kArenaAllocRegisterAllocator)), + catch_phi_spill_slots_(0), + safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), + processing_core_registers_(false), + number_of_registers_(-1), + registers_array_(nullptr), + blocked_core_registers_(codegen->GetBlockedCoreRegisters()), + blocked_fp_registers_(codegen->GetBlockedFloatingPointRegisters()), + reserved_out_slots_(0), + maximum_number_of_live_core_registers_(0), + maximum_number_of_live_fp_registers_(0) { + temp_intervals_.reserve(4); + int_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + long_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + float_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + double_spill_slots_.reserve(kDefaultNumberOfSpillSlots); + + codegen->SetupBlockedRegisters(); + physical_core_register_intervals_.resize(codegen->GetNumberOfCoreRegisters(), nullptr); + physical_fp_register_intervals_.resize(codegen->GetNumberOfFloatingPointRegisters(), nullptr); + // Always reserve for the current method and the graph's max out registers. + // TODO: compute it instead. + // ArtMethod* takes 2 vregs for 64 bits. 
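+ // With 4-byte vregs this amounts to 1 reserved slot on 32-bit targets and 2 on 64-bit targets, plus the graph's maximum number of outgoing vregs (illustrative note based on the computation below).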
+ size_t ptr_size = static_cast<size_t>(InstructionSetPointerSize(codegen->GetInstructionSet())); + reserved_out_slots_ = ptr_size / kVRegSize + codegen->GetGraph()->GetMaximumNumberOfOutVRegs(); +} + +static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) { + if (interval == nullptr) return false; + bool is_core_register = (interval->GetType() != Primitive::kPrimDouble) + && (interval->GetType() != Primitive::kPrimFloat); + return processing_core_registers == is_core_register; +} + +void RegisterAllocatorLinearScan::AllocateRegisters() { + AllocateRegistersInternal(); + RegisterAllocationResolver(allocator_, codegen_, liveness_) + .Resolve(maximum_number_of_live_core_registers_, + maximum_number_of_live_fp_registers_, + reserved_out_slots_, + int_spill_slots_.size(), + long_spill_slots_.size(), + float_spill_slots_.size(), + double_spill_slots_.size(), + catch_phi_spill_slots_, + temp_intervals_); + + if (kIsDebugBuild) { + processing_core_registers_ = true; + ValidateInternal(true); + processing_core_registers_ = false; + ValidateInternal(true); + // Check that the linear order is still correct with regards to lifetime positions. + // Since only parallel moves have been inserted during the register allocation, + // these checks are mostly for making sure these moves have been added correctly. + size_t current_liveness = 0; + for (HLinearOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { + HInstruction* instruction = inst_it.Current(); + DCHECK_LE(current_liveness, instruction->GetLifetimePosition()); + current_liveness = instruction->GetLifetimePosition(); + } + for (HInstructionIterator inst_it(block->GetInstructions()); + !inst_it.Done(); + inst_it.Advance()) { + HInstruction* instruction = inst_it.Current(); + DCHECK_LE(current_liveness, instruction->GetLifetimePosition()) << instruction->DebugName(); + current_liveness = instruction->GetLifetimePosition(); + } + } + } +} + +void RegisterAllocatorLinearScan::BlockRegister(Location location, size_t start, size_t end) { + int reg = location.reg(); + DCHECK(location.IsRegister() || location.IsFpuRegister()); + LiveInterval* interval = location.IsRegister() + ? physical_core_register_intervals_[reg] + : physical_fp_register_intervals_[reg]; + Primitive::Type type = location.IsRegister() + ? 
Primitive::kPrimInt + : Primitive::kPrimFloat; + if (interval == nullptr) { + interval = LiveInterval::MakeFixedInterval(allocator_, reg, type); + if (location.IsRegister()) { + physical_core_register_intervals_[reg] = interval; + } else { + physical_fp_register_intervals_[reg] = interval; + } + } + DCHECK(interval->GetRegister() == reg); + interval->AddRange(start, end); +} + +void RegisterAllocatorLinearScan::BlockRegisters(size_t start, size_t end, bool caller_save_only) { + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) { + BlockRegister(Location::RegisterLocation(i), start, end); + } + } + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { + if (!caller_save_only || !codegen_->IsFloatingPointCalleeSaveRegister(i)) { + BlockRegister(Location::FpuRegisterLocation(i), start, end); + } + } +} + +void RegisterAllocatorLinearScan::AllocateRegistersInternal() { + // Iterate post-order, to ensure the list is sorted, and the last added interval + // is the one with the lowest start position. + for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) { + HBasicBlock* block = it.Current(); + for (HBackwardInstructionIterator back_it(block->GetInstructions()); !back_it.Done(); + back_it.Advance()) { + ProcessInstruction(back_it.Current()); + } + for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) { + ProcessInstruction(inst_it.Current()); + } + + if (block->IsCatchBlock() || + (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + // By blocking all registers at the top of each catch block or irreducible loop, we force + // intervals belonging to the live-in set of the catch/header block to be spilled. + // TODO(ngeoffray): Phis in this block could be allocated in register. + size_t position = block->GetLifetimeStart(); + BlockRegisters(position, position + 1); + } + } + + number_of_registers_ = codegen_->GetNumberOfCoreRegisters(); + registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, + kArenaAllocRegisterAllocator); + processing_core_registers_ = true; + unhandled_ = &unhandled_core_intervals_; + for (LiveInterval* fixed : physical_core_register_intervals_) { + if (fixed != nullptr) { + // Fixed interval is added to inactive_ instead of unhandled_. + // It's also the only type of inactive interval whose start position + // can be after the current interval during linear scan. + // Fixed interval is never split and never moves to unhandled_. + inactive_.push_back(fixed); + } + } + LinearScan(); + + inactive_.clear(); + active_.clear(); + handled_.clear(); + + number_of_registers_ = codegen_->GetNumberOfFloatingPointRegisters(); + registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_, + kArenaAllocRegisterAllocator); + processing_core_registers_ = false; + unhandled_ = &unhandled_fp_intervals_; + for (LiveInterval* fixed : physical_fp_register_intervals_) { + if (fixed != nullptr) { + // Fixed interval is added to inactive_ instead of unhandled_. + // It's also the only type of inactive interval whose start position + // can be after the current interval during linear scan. + // Fixed interval is never split and never moves to unhandled_. 
+ inactive_.push_back(fixed); + } + } + LinearScan(); +} + +void RegisterAllocatorLinearScan::ProcessInstruction(HInstruction* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t position = instruction->GetLifetimePosition(); + + if (locations == nullptr) return; + + // Create synthesized intervals for temporaries. + for (size_t i = 0; i < locations->GetTempCount(); ++i) { + Location temp = locations->GetTemp(i); + if (temp.IsRegister() || temp.IsFpuRegister()) { + BlockRegister(temp, position, position + 1); + // Ensure that an explicit temporary register is marked as being allocated. + codegen_->AddAllocatedRegister(temp); + } else { + DCHECK(temp.IsUnallocated()); + switch (temp.GetPolicy()) { + case Location::kRequiresRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt); + temp_intervals_.push_back(interval); + interval->AddTempUse(instruction, i); + unhandled_core_intervals_.push_back(interval); + break; + } + + case Location::kRequiresFpuRegister: { + LiveInterval* interval = + LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble); + temp_intervals_.push_back(interval); + interval->AddTempUse(instruction, i); + if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) { + interval->AddHighInterval(/* is_temp */ true); + LiveInterval* high = interval->GetHighInterval(); + temp_intervals_.push_back(high); + unhandled_fp_intervals_.push_back(high); + } + unhandled_fp_intervals_.push_back(interval); + break; + } + + default: + LOG(FATAL) << "Unexpected policy for temporary location " + << temp.GetPolicy(); + } + } + } + + bool core_register = (instruction->GetType() != Primitive::kPrimDouble) + && (instruction->GetType() != Primitive::kPrimFloat); + + if (locations->NeedsSafepoint()) { + if (codegen_->IsLeafMethod()) { + // TODO: We do this here because we do not want the suspend check to artificially + // create live registers. We should find another place, but this is currently the + // simplest. + DCHECK(instruction->IsSuspendCheckEntry()); + instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + safepoints_.push_back(instruction); + if (locations->OnlyCallsOnSlowPath()) { + // We add a synthesized range at this position to record the live registers + // at this position. Ideally, we could just update the safepoints when locations + // are updated, but we currently need to know the full stack size before updating + // locations (because of parameters and the fact that we don't have a frame pointer). + // And knowing the full stack size requires to know the maximum number of live + // registers at calls in slow paths. + // By adding the following interval in the algorithm, we can compute this + // maximum before updating locations. 
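+ // The synthesized interval below covers only this safepoint's position; when linear scan reaches it, the number of currently active intervals gives the register pressure at the safepoint.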
+ LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction); + interval->AddRange(position, position + 1); + AddSorted(&unhandled_core_intervals_, interval); + AddSorted(&unhandled_fp_intervals_, interval); + } + } + + if (locations->WillCall()) { + BlockRegisters(position, position + 1, /* caller_save_only */ true); + } + + for (size_t i = 0; i < locations->GetInputCount(); ++i) { + Location input = locations->InAt(i); + if (input.IsRegister() || input.IsFpuRegister()) { + BlockRegister(input, position, position + 1); + } else if (input.IsPair()) { + BlockRegister(input.ToLow(), position, position + 1); + BlockRegister(input.ToHigh(), position, position + 1); + } + } + + LiveInterval* current = instruction->GetLiveInterval(); + if (current == nullptr) return; + + ArenaVector<LiveInterval*>& unhandled = core_register + ? unhandled_core_intervals_ + : unhandled_fp_intervals_; + + DCHECK(unhandled.empty() || current->StartsBeforeOrAt(unhandled.back())); + + if (codegen_->NeedsTwoRegisters(current->GetType())) { + current->AddHighInterval(); + } + + for (size_t safepoint_index = safepoints_.size(); safepoint_index > 0; --safepoint_index) { + HInstruction* safepoint = safepoints_[safepoint_index - 1u]; + size_t safepoint_position = safepoint->GetLifetimePosition(); + + // Test that safepoints are ordered in the optimal way. + DCHECK(safepoint_index == safepoints_.size() || + safepoints_[safepoint_index]->GetLifetimePosition() < safepoint_position); + + if (safepoint_position == current->GetStart()) { + // The safepoint is for this instruction, so the location of the instruction + // does not need to be saved. + DCHECK_EQ(safepoint_index, safepoints_.size()); + DCHECK_EQ(safepoint, instruction); + continue; + } else if (current->IsDeadAt(safepoint_position)) { + break; + } else if (!current->Covers(safepoint_position)) { + // Hole in the interval. + continue; + } + current->AddSafepoint(safepoint); + } + current->ResetSearchCache(); + + // Some instructions define their output in fixed register/stack slot. We need + // to ensure we know these locations before doing register allocation. For a + // given register, we create an interval that covers these locations. The register + // will be unavailable at these locations when trying to allocate one for an + // interval. + // + // The backwards walking ensures the ranges are ordered on increasing start positions. + Location output = locations->Out(); + if (output.IsUnallocated() && output.GetPolicy() == Location::kSameAsFirstInput) { + Location first = locations->InAt(0); + if (first.IsRegister() || first.IsFpuRegister()) { + current->SetFrom(position + 1); + current->SetRegister(first.reg()); + } else if (first.IsPair()) { + current->SetFrom(position + 1); + current->SetRegister(first.low()); + LiveInterval* high = current->GetHighInterval(); + high->SetRegister(first.high()); + high->SetFrom(position + 1); + } + } else if (output.IsRegister() || output.IsFpuRegister()) { + // Shift the interval's start by one to account for the blocked register. 
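+ // For example, an instruction at lifetime position p whose output is fixed to a particular register (say EAX on x86) blocks that register for [p, p+1) and has its own interval start shifted to p+1.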
+ current->SetFrom(position + 1); + current->SetRegister(output.reg()); + BlockRegister(output, position, position + 1); + } else if (output.IsPair()) { + current->SetFrom(position + 1); + current->SetRegister(output.low()); + LiveInterval* high = current->GetHighInterval(); + high->SetRegister(output.high()); + high->SetFrom(position + 1); + BlockRegister(output.ToLow(), position, position + 1); + BlockRegister(output.ToHigh(), position, position + 1); + } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) { + current->SetSpillSlot(output.GetStackIndex()); + } else { + DCHECK(output.IsUnallocated() || output.IsConstant()); + } + + if (instruction->IsPhi() && instruction->AsPhi()->IsCatchPhi()) { + AllocateSpillSlotForCatchPhi(instruction->AsPhi()); + } + + // If needed, add interval to the list of unhandled intervals. + if (current->HasSpillSlot() || instruction->IsConstant()) { + // Split just before first register use. + size_t first_register_use = current->FirstRegisterUse(); + if (first_register_use != kNoLifetime) { + LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); + // Don't add directly to `unhandled`, it needs to be sorted and the start + // of this new interval might be after intervals already in the list. + AddSorted(&unhandled, split); + } else { + // Nothing to do, we won't allocate a register for this value. + } + } else { + // Don't add directly to `unhandled`, temp or safepoint intervals + // for this instruction may have been added, and those can be + // processed first. + AddSorted(&unhandled, current); + } +} + +class AllRangesIterator : public ValueObject { + public: + explicit AllRangesIterator(LiveInterval* interval) + : current_interval_(interval), + current_range_(interval->GetFirstRange()) {} + + bool Done() const { return current_interval_ == nullptr; } + LiveRange* CurrentRange() const { return current_range_; } + LiveInterval* CurrentInterval() const { return current_interval_; } + + void Advance() { + current_range_ = current_range_->GetNext(); + if (current_range_ == nullptr) { + current_interval_ = current_interval_->GetNextSibling(); + if (current_interval_ != nullptr) { + current_range_ = current_interval_->GetFirstRange(); + } + } + } + + private: + LiveInterval* current_interval_; + LiveRange* current_range_; + + DISALLOW_COPY_AND_ASSIGN(AllRangesIterator); +}; + +bool RegisterAllocatorLinearScan::ValidateInternal(bool log_fatal_on_failure) const { + // To simplify unit testing, we eagerly create the array of intervals, and + // call the helper method. + ArenaVector<LiveInterval*> intervals(allocator_->Adapter(kArenaAllocRegisterAllocatorValidate)); + for (size_t i = 0; i < liveness_.GetNumberOfSsaValues(); ++i) { + HInstruction* instruction = liveness_.GetInstructionFromSsaIndex(i); + if (ShouldProcess(processing_core_registers_, instruction->GetLiveInterval())) { + intervals.push_back(instruction->GetLiveInterval()); + } + } + + const ArenaVector<LiveInterval*>* physical_register_intervals = processing_core_registers_ + ? 
&physical_core_register_intervals_ + : &physical_fp_register_intervals_; + for (LiveInterval* fixed : *physical_register_intervals) { + if (fixed != nullptr) { + intervals.push_back(fixed); + } + } + + for (LiveInterval* temp : temp_intervals_) { + if (ShouldProcess(processing_core_registers_, temp)) { + intervals.push_back(temp); + } + } + + return ValidateIntervals(intervals, GetNumberOfSpillSlots(), reserved_out_slots_, *codegen_, + allocator_, processing_core_registers_, log_fatal_on_failure); +} + +void RegisterAllocatorLinearScan::DumpInterval(std::ostream& stream, LiveInterval* interval) const { + interval->Dump(stream); + stream << ": "; + if (interval->HasRegister()) { + if (interval->IsFloatingPoint()) { + codegen_->DumpFloatingPointRegister(stream, interval->GetRegister()); + } else { + codegen_->DumpCoreRegister(stream, interval->GetRegister()); + } + } else { + stream << "spilled"; + } + stream << std::endl; +} + +void RegisterAllocatorLinearScan::DumpAllIntervals(std::ostream& stream) const { + stream << "inactive: " << std::endl; + for (LiveInterval* inactive_interval : inactive_) { + DumpInterval(stream, inactive_interval); + } + stream << "active: " << std::endl; + for (LiveInterval* active_interval : active_) { + DumpInterval(stream, active_interval); + } + stream << "unhandled: " << std::endl; + auto unhandled = (unhandled_ != nullptr) ? + unhandled_ : &unhandled_core_intervals_; + for (LiveInterval* unhandled_interval : *unhandled) { + DumpInterval(stream, unhandled_interval); + } + stream << "handled: " << std::endl; + for (LiveInterval* handled_interval : handled_) { + DumpInterval(stream, handled_interval); + } +} + +// By the book implementation of a linear scan register allocator. +void RegisterAllocatorLinearScan::LinearScan() { + while (!unhandled_->empty()) { + // (1) Remove interval with the lowest start position from unhandled. + LiveInterval* current = unhandled_->back(); + unhandled_->pop_back(); + + // Make sure the interval is an expected state. + DCHECK(!current->IsFixed() && !current->HasSpillSlot()); + // Make sure we are going in the right order. + DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() >= current->GetStart()); + // Make sure a low interval is always with a high. + DCHECK(!current->IsLowInterval() || unhandled_->back()->IsHighInterval()); + // Make sure a high interval is always with a low. + DCHECK(current->IsLowInterval() || + unhandled_->empty() || + !unhandled_->back()->IsHighInterval()); + + size_t position = current->GetStart(); + + // Remember the inactive_ size here since the ones moved to inactive_ from + // active_ below shouldn't need to be re-checked. + size_t inactive_intervals_to_handle = inactive_.size(); + + // (2) Remove currently active intervals that are dead at this position. + // Move active intervals that have a lifetime hole at this position + // to inactive. + auto active_kept_end = std::remove_if( + active_.begin(), + active_.end(), + [this, position](LiveInterval* interval) { + if (interval->IsDeadAt(position)) { + handled_.push_back(interval); + return true; + } else if (!interval->Covers(position)) { + inactive_.push_back(interval); + return true; + } else { + return false; // Keep this interval. + } + }); + active_.erase(active_kept_end, active_.end()); + + // (3) Remove currently inactive intervals that are dead at this position. + // Move inactive intervals that cover this position to active. 
+ auto inactive_to_handle_end = inactive_.begin() + inactive_intervals_to_handle; + auto inactive_kept_end = std::remove_if( + inactive_.begin(), + inactive_to_handle_end, + [this, position](LiveInterval* interval) { + DCHECK(interval->GetStart() < position || interval->IsFixed()); + if (interval->IsDeadAt(position)) { + handled_.push_back(interval); + return true; + } else if (interval->Covers(position)) { + active_.push_back(interval); + return true; + } else { + return false; // Keep this interval. + } + }); + inactive_.erase(inactive_kept_end, inactive_to_handle_end); + + if (current->IsSlowPathSafepoint()) { + // Synthesized interval to record the maximum number of live registers + // at safepoints. No need to allocate a register for it. + if (processing_core_registers_) { + maximum_number_of_live_core_registers_ = + std::max(maximum_number_of_live_core_registers_, active_.size()); + } else { + maximum_number_of_live_fp_registers_ = + std::max(maximum_number_of_live_fp_registers_, active_.size()); + } + DCHECK(unhandled_->empty() || unhandled_->back()->GetStart() > current->GetStart()); + continue; + } + + if (current->IsHighInterval() && !current->GetLowInterval()->HasRegister()) { + DCHECK(!current->HasRegister()); + // Allocating the low part was unsuccessful. The split interval for the high part + // will be handled next (it is in the `unhandled_` list). + continue; + } + + // (4) Try to find an available register. + bool success = TryAllocateFreeReg(current); + + // (5) If no register could be found, we need to spill. + if (!success) { + success = AllocateBlockedReg(current); + } + + // (6) If the interval had a register allocated, add it to the list of active + // intervals. + if (success) { + codegen_->AddAllocatedRegister(processing_core_registers_ + ? Location::RegisterLocation(current->GetRegister()) + : Location::FpuRegisterLocation(current->GetRegister())); + active_.push_back(current); + if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) { + current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister())); + } + } + } +} + +static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) { + DCHECK(!interval->IsHighInterval()); + // Note that the same instruction may occur multiple times in the input list, + // so `free_until` may have changed already. + // Since `position` is not the current scan position, we need to use CoversSlow. + if (interval->IsDeadAt(position)) { + // Set the register to be free. Note that inactive intervals might later + // update this. + free_until[interval->GetRegister()] = kMaxLifetimePosition; + if (interval->HasHighInterval()) { + DCHECK(interval->GetHighInterval()->IsDeadAt(position)); + free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition; + } + } else if (!interval->CoversSlow(position)) { + // The interval becomes inactive at `defined_by`. We make its register + // available only until the next use strictly after `defined_by`. + free_until[interval->GetRegister()] = interval->FirstUseAfter(position); + if (interval->HasHighInterval()) { + DCHECK(!interval->GetHighInterval()->CoversSlow(position)); + free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()]; + } + } +} + +// Find a free register. If multiple are found, pick the register that +// is free the longest.
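+// Here, "free the longest" means the register whose `free_until` position is furthest away; kMaxLifetimePosition means the register stays free for the remainder of the scan.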
+bool RegisterAllocatorLinearScan::TryAllocateFreeReg(LiveInterval* current) { + size_t* free_until = registers_array_; + + // First set all registers to be free. + for (size_t i = 0; i < number_of_registers_; ++i) { + free_until[i] = kMaxLifetimePosition; + } + + // For each active interval, set its register to not free. + for (LiveInterval* interval : active_) { + DCHECK(interval->HasRegister()); + free_until[interval->GetRegister()] = 0; + } + + // An interval that starts an instruction (that is, it is not split) may + // re-use the registers used by the inputs of that instruction, based on the + // location summary. + HInstruction* defined_by = current->GetDefinedBy(); + if (defined_by != nullptr && !current->IsSplit()) { + LocationSummary* locations = defined_by->GetLocations(); + if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) { + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + // Take the last interval of the input. It is the location of that interval + // that will be used at `defined_by`. + LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); + // Note that interval may not have been processed yet. + // TODO: Handle non-split intervals last in the work list. + if (locations->InAt(i).IsValid() + && interval->HasRegister() + && interval->SameRegisterKind(*current)) { + // The input must be live until the end of `defined_by`, to comply with + // the linear scan algorithm. So we use `defined_by`'s end lifetime + // position to check whether the input is dead or is inactive after + // `defined_by`. + DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); + size_t position = defined_by->GetLifetimePosition() + 1; + FreeIfNotCoverAt(interval, position, free_until); + } + } + } + } + + // For each inactive interval, set its register to be free until + // the next intersection with `current`. + for (LiveInterval* inactive : inactive_) { + // Temp/Slow-path-safepoint interval has no holes. + DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + continue; + } + + DCHECK(inactive->HasRegister()); + if (free_until[inactive->GetRegister()] == 0) { + // Already used by some active interval. No need to intersect. + continue; + } + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + free_until[inactive->GetRegister()] = + std::min(free_until[inactive->GetRegister()], next_intersection); + } + } + + int reg = kNoRegister; + if (current->HasRegister()) { + // Some instructions have a fixed register output. + reg = current->GetRegister(); + if (free_until[reg] == 0) { + DCHECK(current->IsHighInterval()); + // AllocateBlockedReg will spill the holder of the register. + return false; + } + } else { + DCHECK(!current->IsHighInterval()); + int hint = current->FindFirstRegisterHint(free_until, liveness_); + if ((hint != kNoRegister) + // For simplicity, if the hint we are getting for a pair cannot be used, + // we are just going to allocate a new pair.
+ && !(current->IsLowInterval() && IsBlocked(GetHighForLowRegister(hint)))) { + DCHECK(!IsBlocked(hint)); + reg = hint; + } else if (current->IsLowInterval()) { + reg = FindAvailableRegisterPair(free_until, current->GetStart()); + } else { + reg = FindAvailableRegister(free_until, current); + } + } + + DCHECK_NE(reg, kNoRegister); + // If we could not find a register, we need to spill. + if (free_until[reg] == 0) { + return false; + } + + if (current->IsLowInterval()) { + // If the high register of this interval is not available, we need to spill. + int high_reg = current->GetHighInterval()->GetRegister(); + if (high_reg == kNoRegister) { + high_reg = GetHighForLowRegister(reg); + } + if (free_until[high_reg] == 0) { + return false; + } + } + + current->SetRegister(reg); + if (!current->IsDeadAt(free_until[reg])) { + // If the register is only available for a subset of live ranges + // covered by `current`, split `current` before the position where + // the register is not available anymore. + LiveInterval* split = SplitBetween(current, current->GetStart(), free_until[reg]); + DCHECK(split != nullptr); + AddSorted(unhandled_, split); + } + return true; +} + +bool RegisterAllocatorLinearScan::IsBlocked(int reg) const { + return processing_core_registers_ + ? blocked_core_registers_[reg] + : blocked_fp_registers_[reg]; +} + +int RegisterAllocatorLinearScan::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const { + int reg = kNoRegister; + // Pick the register pair that is used the last. + for (size_t i = 0; i < number_of_registers_; ++i) { + if (IsBlocked(i)) continue; + if (!IsLowRegister(i)) continue; + int high_register = GetHighForLowRegister(i); + if (IsBlocked(high_register)) continue; + int existing_high_register = GetHighForLowRegister(reg); + if ((reg == kNoRegister) || (next_use[i] >= next_use[reg] + && next_use[high_register] >= next_use[existing_high_register])) { + reg = i; + if (next_use[i] == kMaxLifetimePosition + && next_use[high_register] == kMaxLifetimePosition) { + break; + } + } else if (next_use[reg] <= starting_at || next_use[existing_high_register] <= starting_at) { + // If one of the current register is known to be unavailable, just unconditionally + // try a new one. + reg = i; + } + } + return reg; +} + +bool RegisterAllocatorLinearScan::IsCallerSaveRegister(int reg) const { + return processing_core_registers_ + ? !codegen_->IsCoreCalleeSaveRegister(reg) + : !codegen_->IsFloatingPointCalleeSaveRegister(reg); +} + +int RegisterAllocatorLinearScan::FindAvailableRegister(size_t* next_use, LiveInterval* current) const { + // We special case intervals that do not span a safepoint to try to find a caller-save + // register if one is available. We iterate from 0 to the number of registers, + // so if there are caller-save registers available at the end, we continue the iteration. + bool prefers_caller_save = !current->HasWillCallSafepoint(); + int reg = kNoRegister; + for (size_t i = 0; i < number_of_registers_; ++i) { + if (IsBlocked(i)) { + // Register cannot be used. Continue. + continue; + } + + // Best case: we found a register fully available. + if (next_use[i] == kMaxLifetimePosition) { + if (prefers_caller_save && !IsCallerSaveRegister(i)) { + // We can get shorter encodings on some platforms by using + // small register numbers. So only update the candidate if the previous + // one was not available for the whole method. 
+ if (reg == kNoRegister || next_use[reg] != kMaxLifetimePosition) { + reg = i; + } + // Continue the iteration in the hope of finding a caller save register. + continue; + } else { + reg = i; + // We know the register is good enough. Return it. + break; + } + } + + // If we had no register before, take this one as a reference. + if (reg == kNoRegister) { + reg = i; + continue; + } + + // Pick the register that is used the last. + if (next_use[i] > next_use[reg]) { + reg = i; + continue; + } + } + return reg; +} + +// Remove interval and its other half if any. Return iterator to the following element. +static ArenaVector<LiveInterval*>::iterator RemoveIntervalAndPotentialOtherHalf( + ArenaVector<LiveInterval*>* intervals, ArenaVector<LiveInterval*>::iterator pos) { + DCHECK(intervals->begin() <= pos && pos < intervals->end()); + LiveInterval* interval = *pos; + if (interval->IsLowInterval()) { + DCHECK(pos + 1 < intervals->end()); + DCHECK_EQ(*(pos + 1), interval->GetHighInterval()); + return intervals->erase(pos, pos + 2); + } else if (interval->IsHighInterval()) { + DCHECK(intervals->begin() < pos); + DCHECK_EQ(*(pos - 1), interval->GetLowInterval()); + return intervals->erase(pos - 1, pos + 1); + } else { + return intervals->erase(pos); + } +} + +bool RegisterAllocatorLinearScan::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, + size_t first_register_use, + size_t* next_use) { + for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { + LiveInterval* active = *it; + DCHECK(active->HasRegister()); + if (active->IsFixed()) continue; + if (active->IsHighInterval()) continue; + if (first_register_use > next_use[active->GetRegister()]) continue; + + // Split the first interval found that is either: + // 1) A non-pair interval. + // 2) A pair interval whose high is not low + 1. + // 3) A pair interval whose low is not even. + if (!active->IsLowInterval() || + IsLowOfUnalignedPairInterval(active) || + !IsLowRegister(active->GetRegister())) { + LiveInterval* split = Split(active, position); + if (split != active) { + handled_.push_back(active); + } + RemoveIntervalAndPotentialOtherHalf(&active_, it); + AddSorted(unhandled_, split); + return true; + } + } + return false; +} + +// Find the register that is used the last, and spill the interval +// that holds it. If the first use of `current` is after that register +// we spill `current` instead. +bool RegisterAllocatorLinearScan::AllocateBlockedReg(LiveInterval* current) { + size_t first_register_use = current->FirstRegisterUse(); + if (current->HasRegister()) { + DCHECK(current->IsHighInterval()); + // The low interval has allocated the register for the high interval. In + // case the low interval had to split both intervals, we may end up in a + // situation where the high interval does not have a register use anymore. + // We must still proceed in order to split currently active and inactive + // uses of the high interval's register, and put the high interval in the + // active set. + DCHECK(first_register_use != kNoLifetime || (current->GetNextSibling() != nullptr)); + } else if (first_register_use == kNoLifetime) { + AllocateSpillSlotFor(current); + return false; + } + + // First set all registers as not being used. + size_t* next_use = registers_array_; + for (size_t i = 0; i < number_of_registers_; ++i) { + next_use[i] = kMaxLifetimePosition; + } + + // For each active interval, find the next use of its register after the + // start of current. 
+ for (LiveInterval* active : active_) { + DCHECK(active->HasRegister()); + if (active->IsFixed()) { + next_use[active->GetRegister()] = current->GetStart(); + } else { + size_t use = active->FirstRegisterUseAfter(current->GetStart()); + if (use != kNoLifetime) { + next_use[active->GetRegister()] = use; + } + } + } + + // For each inactive interval, find the next use of its register after the + // start of current. + for (LiveInterval* inactive : inactive_) { + // Temp/Slow-path-safepoint interval has no holes. + DCHECK(!inactive->IsTemp() && !inactive->IsSlowPathSafepoint()); + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + continue; + } + DCHECK(inactive->HasRegister()); + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + if (inactive->IsFixed()) { + next_use[inactive->GetRegister()] = + std::min(next_intersection, next_use[inactive->GetRegister()]); + } else { + size_t use = inactive->FirstUseAfter(current->GetStart()); + if (use != kNoLifetime) { + next_use[inactive->GetRegister()] = std::min(use, next_use[inactive->GetRegister()]); + } + } + } + } + + int reg = kNoRegister; + bool should_spill = false; + if (current->HasRegister()) { + DCHECK(current->IsHighInterval()); + reg = current->GetRegister(); + // When allocating the low part, we made sure the high register was available. + DCHECK_LT(first_register_use, next_use[reg]); + } else if (current->IsLowInterval()) { + reg = FindAvailableRegisterPair(next_use, first_register_use); + // We should spill if both registers are not available. + should_spill = (first_register_use >= next_use[reg]) + || (first_register_use >= next_use[GetHighForLowRegister(reg)]); + } else { + DCHECK(!current->IsHighInterval()); + reg = FindAvailableRegister(next_use, current); + should_spill = (first_register_use >= next_use[reg]); + } + + DCHECK_NE(reg, kNoRegister); + if (should_spill) { + DCHECK(!current->IsHighInterval()); + bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1)); + if (is_allocation_at_use_site) { + if (!current->IsLowInterval()) { + DumpInterval(std::cerr, current); + DumpAllIntervals(std::cerr); + // This situation has the potential to infinite loop, so we make it a non-debug CHECK. + HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2); + CHECK(false) << "There is not enough registers available for " + << current->GetParent()->GetDefinedBy()->DebugName() << " " + << current->GetParent()->GetDefinedBy()->GetId() + << " at " << first_register_use - 1 << " " + << (at == nullptr ? "" : at->DebugName()); + } + + // If we're allocating a register for `current` because the instruction at + // that position requires it, but we think we should spill, then there are + // non-pair intervals or unaligned pair intervals blocking the allocation. + // We split the first interval found, and put ourselves first in the + // `unhandled_` list. 
+ bool success = TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(), + first_register_use, + next_use); + DCHECK(success); + LiveInterval* existing = unhandled_->back(); + DCHECK(existing->IsHighInterval()); + DCHECK_EQ(existing->GetLowInterval(), current); + unhandled_->push_back(current); + } else { + // If the first use of that instruction is after the last use of the found + // register, we split this interval just before its first register use. + AllocateSpillSlotFor(current); + LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1); + DCHECK(current != split); + AddSorted(unhandled_, split); + } + return false; + } else { + // Use this register and spill the active and inactive intervals that + // have that register. + current->SetRegister(reg); + + for (auto it = active_.begin(), end = active_.end(); it != end; ++it) { + LiveInterval* active = *it; + if (active->GetRegister() == reg) { + DCHECK(!active->IsFixed()); + LiveInterval* split = Split(active, current->GetStart()); + if (split != active) { + handled_.push_back(active); + } + RemoveIntervalAndPotentialOtherHalf(&active_, it); + AddSorted(unhandled_, split); + break; + } + } + + // NOTE: Retrieve end() on each iteration because we're removing elements in the loop body. + for (auto it = inactive_.begin(); it != inactive_.end(); ) { + LiveInterval* inactive = *it; + bool erased = false; + if (inactive->GetRegister() == reg) { + if (!current->IsSplit() && !inactive->IsFixed()) { + // Neither current nor inactive are fixed. + // Thanks to SSA, a non-split interval starting in a hole of an + // inactive interval should never intersect with that inactive interval. + // Only if it's not fixed though, because fixed intervals don't come from SSA. + DCHECK_EQ(inactive->FirstIntersectionWith(current), kNoLifetime); + } else { + size_t next_intersection = inactive->FirstIntersectionWith(current); + if (next_intersection != kNoLifetime) { + if (inactive->IsFixed()) { + LiveInterval* split = Split(current, next_intersection); + DCHECK_NE(split, current); + AddSorted(unhandled_, split); + } else { + // Split at the start of `current`, which will lead to splitting + // at the end of the lifetime hole of `inactive`. + LiveInterval* split = Split(inactive, current->GetStart()); + // If it's inactive, it must start before the current interval. + DCHECK_NE(split, inactive); + it = RemoveIntervalAndPotentialOtherHalf(&inactive_, it); + erased = true; + handled_.push_back(inactive); + AddSorted(unhandled_, split); + } + } + } + } + // If we have erased the element, `it` already points to the next element. + // Otherwise we need to move to the next element. + if (!erased) { + ++it; + } + } + + return true; + } +} + +void RegisterAllocatorLinearScan::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) { + DCHECK(!interval->IsFixed() && !interval->HasSpillSlot()); + size_t insert_at = 0; + for (size_t i = array->size(); i > 0; --i) { + LiveInterval* current = (*array)[i - 1u]; + // High intervals must be processed right after their low equivalent. + if (current->StartsAfter(interval) && !current->IsHighInterval()) { + insert_at = i; + break; + } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) { + // Ensure the slow path interval is the last to be processed at its location: we want the + // interval to know all live registers at this location.
+ DCHECK(i == 1 || (*array)[i - 2u]->StartsAfter(current)); + insert_at = i; + break; + } + } + + // Insert the high interval before the low, to ensure the low is processed before. + auto insert_pos = array->begin() + insert_at; + if (interval->HasHighInterval()) { + array->insert(insert_pos, { interval->GetHighInterval(), interval }); + } else if (interval->HasLowInterval()) { + array->insert(insert_pos, { interval, interval->GetLowInterval() }); + } else { + array->insert(insert_pos, interval); + } +} + +void RegisterAllocatorLinearScan::AllocateSpillSlotFor(LiveInterval* interval) { + if (interval->IsHighInterval()) { + // The low interval already took care of allocating the spill slot. + DCHECK(!interval->GetLowInterval()->HasRegister()); + DCHECK(interval->GetLowInterval()->GetParent()->HasSpillSlot()); + return; + } + + LiveInterval* parent = interval->GetParent(); + + // An instruction gets a spill slot for its entire lifetime. If the parent + // of this interval already has a spill slot, there is nothing to do. + if (parent->HasSpillSlot()) { + return; + } + + HInstruction* defined_by = parent->GetDefinedBy(); + DCHECK(!defined_by->IsPhi() || !defined_by->AsPhi()->IsCatchPhi()); + + if (defined_by->IsParameterValue()) { + // Parameters have their own stack slot. + parent->SetSpillSlot(codegen_->GetStackSlotOfParameter(defined_by->AsParameterValue())); + return; + } + + if (defined_by->IsCurrentMethod()) { + parent->SetSpillSlot(0); + return; + } + + if (defined_by->IsConstant()) { + // Constants don't need a spill slot. + return; + } + + ArenaVector<size_t>* spill_slots = nullptr; + switch (interval->GetType()) { + case Primitive::kPrimDouble: + spill_slots = &double_spill_slots_; + break; + case Primitive::kPrimLong: + spill_slots = &long_spill_slots_; + break; + case Primitive::kPrimFloat: + spill_slots = &float_spill_slots_; + break; + case Primitive::kPrimNot: + case Primitive::kPrimInt: + case Primitive::kPrimChar: + case Primitive::kPrimByte: + case Primitive::kPrimBoolean: + case Primitive::kPrimShort: + spill_slots = &int_spill_slots_; + break; + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected type for interval " << interval->GetType(); + } + + // Find an available spill slot. + size_t slot = 0; + for (size_t e = spill_slots->size(); slot < e; ++slot) { + if ((*spill_slots)[slot] <= parent->GetStart()) { + if (!parent->NeedsTwoSpillSlots()) { + // One spill slot is sufficient. + break; + } + if (slot == e - 1 || (*spill_slots)[slot + 1] <= parent->GetStart()) { + // Two spill slots are available. + break; + } + } + } + + size_t end = interval->GetLastSibling()->GetEnd(); + if (parent->NeedsTwoSpillSlots()) { + if (slot + 2u > spill_slots->size()) { + // We need a new spill slot. + spill_slots->resize(slot + 2u, end); + } + (*spill_slots)[slot] = end; + (*spill_slots)[slot + 1] = end; + } else { + if (slot == spill_slots->size()) { + // We need a new spill slot. + spill_slots->push_back(end); + } else { + (*spill_slots)[slot] = end; + } + } + + // Note that the exact spill slot location will be computed when we resolve, + // that is when we know the number of spill slots for each type. 
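+ // For example, an int interval and a double interval may both be given slot index 0 here, since each type's slots are laid out in a separate region of the frame during resolution.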
+ parent->SetSpillSlot(slot); +} + +void RegisterAllocatorLinearScan::AllocateSpillSlotForCatchPhi(HPhi* phi) { + LiveInterval* interval = phi->GetLiveInterval(); + + HInstruction* previous_phi = phi->GetPrevious(); + DCHECK(previous_phi == nullptr || + previous_phi->AsPhi()->GetRegNumber() <= phi->GetRegNumber()) + << "Phis expected to be sorted by vreg number, so that equivalent phis are adjacent."; + + if (phi->IsVRegEquivalentOf(previous_phi)) { + // This is an equivalent of the previous phi. We need to assign the same + // catch phi slot. + DCHECK(previous_phi->GetLiveInterval()->HasSpillSlot()); + interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); + } else { + // Allocate a new spill slot for this catch phi. + // TODO: Reuse spill slots when intervals of phis from different catch + // blocks do not overlap. + interval->SetSpillSlot(catch_phi_spill_slots_); + catch_phi_spill_slots_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + } +} + +} // namespace art diff --git a/compiler/optimizing/register_allocator_linear_scan.h b/compiler/optimizing/register_allocator_linear_scan.h new file mode 100644 index 0000000000..1a643a0d1a --- /dev/null +++ b/compiler/optimizing/register_allocator_linear_scan.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ +#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ + +#include "arch/instruction_set.h" +#include "base/arena_containers.h" +#include "base/macros.h" +#include "primitive.h" +#include "register_allocator.h" + +namespace art { + +class CodeGenerator; +class HBasicBlock; +class HGraph; +class HInstruction; +class HParallelMove; +class HPhi; +class LiveInterval; +class Location; +class SsaLivenessAnalysis; + +/** + * An implementation of a linear scan register allocator on an `HGraph` with SSA form. + */ +class RegisterAllocatorLinearScan : public RegisterAllocator { + public: + RegisterAllocatorLinearScan(ArenaAllocator* allocator, + CodeGenerator* codegen, + const SsaLivenessAnalysis& analysis); + ~RegisterAllocatorLinearScan() OVERRIDE {} + + void AllocateRegisters() OVERRIDE; + + bool Validate(bool log_fatal_on_failure) OVERRIDE { + processing_core_registers_ = true; + if (!ValidateInternal(log_fatal_on_failure)) { + return false; + } + processing_core_registers_ = false; + return ValidateInternal(log_fatal_on_failure); + } + + size_t GetNumberOfSpillSlots() const { + return int_spill_slots_.size() + + long_spill_slots_.size() + + float_spill_slots_.size() + + double_spill_slots_.size() + + catch_phi_spill_slots_; + } + + private: + // Main methods of the allocator. + void LinearScan(); + bool TryAllocateFreeReg(LiveInterval* interval); + bool AllocateBlockedReg(LiveInterval* interval); + + // Add `interval` in the given sorted list. 
+ static void AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval); + + // Returns whether `reg` is blocked by the code generator. + bool IsBlocked(int reg) const; + + // Update the interval for the register in `location` to cover [start, end). + void BlockRegister(Location location, size_t start, size_t end); + void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); + + // Allocate a spill slot for the given interval. Should be called in linear + // order of interval starting positions. + void AllocateSpillSlotFor(LiveInterval* interval); + + // Allocate a spill slot for the given catch phi. Will allocate the same slot + // for phis which share the same vreg. Must be called in reverse linear order + // of lifetime positions and ascending vreg numbers for correctness. + void AllocateSpillSlotForCatchPhi(HPhi* phi); + + // Helper methods. + void AllocateRegistersInternal(); + void ProcessInstruction(HInstruction* instruction); + bool ValidateInternal(bool log_fatal_on_failure) const; + void DumpInterval(std::ostream& stream, LiveInterval* interval) const; + void DumpAllIntervals(std::ostream& stream) const; + int FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const; + int FindAvailableRegister(size_t* next_use, LiveInterval* current) const; + bool IsCallerSaveRegister(int reg) const; + + // Try splitting an active non-pair or unaligned pair interval at the given `position`. + // Returns whether it was successful at finding such an interval. + bool TrySplitNonPairOrUnalignedPairIntervalAt(size_t position, + size_t first_register_use, + size_t* next_use); + + // List of intervals for core registers that must be processed, ordered by start + // position. Last entry is the interval that has the lowest start position. + // This list is initially populated before doing the linear scan. + ArenaVector<LiveInterval*> unhandled_core_intervals_; + + // List of intervals for floating-point registers. Same comments as above. + ArenaVector<LiveInterval*> unhandled_fp_intervals_; + + // Currently processed list of unhandled intervals. Either `unhandled_core_intervals_` + // or `unhandled_fp_intervals_`. + ArenaVector<LiveInterval*>* unhandled_; + + // List of intervals that have been processed. + ArenaVector<LiveInterval*> handled_; + + // List of intervals that are currently active when processing a new live interval. + // That is, they have a live range that spans the start of the new interval. + ArenaVector<LiveInterval*> active_; + + // List of intervals that are currently inactive when processing a new live interval. + // That is, they have a lifetime hole that spans the start of the new interval. + ArenaVector<LiveInterval*> inactive_; + + // Fixed intervals for physical registers. Such intervals cover the positions + // where an instruction requires a specific register. + ArenaVector<LiveInterval*> physical_core_register_intervals_; + ArenaVector<LiveInterval*> physical_fp_register_intervals_; + + // Intervals for temporaries. Such intervals cover the positions + // where an instruction requires a temporary. + ArenaVector<LiveInterval*> temp_intervals_; + + // The spill slots allocated for live intervals. We ensure spill slots + // are typed to avoid (1) doing moves and swaps between two different kinds + // of registers, and (2) swapping between a single stack slot and a double + // stack slot. This simplifies the parallel move resolver. 
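+ // Each entry records the lifetime end position of the last interval assigned to that slot, so the slot can be reused by an interval that starts at or after that position.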
+ ArenaVector<size_t> int_spill_slots_; + ArenaVector<size_t> long_spill_slots_; + ArenaVector<size_t> float_spill_slots_; + ArenaVector<size_t> double_spill_slots_; + + // Spill slots allocated to catch phis. This category is special-cased because + // (1) slots are allocated prior to linear scan and in reverse linear order, + // (2) equivalent phis need to share slots despite having different types. + size_t catch_phi_spill_slots_; + + // Instructions that need a safepoint. + ArenaVector<HInstruction*> safepoints_; + + // True if processing core registers. False if processing floating + // point registers. + bool processing_core_registers_; + + // Number of registers for the current register kind (core or floating point). + size_t number_of_registers_; + + // Temporary array, allocated ahead of time for simplicity. + size_t* registers_array_; + + // Blocked registers, as decided by the code generator. + bool* const blocked_core_registers_; + bool* const blocked_fp_registers_; + + // Slots reserved for out arguments. + size_t reserved_out_slots_; + + // The maximum live core registers at safepoints. + size_t maximum_number_of_live_core_registers_; + + // The maximum live FP registers at safepoints. + size_t maximum_number_of_live_fp_registers_; + + ART_FRIEND_TEST(RegisterAllocatorTest, FreeUntil); + ART_FRIEND_TEST(RegisterAllocatorTest, SpillInactive); + + DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorLinearScan); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_LINEAR_SCAN_H_ diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc index a9de7c3e59..55ea99e592 100644 --- a/compiler/optimizing/register_allocator_test.cc +++ b/compiler/optimizing/register_allocator_test.cc @@ -25,17 +25,35 @@ #include "nodes.h" #include "optimizing_unit_test.h" #include "register_allocator.h" +#include "register_allocator_linear_scan.h" #include "ssa_liveness_analysis.h" #include "ssa_phi_elimination.h" namespace art { +using Strategy = RegisterAllocator::Strategy; + // Note: the register allocator tests rely on the fact that constants have live // intervals and registers get allocated to them. -class RegisterAllocatorTest : public CommonCompilerTest {}; +class RegisterAllocatorTest : public CommonCompilerTest { + protected: + // These functions need to access private variables of LocationSummary, so we declare it + // as a member of RegisterAllocatorTest, which we make a friend class. + static void SameAsFirstInputHint(Strategy strategy); + static void ExpectedInRegisterHint(Strategy strategy); +}; + +// This macro should include all register allocation strategies that should be tested. 
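+// For example, TEST_ALL_STRATEGIES(CFG1) defines the CFG1_LinearScan and CFG1_GraphColor test cases, each running CFG1() with the corresponding strategy.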
+#define TEST_ALL_STRATEGIES(test_name)\ +TEST_F(RegisterAllocatorTest, test_name##_LinearScan) {\ + test_name(Strategy::kRegisterAllocatorLinearScan);\ +}\ +TEST_F(RegisterAllocatorTest, test_name##_GraphColor) {\ + test_name(Strategy::kRegisterAllocatorGraphColor);\ +} -static bool Check(const uint16_t* data) { +static bool Check(const uint16_t* data, Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HGraph* graph = CreateCFG(&allocator, data); @@ -44,9 +62,10 @@ static bool Check(const uint16_t* data) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - return register_allocator.Validate(false); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + return register_allocator->Validate(false); } /** @@ -142,7 +161,7 @@ TEST_F(RegisterAllocatorTest, ValidateIntervals) { } } -TEST_F(RegisterAllocatorTest, CFG1) { +static void CFG1(Strategy strategy) { /* * Test the following snippet: * return 0; @@ -159,10 +178,12 @@ TEST_F(RegisterAllocatorTest, CFG1) { Instruction::CONST_4 | 0 | 0, Instruction::RETURN); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop1) { +TEST_ALL_STRATEGIES(CFG1); + +static void Loop1(Strategy strategy) { /* * Test the following snippet: * int a = 0; @@ -198,10 +219,12 @@ TEST_F(RegisterAllocatorTest, Loop1) { Instruction::CONST_4 | 5 << 12 | 1 << 8, Instruction::RETURN | 1 << 8); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop2) { +TEST_ALL_STRATEGIES(Loop1); + +static void Loop2(Strategy strategy) { /* * Test the following snippet: * int a = 0; @@ -247,10 +270,12 @@ TEST_F(RegisterAllocatorTest, Loop2) { Instruction::ADD_INT, 1 << 8 | 0, Instruction::RETURN | 1 << 8); - ASSERT_TRUE(Check(data)); + ASSERT_TRUE(Check(data, strategy)); } -TEST_F(RegisterAllocatorTest, Loop3) { +TEST_ALL_STRATEGIES(Loop2); + +static void Loop3(Strategy strategy) { /* * Test the following snippet: * int a = 0 @@ -295,9 +320,10 @@ TEST_F(RegisterAllocatorTest, Loop3) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - ASSERT_TRUE(register_allocator.Validate(false)); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + ASSERT_TRUE(register_allocator->Validate(false)); HBasicBlock* loop_header = graph->GetBlocks()[2]; HPhi* phi = loop_header->GetFirstPhi()->AsPhi(); @@ -313,6 +339,8 @@ TEST_F(RegisterAllocatorTest, Loop3) { ASSERT_EQ(phi_interval->GetRegister(), ret->InputAt(0)->GetLiveInterval()->GetRegister()); } +TEST_ALL_STRATEGIES(Loop3); + TEST_F(RegisterAllocatorTest, FirstRegisterUse) { const uint16_t data[] = THREE_REGISTERS_CODE_ITEM( Instruction::CONST_4 | 0 | 0, @@ -353,7 +381,7 @@ TEST_F(RegisterAllocatorTest, FirstRegisterUse) { ASSERT_EQ(new_interval->FirstRegisterUse(), last_xor->GetLifetimePosition()); } -TEST_F(RegisterAllocatorTest, DeadPhi) { +static void DeadPhi(Strategy strategy) { /* Test for a dead loop phi taking as 
back-edge input a phi that also has * this loop phi as input. Walking backwards in SsaDeadPhiElimination * does not solve the problem because the loop phi will be visited last. @@ -384,15 +412,19 @@ TEST_F(RegisterAllocatorTest, DeadPhi) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); - ASSERT_TRUE(register_allocator.Validate(false)); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); + ASSERT_TRUE(register_allocator->Validate(false)); } +TEST_ALL_STRATEGIES(DeadPhi); + /** * Test that the TryAllocateFreeReg method works in the presence of inactive intervals * that share the same register. It should split the interval it is currently * allocating for at the minimum lifetime position between the two inactive intervals. + * This test only applies to the linear scan allocator. */ TEST_F(RegisterAllocatorTest, FreeUntil) { const uint16_t data[] = TWO_REGISTERS_CODE_ITEM( @@ -408,7 +440,7 @@ TEST_F(RegisterAllocatorTest, FreeUntil) { x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions()); SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); + RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness); // Add an artifical range to cover the temps that will be put in the unhandled list. LiveInterval* unhandled = graph->GetEntryBlock()->GetFirstInstruction()->GetLiveInterval(); @@ -506,15 +538,15 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, graph->GetDexFile(), dex_cache, 0); -*input2 = new (allocator) HInstanceFieldGet(parameter, - Primitive::kPrimInt, - MemberOffset(42), - false, - kUnknownFieldIndex, - kUnknownClassDefIndex, - graph->GetDexFile(), - dex_cache, - 0); + *input2 = new (allocator) HInstanceFieldGet(parameter, + Primitive::kPrimInt, + MemberOffset(42), + false, + kUnknownFieldIndex, + kUnknownClassDefIndex, + graph->GetDexFile(), + dex_cache, + 0); then->AddInstruction(*input1); else_->AddInstruction(*input2); join->AddInstruction(new (allocator) HExit()); @@ -526,7 +558,7 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, PhiHint) { +static void PhiHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HPhi *phi; @@ -541,8 +573,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { liveness.Analyze(); // Check that the register allocator is deterministic. - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 0); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 0); @@ -560,8 +593,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set the phi to a specific register, and check that the inputs get allocated // the same register. 
phi->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -579,8 +613,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set input1 to a specific register, and check that the phi and other input get allocated // the same register. input1->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -598,8 +633,9 @@ TEST_F(RegisterAllocatorTest, PhiHint) { // Set input2 to a specific register, and check that the phi and other input get allocated // the same register. input2->GetLocations()->UpdateOut(Location::RegisterLocation(2)); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2); @@ -607,6 +643,12 @@ TEST_F(RegisterAllocatorTest, PhiHint) { } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, PhiHint_LinearScan) { + PhiHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildFieldReturn(ArenaAllocator* allocator, HInstruction** field, HInstruction** ret) { @@ -645,7 +687,7 @@ static HGraph* BuildFieldReturn(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { +void RegisterAllocatorTest::ExpectedInRegisterHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *field, *ret; @@ -658,8 +700,9 @@ TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // Sanity check that in normal conditions, the register should be hinted to 0 (EAX). ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 0); @@ -677,13 +720,20 @@ TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint) { // Don't use SetInAt because we are overriding an already allocated location. 
ret->GetLocations()->inputs_[0] = Location::RegisterLocation(2); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 2); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, ExpectedInRegisterHint_LinearScan) { + ExpectedInRegisterHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildTwoSubs(ArenaAllocator* allocator, HInstruction** first_sub, HInstruction** second_sub) { @@ -713,7 +763,7 @@ static HGraph* BuildTwoSubs(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { +void RegisterAllocatorTest::SameAsFirstInputHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *first_sub, *second_sub; @@ -726,8 +776,9 @@ TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // Sanity check that in normal conditions, the registers are the same. ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 1); @@ -748,14 +799,21 @@ TEST_F(RegisterAllocatorTest, SameAsFirstInputHint) { ASSERT_EQ(first_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput); ASSERT_EQ(second_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 2); ASSERT_EQ(second_sub->GetLiveInterval()->GetRegister(), 2); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, SameAsFirstInputHint_LinearScan) { + SameAsFirstInputHint(Strategy::kRegisterAllocatorLinearScan); +} + static HGraph* BuildDiv(ArenaAllocator* allocator, HInstruction** div) { HGraph* graph = CreateGraph(allocator); @@ -782,7 +840,7 @@ static HGraph* BuildDiv(ArenaAllocator* allocator, return graph; } -TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint) { +static void ExpectedExactInRegisterAndSameOutputHint(Strategy strategy) { ArenaPool pool; ArenaAllocator allocator(&pool); HInstruction *div; @@ -795,17 +853,25 @@ TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint) { SsaLivenessAnalysis liveness(graph, &codegen); liveness.Analyze(); - RegisterAllocator register_allocator(&allocator, &codegen, liveness); - register_allocator.AllocateRegisters(); + RegisterAllocator* register_allocator = + RegisterAllocator::Create(&allocator, &codegen, liveness, strategy); + register_allocator->AllocateRegisters(); // div on x86 requires its first input in eax and the output be the same as the first input. 
ASSERT_EQ(div->GetLiveInterval()->GetRegister(), 0); } } +// TODO: Enable this test for graph coloring register allocation when iterative move +// coalescing is merged. +TEST_F(RegisterAllocatorTest, ExpectedExactInRegisterAndSameOutputHint_LinearScan) { + ExpectedExactInRegisterAndSameOutputHint(Strategy::kRegisterAllocatorLinearScan); +} + // Test a bug in the register allocator, where allocating a blocked // register would lead to spilling an inactive interval at the wrong // position. +// This test only applies to the linear scan allocator. TEST_F(RegisterAllocatorTest, SpillInactive) { ArenaPool pool; @@ -892,7 +958,7 @@ TEST_F(RegisterAllocatorTest, SpillInactive) { liveness.instructions_from_lifetime_position_.push_back(user); } - RegisterAllocator register_allocator(&allocator, &codegen, liveness); + RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness); register_allocator.unhandled_core_intervals_.push_back(fourth); register_allocator.unhandled_core_intervals_.push_back(third); register_allocator.unhandled_core_intervals_.push_back(second); diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index 97f34e6c32..6effc306dc 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -17,6 +17,7 @@ #include "sharpening.h" #include "base/casts.h" +#include "base/enums.h" #include "class_linker.h" #include "code_generator.h" #include "driver/dex_compilation_unit.h" @@ -259,7 +260,7 @@ void HSharpening::ProcessLoadClass(HLoadClass* load_class) { load_class->SetLoadKindWithAddress(load_kind, address); break; case HLoadClass::LoadKind::kDexCachePcRelative: { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); DexCacheArraysLayout layout(pointer_size, &dex_file); size_t element_index = layout.TypeOffset(type_index); load_class->SetLoadKindWithDexCacheReference(load_kind, dex_file, element_index); @@ -278,8 +279,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { const DexFile& dex_file = load_string->GetDexFile(); uint32_t string_index = load_string->GetStringIndex(); - bool is_in_dex_cache = false; - HLoadString::LoadKind desired_load_kind; + HLoadString::LoadKind desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; uint64_t address = 0u; // String or dex cache element address. { Runtime* runtime = Runtime::Current(); @@ -295,33 +295,14 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { DCHECK(!runtime->UseJitCompilation()); mirror::String* string = class_linker->ResolveString(dex_file, string_index, dex_cache); CHECK(string != nullptr); - if (!compiler_driver_->GetSupportBootImageFixup()) { - // MIPS/MIPS64 or compiler_driver_test. Do not sharpen. - desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod; - } else { - DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file)); - is_in_dex_cache = true; - desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic() - ? HLoadString::LoadKind::kBootImageLinkTimePcRelative - : HLoadString::LoadKind::kBootImageLinkTimeAddress; - } + // TODO: In follow up CL, add PcRelative and Address back in. } else if (runtime->UseJitCompilation()) { // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus. 
// DCHECK(!codegen_->GetCompilerOptions().GetCompilePic()); mirror::String* string = dex_cache->GetResolvedString(string_index); - is_in_dex_cache = (string != nullptr); if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) { - // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787 desired_load_kind = HLoadString::LoadKind::kBootImageAddress; address = reinterpret_cast64<uint64_t>(string); - } else { - // Note: If the string is not in the dex cache, the instruction needs environment - // and will not be inlined across dex files. Within a dex file, the slow-path helper - // loads the correct string and inlined frames are used correctly for OOM stack trace. - // TODO: Write a test for this. Bug: 29416588 - desired_load_kind = HLoadString::LoadKind::kDexCacheAddress; - void* dex_cache_element_address = &dex_cache->GetStrings()[string_index]; - address = reinterpret_cast64<uint64_t>(dex_cache_element_address); } } else { // AOT app compilation. Try to lookup the string without allocating if not found. @@ -331,19 +312,9 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { !codegen_->GetCompilerOptions().GetCompilePic()) { desired_load_kind = HLoadString::LoadKind::kBootImageAddress; address = reinterpret_cast64<uint64_t>(string); - } else { - // Not JIT and either the string is not in boot image or we are compiling in PIC mode. - // Use PC-relative load from the dex cache if the dex file belongs - // to the oat file that we're currently compiling. - desired_load_kind = ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file) - ? HLoadString::LoadKind::kDexCachePcRelative - : HLoadString::LoadKind::kDexCacheViaMethod; } } } - if (is_in_dex_cache) { - load_string->MarkInDexCache(); - } HLoadString::LoadKind load_kind = codegen_->GetSupportedLoadStringKind(desired_load_kind); switch (load_kind) { @@ -358,7 +329,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { load_string->SetLoadKindWithAddress(load_kind, address); break; case HLoadString::LoadKind::kDexCachePcRelative: { - size_t pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); DexCacheArraysLayout layout(pointer_size, &dex_file); size_t element_index = layout.StringOffset(string_index); load_string->SetLoadKindWithDexCacheReference(load_kind, dex_file, element_index); diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index 7af4302884..a01e107e02 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -368,6 +368,27 @@ bool SsaLivenessAnalysis::UpdateLiveIn(const HBasicBlock& block) { return live_in->UnionIfNotIn(live_out, kill); } +void LiveInterval::DumpWithContext(std::ostream& stream, + const CodeGenerator& codegen) const { + Dump(stream); + if (IsFixed()) { + stream << ", register:" << GetRegister() << "("; + if (IsFloatingPoint()) { + codegen.DumpFloatingPointRegister(stream, GetRegister()); + } else { + codegen.DumpCoreRegister(stream, GetRegister()); + } + stream << ")"; + } else { + stream << ", spill slot:" << GetSpillSlot(); + } + stream << ", requires_register:" << (GetDefinedBy() != nullptr && RequiresRegister()); + if (GetParent()->GetDefinedBy() != nullptr) { + stream << ", defined_by:" << GetParent()->GetDefinedBy()->GetKind(); + stream << "(" << GetParent()->GetDefinedBy()->GetLifetimePosition() << ")"; + 
} +} + static int RegisterOrLowRegister(Location location) { return location.IsPair() ? location.low() : location.reg(); } diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h index dc98864d9b..92788fe6b8 100644 --- a/compiler/optimizing/ssa_liveness_analysis.h +++ b/compiler/optimizing/ssa_liveness_analysis.h @@ -150,9 +150,7 @@ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { if (GetIsEnvironment()) return false; if (IsSynthesized()) return false; Location location = GetUser()->GetLocations()->InAt(GetInputIndex()); - return location.IsUnallocated() - && (location.GetPolicy() == Location::kRequiresRegister - || location.GetPolicy() == Location::kRequiresFpuRegister); + return location.IsUnallocated() && location.RequiresRegisterKind(); } private: @@ -481,6 +479,10 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return last_range_->GetEnd(); } + size_t GetLength() const { + return GetEnd() - GetStart(); + } + size_t FirstRegisterUseAfter(size_t position) const { if (is_temp_) { return position == GetStart() ? position : kNoLifetime; @@ -504,10 +506,18 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return kNoLifetime; } + // Returns the location of the first register use for this live interval, + // including a register definition if applicable. size_t FirstRegisterUse() const { return FirstRegisterUseAfter(GetStart()); } + // Whether the interval requires a register rather than a stack location. + // If needed for performance, this could be cached. + bool RequiresRegister() const { + return !HasRegister() && FirstRegisterUse() != kNoLifetime; + } + size_t FirstUseAfter(size_t position) const { if (is_temp_) { return position == GetStart() ? position : kNoLifetime; @@ -693,6 +703,10 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { stream << " is_high: " << IsHighInterval(); } + // Same as Dump, but adds context such as the instruction defining this interval, and + // the register currently assigned to this interval. + void DumpWithContext(std::ostream& stream, const CodeGenerator& codegen) const; + LiveInterval* GetNextSibling() const { return next_sibling_; } LiveInterval* GetLastSibling() { LiveInterval* result = this; @@ -871,6 +885,33 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { range_search_start_ = first_range_; } + bool DefinitionRequiresRegister() const { + DCHECK(IsParent()); + LocationSummary* locations = defined_by_->GetLocations(); + Location location = locations->Out(); + // This interval is the first interval of the instruction. If the output + // of the instruction requires a register, we return the position of that instruction + // as the first register use. 
+ if (location.IsUnallocated()) { + if ((location.GetPolicy() == Location::kRequiresRegister) + || (location.GetPolicy() == Location::kSameAsFirstInput + && (locations->InAt(0).IsRegister() + || locations->InAt(0).IsRegisterPair() + || locations->InAt(0).GetPolicy() == Location::kRequiresRegister))) { + return true; + } else if ((location.GetPolicy() == Location::kRequiresFpuRegister) + || (location.GetPolicy() == Location::kSameAsFirstInput + && (locations->InAt(0).IsFpuRegister() + || locations->InAt(0).IsFpuRegisterPair() + || locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister))) { + return true; + } + } else if (location.IsRegister() || location.IsRegisterPair()) { + return true; + } + return false; + } + private: LiveInterval(ArenaAllocator* allocator, Primitive::Type type, @@ -925,33 +966,6 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return range; } - bool DefinitionRequiresRegister() const { - DCHECK(IsParent()); - LocationSummary* locations = defined_by_->GetLocations(); - Location location = locations->Out(); - // This interval is the first interval of the instruction. If the output - // of the instruction requires a register, we return the position of that instruction - // as the first register use. - if (location.IsUnallocated()) { - if ((location.GetPolicy() == Location::kRequiresRegister) - || (location.GetPolicy() == Location::kSameAsFirstInput - && (locations->InAt(0).IsRegister() - || locations->InAt(0).IsRegisterPair() - || locations->InAt(0).GetPolicy() == Location::kRequiresRegister))) { - return true; - } else if ((location.GetPolicy() == Location::kRequiresFpuRegister) - || (location.GetPolicy() == Location::kSameAsFirstInput - && (locations->InAt(0).IsFpuRegister() - || locations->InAt(0).IsFpuRegisterPair() - || locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister))) { - return true; - } - } else if (location.IsRegister() || location.IsRegisterPair()) { - return true; - } - return false; - } - bool IsDefiningPosition(size_t position) const { return IsParent() && (position == GetStart()); } diff --git a/compiler/optimizing/x86_memory_gen.cc b/compiler/optimizing/x86_memory_gen.cc new file mode 100644 index 0000000000..8aa315a7e3 --- /dev/null +++ b/compiler/optimizing/x86_memory_gen.cc @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "x86_memory_gen.h" +#include "code_generator.h" + +namespace art { +namespace x86 { + +/** + * Replace instructions with memory operand forms. + */ +class MemoryOperandVisitor : public HGraphVisitor { + public: + MemoryOperandVisitor(HGraph* graph, bool do_implicit_null_checks) + : HGraphVisitor(graph), + do_implicit_null_checks_(do_implicit_null_checks) {} + + private: + void VisitBoundsCheck(HBoundsCheck* check) OVERRIDE { + // Replace the length by the array itself, so that we can do compares to memory. 
+ HArrayLength* array_len = check->InputAt(1)->AsArrayLength(); + + // We only want to replace an ArrayLength. + if (array_len == nullptr) { + return; + } + + HInstruction* array = array_len->InputAt(0); + DCHECK_EQ(array->GetType(), Primitive::kPrimNot); + + // Don't apply this optimization when the array is nullptr. + if (array->IsConstant() || (array->IsNullCheck() && array->InputAt(0)->IsConstant())) { + return; + } + + // Is there a null check that could be an implicit check? + if (array->IsNullCheck() && do_implicit_null_checks_) { + // The ArrayLen may generate the implicit null check. Can the + // bounds check do so as well? + if (array_len->GetNextDisregardingMoves() != check) { + // No, it won't. Leave as is. + return; + } + } + + // Can we suppress the ArrayLength and generate at BoundCheck? + if (array_len->HasOnlyOneNonEnvironmentUse()) { + array_len->MarkEmittedAtUseSite(); + // We need the ArrayLength just before the BoundsCheck. + array_len->MoveBefore(check); + } + } + + bool do_implicit_null_checks_; +}; + +X86MemoryOperandGeneration::X86MemoryOperandGeneration(HGraph* graph, + CodeGenerator* codegen, + OptimizingCompilerStats* stats) + : HOptimization(graph, kX86MemoryOperandGenerationPassName, stats), + do_implicit_null_checks_(codegen->GetCompilerOptions().GetImplicitNullChecks()) { +} + +void X86MemoryOperandGeneration::Run() { + MemoryOperandVisitor visitor(graph_, do_implicit_null_checks_); + visitor.VisitInsertionOrder(); +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/x86_memory_gen.h b/compiler/optimizing/x86_memory_gen.h new file mode 100644 index 0000000000..5f15d9f1e6 --- /dev/null +++ b/compiler/optimizing/x86_memory_gen.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ +#define ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ + +#include "nodes.h" +#include "optimization.h" + +namespace art { +class CodeGenerator; + +namespace x86 { + +class X86MemoryOperandGeneration : public HOptimization { + public: + X86MemoryOperandGeneration(HGraph* graph, + CodeGenerator* codegen, + OptimizingCompilerStats* stats); + + void Run() OVERRIDE; + + static constexpr const char* kX86MemoryOperandGenerationPassName = + "x86_memory_operand_generation"; + + private: + bool do_implicit_null_checks_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_X86_MEMORY_GEN_H_ |
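The TEST_ALL_STRATEGIES macro added to register_allocator_test.cc above stamps out one gtest case per register allocation strategy for each strategy-parameterized test body. For illustration (Loop1 is used here purely as an example), TEST_ALL_STRATEGIES(Loop1); expands to roughly:

  TEST_F(RegisterAllocatorTest, Loop1_LinearScan) {
    Loop1(Strategy::kRegisterAllocatorLinearScan);
  }
  TEST_F(RegisterAllocatorTest, Loop1_GraphColor) {
    Loop1(Strategy::kRegisterAllocatorGraphColor);
  }

Tests that poke at linear-scan internals (FreeUntil, SpillInactive) keep a plain TEST_F and construct RegisterAllocatorLinearScan directly, and the hint tests stay pinned to the linear scan strategy until iterative move coalescing is merged for the graph-coloring allocator, as the TODOs above note.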
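The ssa_liveness_analysis.h hunk adds LiveInterval::GetLength() and LiveInterval::RequiresRegister() and moves DefinitionRequiresRegister() into the public section, so code outside LiveInterval (for example an allocation strategy) can query them. A minimal sketch of the kind of caller this enables; the PickSpillCandidate helper and its heuristic are assumptions for illustration, not part of this change:

  #include "ssa_liveness_analysis.h"

  namespace art {

  // Illustrative only: among intervals that could not be assigned a register,
  // prefer to spill the longest one that does not strictly require a register.
  static LiveInterval* PickSpillCandidate(const ArenaVector<LiveInterval*>& unassigned) {
    LiveInterval* candidate = nullptr;
    for (LiveInterval* interval : unassigned) {
      if (interval->RequiresRegister()) {
        continue;  // Has an upcoming register use and no register yet; avoid spilling it.
      }
      if (candidate == nullptr || interval->GetLength() > candidate->GetLength()) {
        candidate = interval;  // GetLength() is simply GetEnd() - GetStart().
      }
    }
    return candidate;
  }

  }  // namespace art

LiveInterval::DumpWithContext() serves a similar purpose on the reporting side: it prints the interval together with its assigned register or spill slot and the instruction that defines it, which is useful in allocator debug or validation output.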
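The new x86_memory_gen files add an x86-specific HOptimization pass. A hedged sketch of how it might be driven; the driver function below and where it is called from are assumptions, only the constructor and Run() come from this diff:

  #include "x86_memory_gen.h"

  namespace art {

  // Illustrative driver (assumed): run the pass on an x86 graph once the
  // architecture-independent optimizations are done.
  static void RunX86MemoryOperandGeneration(HGraph* graph,
                                            CodeGenerator* codegen,
                                            OptimizingCompilerStats* stats) {
    x86::X86MemoryOperandGeneration memory_gen(graph, codegen, stats);
    memory_gen.Run();  // Marks eligible HArrayLength inputs of bounds checks as emitted at use site.
  }

  }  // namespace art

The intended code-generation effect, illustratively: when the length input of a bounds check is an HArrayLength with a single non-environment use, the length no longer has to be materialized in a register first; the back end can fold it into a memory operand of the compare, along the lines of cmp index, [array + <length offset>]. The GetNextDisregardingMoves() check in the visitor makes sure the transformation is skipped when the ArrayLength may be the instruction providing an implicit null check and the bounds check does not immediately follow it.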