Diffstat (limited to 'compiler/optimizing')
29 files changed, 1454 insertions, 628 deletions
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index da2f9cbed5..eee6116098 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -1142,7 +1142,7 @@ class BCEVisitor : public HGraphVisitor { loop->IsDefinedOutOfTheLoop(array_get->InputAt(1))) { SideEffects loop_effects = side_effects_.GetLoopEffects(loop->GetHeader()); if (!array_get->GetSideEffects().MayDependOn(loop_effects)) { - HoistToPreheaderOrDeoptBlock(loop, array_get); + HoistToPreHeaderOrDeoptBlock(loop, array_get); } } } @@ -1280,7 +1280,8 @@ class BCEVisitor : public HGraphVisitor { // as runtime test. By restricting dynamic bce to unit strides (with a maximum of 32-bit // iterations) and by not combining access (e.g. a[i], a[i-3], a[i+5] etc.), these tests // correctly guard against any possible OOB (including arithmetic wrap-around cases). - HBasicBlock* block = TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + HBasicBlock* block = GetPreHeader(loop, instruction); induction_range_.GenerateRangeCode(instruction, index, GetGraph(), block, &lower, &upper); if (lower != nullptr) { InsertDeopt(loop, block, new (GetGraph()->GetArena()) HAbove(lower, upper)); @@ -1358,7 +1359,7 @@ class BCEVisitor : public HGraphVisitor { return true; } else if (length->IsArrayLength() && length->GetBlock()->GetLoopInformation() == loop) { if (CanHandleNullCheck(loop, length->InputAt(0), needs_taken_test)) { - HoistToPreheaderOrDeoptBlock(loop, length); + HoistToPreHeaderOrDeoptBlock(loop, length); return true; } } @@ -1376,7 +1377,8 @@ class BCEVisitor : public HGraphVisitor { HInstruction* array = check->InputAt(0); if (loop->IsDefinedOutOfTheLoop(array)) { // Generate: if (array == null) deoptimize; - HBasicBlock* block = TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + TransformLoopForDeoptimizationIfNeeded(loop, needs_taken_test); + HBasicBlock* block = GetPreHeader(loop, check); HInstruction* cond = new (GetGraph()->GetArena()) HEqual(array, GetGraph()->GetNullConstant()); InsertDeopt(loop, block, cond); @@ -1423,6 +1425,28 @@ class BCEVisitor : public HGraphVisitor { return true; } + /** + * Returns appropriate preheader for the loop, depending on whether the + * instruction appears in the loop header or proper loop-body. + */ + HBasicBlock* GetPreHeader(HLoopInformation* loop, HInstruction* instruction) { + // Use preheader unless there is an earlier generated deoptimization block since + // hoisted expressions may depend on and/or used by the deoptimization tests. + HBasicBlock* header = loop->GetHeader(); + const uint32_t loop_id = header->GetBlockId(); + auto it = taken_test_loop_.find(loop_id); + if (it != taken_test_loop_.end()) { + HBasicBlock* block = it->second; + // If always taken, keep it that way by returning the original preheader, + // which can be found by following the predecessor of the true-block twice. + if (instruction->GetBlock() == header) { + return block->GetSinglePredecessor()->GetSinglePredecessor(); + } + return block; + } + return loop->GetPreHeader(); + } + /** Inserts a deoptimization test. */ void InsertDeopt(HLoopInformation* loop, HBasicBlock* block, HInstruction* condition) { HInstruction* suspend = loop->GetSuspendCheck(); @@ -1437,28 +1461,17 @@ class BCEVisitor : public HGraphVisitor { } /** Hoists instruction out of the loop to preheader or deoptimization block. 
*/ - void HoistToPreheaderOrDeoptBlock(HLoopInformation* loop, HInstruction* instruction) { - // Use preheader unless there is an earlier generated deoptimization block since - // hoisted expressions may depend on and/or used by the deoptimization tests. - const uint32_t loop_id = loop->GetHeader()->GetBlockId(); - HBasicBlock* preheader = loop->GetPreHeader(); - HBasicBlock* block = preheader; - auto it = taken_test_loop_.find(loop_id); - if (it != taken_test_loop_.end()) { - block = it->second; - } - // Hoist the instruction. + void HoistToPreHeaderOrDeoptBlock(HLoopInformation* loop, HInstruction* instruction) { + HBasicBlock* block = GetPreHeader(loop, instruction); DCHECK(!instruction->HasEnvironment()); instruction->MoveBefore(block->GetLastInstruction()); } /** - * Adds a new taken-test structure to a loop if needed (and not already done). + * Adds a new taken-test structure to a loop if needed and not already done. * The taken-test protects range analysis evaluation code to avoid any * deoptimization caused by incorrect trip-count evaluation in non-taken loops. * - * Returns block in which deoptimizations/invariants can be put. - * * old_preheader * | * if_block <- taken-test protects deoptimization block @@ -1490,16 +1503,11 @@ class BCEVisitor : public HGraphVisitor { * array[i] = 0; * } */ - HBasicBlock* TransformLoopForDeoptimizationIfNeeded(HLoopInformation* loop, bool needs_taken_test) { - // Not needed (can use preheader), or already done (can reuse)? + void TransformLoopForDeoptimizationIfNeeded(HLoopInformation* loop, bool needs_taken_test) { + // Not needed (can use preheader) or already done (can reuse)? const uint32_t loop_id = loop->GetHeader()->GetBlockId(); - if (!needs_taken_test) { - return loop->GetPreHeader(); - } else { - auto it = taken_test_loop_.find(loop_id); - if (it != taken_test_loop_.end()) { - return it->second; - } + if (!needs_taken_test || taken_test_loop_.find(loop_id) != taken_test_loop_.end()) { + return; } // Generate top test structure. @@ -1528,7 +1536,6 @@ class BCEVisitor : public HGraphVisitor { if_block->AddInstruction(new (GetGraph()->GetArena()) HIf(condition)); taken_test_loop_.Put(loop_id, true_block); - return true_block; } /** @@ -1543,7 +1550,7 @@ class BCEVisitor : public HGraphVisitor { * \ / * x_1 = phi(x_0, null) <- synthetic phi * | - * header + * new_preheader */ void InsertPhiNodes() { // Scan all new deoptimization blocks. diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 53d3615a41..ea0b9eca9a 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -997,6 +997,12 @@ void CodeGenerator::RecordPcInfo(HInstruction* instruction, stack_map_stream_.EndStackMapEntry(); } +bool CodeGenerator::HasStackMapAtCurrentPc() { + uint32_t pc = GetAssembler()->CodeSize(); + size_t count = stack_map_stream_.GetNumberOfStackMaps(); + return count > 0 && stack_map_stream_.GetStackMap(count - 1).native_pc_offset == pc; +} + void CodeGenerator::RecordCatchBlockInfo() { ArenaAllocator* arena = graph_->GetArena(); @@ -1320,12 +1326,6 @@ void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCod << "instruction->DebugName()=" << instruction->DebugName() << " slow_path->GetDescription()=" << slow_path->GetDescription(); DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()) || - // Control flow would not come back into the code if a fatal slow - // path is taken, so we do not care if it triggers GC. 
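For orientation, the HasStackMapAtCurrentPc() helper introduced in the code_generator.cc hunk above exists so that native-PC-to-stack-map lookups stay unambiguous: a back-end pads with a nop whenever the previous instruction already recorded a map at the current offset. A condensed sketch of the pattern, assuming the StackMapStream accessors used in the hunk:

    // In the shared CodeGenerator: only the most recently emitted stack map
    // can collide with the current PC, since maps are appended in code order.
    bool CodeGenerator::HasStackMapAtCurrentPc() {
      uint32_t pc = GetAssembler()->CodeSize();
      size_t count = stack_map_stream_.GetNumberOfStackMaps();
      return count > 0 && stack_map_stream_.GetStackMap(count - 1).native_pc_offset == pc;
    }

    // In an architecture back-end (see the ARM hunk further down): advance the
    // PC before recording a second map at the same offset.
    void InstructionCodeGeneratorARM::VisitNativeDebugInfo(HNativeDebugInfo* info) {
      if (codegen_->HasStackMapAtCurrentPc()) {
        __ nop();  // Ensure the two stack maps end up at distinct native PCs.
      }
      codegen_->RecordPcInfo(info, info->GetDexPc());
    }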
- slow_path->IsFatal() || - // HDeoptimize is a special case: we know we are not coming back from - // it into the code. - instruction->IsDeoptimize() || // When read barriers are enabled, some instructions use a // slow path to emit a read barrier, which does not trigger // GC, is not fatal, nor is emitted by HDeoptimize diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index eade05d7b6..5958cd89bc 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -269,6 +269,8 @@ class CodeGenerator { // Record native to dex mapping for a suspend point. Required by runtime. void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc, SlowPathCode* slow_path = nullptr); + // Check whether we have already recorded mapping at this PC. + bool HasStackMapAtCurrentPc(); bool CanMoveNullCheckToUser(HNullCheck* null_check); void MaybeRecordImplicitNullCheck(HInstruction* instruction); @@ -611,7 +613,7 @@ class CodeGenerator { ArenaVector<SlowPathCode*> slow_paths_; - // The current slow path that we're generating code for. + // The current slow-path that we're generating code for. SlowPathCode* current_slow_path_; // The current block index in `block_order_` of the block @@ -672,6 +674,122 @@ class CallingConvention { DISALLOW_COPY_AND_ASSIGN(CallingConvention); }; +/** + * A templated class SlowPathGenerator with a templated method NewSlowPath() + * that can be used by any code generator to share equivalent slow-paths with + * the objective of reducing generated code size. + * + * InstructionType: instruction that requires SlowPathCodeType + * SlowPathCodeType: subclass of SlowPathCode, with constructor SlowPathCodeType(InstructionType *) + */ +template <typename InstructionType> +class SlowPathGenerator { + static_assert(std::is_base_of<HInstruction, InstructionType>::value, + "InstructionType is not a subclass of art::HInstruction"); + + public: + SlowPathGenerator(HGraph* graph, CodeGenerator* codegen) + : graph_(graph), + codegen_(codegen), + slow_path_map_(std::less<uint32_t>(), graph->GetArena()->Adapter(kArenaAllocSlowPaths)) {} + + // Creates and adds a new slow-path, if needed, or returns existing one otherwise. + // Templating the method (rather than the whole class) on the slow-path type enables + // keeping this code at a generic, non architecture-specific place. + // + // NOTE: This approach assumes each InstructionType only generates one SlowPathCodeType. + // To relax this requirement, we would need some RTTI on the stored slow-paths, + // or template the class as a whole on SlowPathType. + template <typename SlowPathCodeType> + SlowPathCodeType* NewSlowPath(InstructionType* instruction) { + static_assert(std::is_base_of<SlowPathCode, SlowPathCodeType>::value, + "SlowPathCodeType is not a subclass of art::SlowPathCode"); + static_assert(std::is_constructible<SlowPathCodeType, InstructionType*>::value, + "SlowPathCodeType is not constructible from InstructionType*"); + // Iterate over potential candidates for sharing. Currently, only same-typed + // slow-paths with exactly the same dex-pc are viable candidates. + // TODO: pass dex-pc/slow-path-type to run-time to allow even more sharing? 
+ const uint32_t dex_pc = instruction->GetDexPc(); + auto iter = slow_path_map_.find(dex_pc); + if (iter != slow_path_map_.end()) { + auto candidates = iter->second; + for (const auto& it : candidates) { + InstructionType* other_instruction = it.first; + SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second); + // Determine if the instructions allow for slow-path sharing. + if (HaveSameLiveRegisters(instruction, other_instruction) && + HaveSameStackMap(instruction, other_instruction)) { + // Can share: reuse existing one. + return other_slow_path; + } + } + } else { + // First time this dex-pc is seen. + iter = slow_path_map_.Put(dex_pc, {{}, {graph_->GetArena()->Adapter(kArenaAllocSlowPaths)}}); + } + // Cannot share: create and add new slow-path for this particular dex-pc. + SlowPathCodeType* slow_path = new (graph_->GetArena()) SlowPathCodeType(instruction); + iter->second.emplace_back(std::make_pair(instruction, slow_path)); + codegen_->AddSlowPath(slow_path); + return slow_path; + } + + private: + // Tests if both instructions have same set of live physical registers. This ensures + // the slow-path has exactly the same preamble on saving these registers to stack. + bool HaveSameLiveRegisters(const InstructionType* i1, const InstructionType* i2) const { + const uint32_t core_spill = ~codegen_->GetCoreSpillMask(); + const uint32_t fpu_spill = ~codegen_->GetFpuSpillMask(); + RegisterSet* live1 = i1->GetLocations()->GetLiveRegisters(); + RegisterSet* live2 = i2->GetLocations()->GetLiveRegisters(); + return (((live1->GetCoreRegisters() & core_spill) == + (live2->GetCoreRegisters() & core_spill)) && + ((live1->GetFloatingPointRegisters() & fpu_spill) == + (live2->GetFloatingPointRegisters() & fpu_spill))); + } + + // Tests if both instructions have the same stack map. This ensures the interpreter + // will find exactly the same dex-registers at the same entries. + bool HaveSameStackMap(const InstructionType* i1, const InstructionType* i2) const { + DCHECK(i1->HasEnvironment()); + DCHECK(i2->HasEnvironment()); + // We conservatively test if the two instructions find exactly the same instructions + // and location in each dex-register. This guarantees they will have the same stack map. + HEnvironment* e1 = i1->GetEnvironment(); + HEnvironment* e2 = i2->GetEnvironment(); + if (e1->GetParent() != e2->GetParent() || e1->Size() != e2->Size()) { + return false; + } + for (size_t i = 0, sz = e1->Size(); i < sz; ++i) { + if (e1->GetInstructionAt(i) != e2->GetInstructionAt(i) || + !e1->GetLocationAt(i).Equals(e2->GetLocationAt(i))) { + return false; + } + } + return true; + } + + HGraph* const graph_; + CodeGenerator* const codegen_; + + // Map from dex-pc to vector of already existing instruction/slow-path pairs. + ArenaSafeMap<uint32_t, ArenaVector<std::pair<InstructionType*, SlowPathCode*>>> slow_path_map_; + + DISALLOW_COPY_AND_ASSIGN(SlowPathGenerator); +}; + +class InstructionCodeGenerator : public HGraphVisitor { + public: + InstructionCodeGenerator(HGraph* graph, CodeGenerator* codegen) + : HGraphVisitor(graph), + deopt_slow_paths_(graph, codegen) {} + + protected: + // Add slow-path generator for each instruction/slow-path combination that desires sharing. + // TODO: under current regime, only deopt sharing make sense; extend later. 
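With the deopt_slow_paths_ member in place, a back-end requests a deoptimization slow path through the generator instead of allocating a fresh one per instruction. The ARM visitor in the hunk further down then reduces to the following (shown here for orientation; DeoptimizationSlowPathARM is the ARM-specific SlowPathCode subclass):

    void InstructionCodeGeneratorARM::VisitDeoptimize(HDeoptimize* deoptimize) {
      // NewSlowPath() returns an existing slow path recorded at the same dex pc
      // with identical live registers and environment, or creates one and
      // registers it via codegen_->AddSlowPath().
      SlowPathCode* slow_path =
          deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathARM>(deoptimize);
      GenerateTestAndBranch(deoptimize,
                            /* condition_input_index */ 0,
                            slow_path->GetEntryLabel(),
                            /* false_target */ nullptr);
    }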
+ SlowPathGenerator<HDeoptimize> deopt_slow_paths_; +}; + } // namespace art #endif // ART_COMPILER_OPTIMIZING_CODE_GENERATOR_H_ diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 9a1f2b8717..d64b8784e1 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -350,24 +350,24 @@ class TypeCheckSlowPathARM : public SlowPathCode { class DeoptimizationSlowPathARM : public SlowPathCode { public: - explicit DeoptimizationSlowPathARM(HInstruction* instruction) + explicit DeoptimizationSlowPathARM(HDeoptimize* instruction) : instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); - DCHECK(instruction_->IsDeoptimize()); - HDeoptimize* deoptimize = instruction_->AsDeoptimize(); - uint32_t dex_pc = deoptimize->GetDexPc(); - CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this); + arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), + instruction_, + instruction_->GetDexPc(), + this); CheckEntrypointTypes<kQuickDeoptimize, void, void>(); } const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathARM); }; @@ -417,6 +417,56 @@ class ArraySetSlowPathARM : public SlowPathCode { DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM); }; +// Slow path marking an object during a read barrier. +class ReadBarrierMarkSlowPathARM : public SlowPathCode { + public: + ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location out, Location obj) + : instruction_(instruction), out_(out), obj_(obj) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register reg_out = out_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_); + arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); + arm_codegen->Move32(out_, Location::RegisterLocation(R0)); + + RestoreLiveRegisters(codegen, locations); + __ b(GetExitLabel()); + } + + private: + HInstruction* const instruction_; + const Location out_; + const Location obj_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); +}; + // Slow path generating a read barrier for a heap reference. 
class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { public: @@ -438,7 +488,7 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { // to be instrumented, e.g.: // // __ LoadFromOffset(kLoadWord, out, out, offset); - // codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); // // In that case, we have lost the information about the original // object, and the emitted read barrier cannot work properly. @@ -454,7 +504,9 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); DCHECK(!instruction_->IsInvoke() || (instruction_->IsInvokeStaticOrDirect() && - instruction_->GetLocations()->Intrinsified())); + instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); @@ -596,14 +648,18 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { class ReadBarrierForRootSlowPathARM : public SlowPathCode { public: ReadBarrierForRootSlowPathARM(HInstruction* instruction, Location out, Location root) - : instruction_(instruction), out_(out), root_(root) {} + : instruction_(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); Register reg_out = out_.AsRegister<Register>(); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); - DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); @@ -857,7 +913,7 @@ void CodeGeneratorARM::UpdateBlockedPairRegisters() const { } InstructionCodeGeneratorARM::InstructionCodeGeneratorARM(HGraph* graph, CodeGeneratorARM* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -1358,17 +1414,6 @@ void LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } -void InstructionCodeGeneratorARM::GenerateCompareWithImmediate(Register left, int32_t right) { - ShifterOperand operand; - if (GetAssembler()->ShifterOperandCanHold(R0, left, CMP, right, &operand)) { - __ cmp(left, operand); - } else { - Register temp = IP; - __ LoadImmediate(temp, right); - __ cmp(left, ShifterOperand(temp)); - } -} - void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label) { @@ -1434,7 +1479,7 @@ void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, int32_t val_low = Low32Bits(value); int32_t val_high = High32Bits(value); - GenerateCompareWithImmediate(left_high, val_high); + __ CmpConstant(left_high, val_high); if (if_cond == kCondNE) { __ b(true_label, ARMCondition(true_high_cond)); } else if (if_cond == kCondEQ) { @@ -1444,7 +1489,7 @@ void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, __ b(false_label, ARMCondition(false_high_cond)); } // Must be equal high, so compare the lows. 
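The GenerateCompareWithImmediate() helper deleted above is subsumed by the assembler-level CmpConstant(), which performs the same operand selection. A sketch of what that logic does, based on the removed helper (the name CmpConstantSketch is illustrative, not the assembler's actual implementation):

    void CmpConstantSketch(ArmAssembler* assembler, Register left, int32_t right) {
      ShifterOperand operand;
      if (assembler->ShifterOperandCanHold(R0, left, CMP, right, &operand)) {
        // The constant can be encoded directly as a flexible shifter operand.
        assembler->cmp(left, operand);
      } else {
        // Otherwise materialize it in the scratch register IP first and
        // compare register against register.
        assembler->LoadImmediate(IP, right);
        assembler->cmp(left, ShifterOperand(IP));
      }
    }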
- GenerateCompareWithImmediate(left_low, val_low); + __ CmpConstant(left_low, val_low); } else { Register right_high = right.AsRegisterPairHigh<Register>(); Register right_low = right.AsRegisterPairLow<Register>(); @@ -1568,7 +1613,7 @@ void InstructionCodeGeneratorARM::GenerateTestAndBranch(HInstruction* instructio __ cmp(left, ShifterOperand(right.AsRegister<Register>())); } else { DCHECK(right.IsConstant()); - GenerateCompareWithImmediate(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); + __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); } if (true_target == nullptr) { __ b(false_target, ARMCondition(condition->GetOppositeCondition())); @@ -1610,8 +1655,7 @@ void LocationsBuilderARM::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorARM::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) DeoptimizationSlowPathARM(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCode* slow_path = deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathARM>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -1623,6 +1667,10 @@ void LocationsBuilderARM::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorARM::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. + __ nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } @@ -1675,8 +1723,8 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { __ cmp(left.AsRegister<Register>(), ShifterOperand(right.AsRegister<Register>())); } else { DCHECK(right.IsConstant()); - GenerateCompareWithImmediate(left.AsRegister<Register>(), - CodeGenerator::GetInt32ValueOf(right.GetConstant())); + __ CmpConstant(left.AsRegister<Register>(), + CodeGenerator::GetInt32ValueOf(right.GetConstant())); } __ it(ARMCondition(cond->GetCondition()), kItElse); __ mov(locations->Out().AsRegister<Register>(), ShifterOperand(1), @@ -1891,7 +1939,7 @@ void LocationsBuilderARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { } void InstructionCodeGeneratorARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { - GenerateMemoryBarrier(memory_barrier->GetBarrierKind()); + codegen_->GenerateMemoryBarrier(memory_barrier->GetBarrierKind()); } void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) { @@ -2846,8 +2894,7 @@ void InstructionCodeGeneratorARM::DivRemByPowerOfTwo(HBinaryOperation* instructi Register dividend = locations->InAt(0).AsRegister<Register>(); Register temp = locations->GetTemp(0).AsRegister<Register>(); int32_t imm = second.GetConstant()->AsIntConstant()->GetValue(); - uint32_t abs_imm = static_cast<uint32_t>(std::abs(imm)); - DCHECK(IsPowerOfTwo(abs_imm)); + uint32_t abs_imm = static_cast<uint32_t>(AbsOrMin(imm)); int ctz_imm = CTZ(abs_imm); if (ctz_imm == 1) { @@ -2923,7 +2970,7 @@ void InstructionCodeGeneratorARM::GenerateDivRemConstantIntegral(HBinaryOperatio // Do not generate anything. DivZeroCheck would prevent any code to be executed. 
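The switch from std::abs() to AbsOrMin() in the division hunks above closes an undefined-behavior hole: std::abs(INT32_MIN) overflows, while the minimum's magnitude is still a power of two once viewed as an unsigned value. A self-contained sketch of the assumed contract (AbsOrMinSketch is a stand-in for the real helper elsewhere in the tree):

    #include <cstdlib>
    #include <limits>

    // Returns |value|, except for INT32_MIN, which is returned unchanged;
    // static_cast<uint32_t>(INT32_MIN) == 0x80000000u, so IsPowerOfTwo() and
    // CTZ() still see the intended power of two 2^31.
    static int32_t AbsOrMinSketch(int32_t value) {
      return value == std::numeric_limits<int32_t>::min() ? value : std::abs(value);
    }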
} else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (IsPowerOfTwo(std::abs(imm))) { + } else if (IsPowerOfTwo(AbsOrMin(imm))) { DivRemByPowerOfTwo(instruction); } else { DCHECK(imm <= -2 || imm >= 2); @@ -2952,12 +2999,12 @@ void LocationsBuilderARM::VisitDiv(HDiv* div) { locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::ConstantLocation(div->InputAt(1)->AsConstant())); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); - int32_t abs_imm = std::abs(div->InputAt(1)->AsIntConstant()->GetValue()); - if (abs_imm <= 1) { + int32_t value = div->InputAt(1)->AsIntConstant()->GetValue(); + if (value == 1 || value == 0 || value == -1) { // No temp register required. } else { locations->AddTemp(Location::RequiresRegister()); - if (!IsPowerOfTwo(abs_imm)) { + if (!IsPowerOfTwo(AbsOrMin(value))) { locations->AddTemp(Location::RequiresRegister()); } } @@ -3078,12 +3125,12 @@ void LocationsBuilderARM::VisitRem(HRem* rem) { locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::ConstantLocation(rem->InputAt(1)->AsConstant())); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); - int32_t abs_imm = std::abs(rem->InputAt(1)->AsIntConstant()->GetValue()); - if (abs_imm <= 1) { + int32_t value = rem->InputAt(1)->AsIntConstant()->GetValue(); + if (value == 1 || value == 0 || value == -1) { // No temp register required. } else { locations->AddTemp(Location::RequiresRegister()); - if (!IsPowerOfTwo(abs_imm)) { + if (!IsPowerOfTwo(AbsOrMin(value))) { locations->AddTemp(Location::RequiresRegister()); } } @@ -3437,7 +3484,7 @@ void InstructionCodeGeneratorARM::HandleShift(HBinaryOperation* op) { Register first_reg = first.AsRegister<Register>(); if (second.IsRegister()) { Register second_reg = second.AsRegister<Register>(); - // Arm doesn't mask the shift count so we need to do it ourselves. + // ARM doesn't mask the shift count so we need to do it ourselves. __ and_(out_reg, second_reg, ShifterOperand(kMaxIntShiftValue)); if (op->IsShl()) { __ Lsl(out_reg, first_reg, out_reg); @@ -3449,7 +3496,7 @@ void InstructionCodeGeneratorARM::HandleShift(HBinaryOperation* op) { } else { int32_t cst = second.GetConstant()->AsIntConstant()->GetValue(); uint32_t shift_value = static_cast<uint32_t>(cst & kMaxIntShiftValue); - if (shift_value == 0) { // arm does not support shifting with 0 immediate. + if (shift_value == 0) { // ARM does not support shifting with 0 immediate. __ Mov(out_reg, first_reg); } else if (op->IsShl()) { __ Lsl(out_reg, first_reg, shift_value); @@ -3796,9 +3843,9 @@ void InstructionCodeGeneratorARM::VisitPhi(HPhi* instruction ATTRIBUTE_UNUSED) { LOG(FATAL) << "Unreachable"; } -void InstructionCodeGeneratorARM::GenerateMemoryBarrier(MemBarrierKind kind) { - // TODO (ported from quick): revisit Arm barrier kinds - DmbOptions flavor = DmbOptions::ISH; // quiet c++ warnings +void CodeGeneratorARM::GenerateMemoryBarrier(MemBarrierKind kind) { + // TODO (ported from quick): revisit ARM barrier kinds. + DmbOptions flavor = DmbOptions::ISH; // Quiet C++ warnings. switch (kind) { case MemBarrierKind::kAnyStore: case MemBarrierKind::kLoadAny: @@ -3879,11 +3926,11 @@ void LocationsBuilderARM::HandleFieldSet(HInstruction* instruction, const FieldI locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too. 
locations->AddTemp(Location::RequiresRegister()); } else if (generate_volatile) { - // Arm encoding have some additional constraints for ldrexd/strexd: + // ARM encoding have some additional constraints for ldrexd/strexd: // - registers need to be consecutive // - the first register should be even but not R14. - // We don't test for Arm yet, and the assertion makes sure that we revisit this if we ever - // enable Arm encoding. + // We don't test for ARM yet, and the assertion makes sure that we + // revisit this if we ever enable ARM encoding. DCHECK_EQ(InstructionSet::kThumb2, codegen_->GetInstructionSet()); locations->AddTemp(Location::RequiresRegister()); @@ -3913,7 +3960,7 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction, CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); if (is_volatile) { - GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyStore); } switch (field_type) { @@ -4005,7 +4052,7 @@ void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction, } if (is_volatile) { - GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyAny); } } @@ -4039,14 +4086,18 @@ void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldI (overlap ? Location::kOutputOverlap : Location::kNoOutputOverlap)); } if (volatile_for_double) { - // Arm encoding have some additional constraints for ldrexd/strexd: + // ARM encoding have some additional constraints for ldrexd/strexd: // - registers need to be consecutive // - the first register should be even but not R14. - // We don't test for Arm yet, and the assertion makes sure that we revisit this if we ever - // enable Arm encoding. + // We don't test for ARM yet, and the assertion makes sure that we + // revisit this if we ever enable ARM encoding. DCHECK_EQ(InstructionSet::kThumb2, codegen_->GetInstructionSet()); locations->AddTemp(Location::RequiresRegister()); locations->AddTemp(Location::RequiresRegister()); + } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } } @@ -4105,33 +4156,52 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, uint32_t offset = field_info.GetFieldOffset().Uint32Value(); switch (field_type) { - case Primitive::kPrimBoolean: { + case Primitive::kPrimBoolean: __ LoadFromOffset(kLoadUnsignedByte, out.AsRegister<Register>(), base, offset); break; - } - case Primitive::kPrimByte: { + case Primitive::kPrimByte: __ LoadFromOffset(kLoadSignedByte, out.AsRegister<Register>(), base, offset); break; - } - case Primitive::kPrimShort: { + case Primitive::kPrimShort: __ LoadFromOffset(kLoadSignedHalfword, out.AsRegister<Register>(), base, offset); break; - } - case Primitive::kPrimChar: { + case Primitive::kPrimChar: __ LoadFromOffset(kLoadUnsignedHalfword, out.AsRegister<Register>(), base, offset); break; - } case Primitive::kPrimInt: - case Primitive::kPrimNot: { __ LoadFromOffset(kLoadWord, out.AsRegister<Register>(), base, offset); break; + + case Primitive::kPrimNot: { + // /* HeapReference<Object> */ out = *(base + offset) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp_loc = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier call. + codegen_->GenerateFieldLoadWithBakerReadBarrier( + instruction, out, base, offset, temp_loc, /* needs_null_check */ true); + if (is_volatile) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + } else { + __ LoadFromOffset(kLoadWord, out.AsRegister<Register>(), base, offset); + codegen_->MaybeRecordImplicitNullCheck(instruction); + if (is_volatile) { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out, out, base_loc, offset); + } + break; } - case Primitive::kPrimLong: { + case Primitive::kPrimLong: if (is_volatile && !atomic_ldrd_strd) { GenerateWideAtomicLoad(base, offset, out.AsRegisterPairLow<Register>(), @@ -4140,12 +4210,10 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), base, offset); } break; - } - case Primitive::kPrimFloat: { + case Primitive::kPrimFloat: __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), base, offset); break; - } case Primitive::kPrimDouble: { DRegister out_reg = FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()); @@ -4167,17 +4235,20 @@ void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, UNREACHABLE(); } - // Doubles are handled in the switch. - if (field_type != Primitive::kPrimDouble) { + if (field_type == Primitive::kPrimNot || field_type == Primitive::kPrimDouble) { + // Potential implicit null checks, in the case of reference or + // double fields, are handled in the previous switch statement. + } else { codegen_->MaybeRecordImplicitNullCheck(instruction); } if (is_volatile) { - GenerateMemoryBarrier(MemBarrierKind::kLoadAny); - } - - if (field_type == Primitive::kPrimNot) { - codegen_->MaybeGenerateReadBarrier(instruction, out, out, base_loc, offset); + if (field_type == Primitive::kPrimNot) { + // Memory barriers, in the case of references, are also handled + // in the previous switch statement. 
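Condensing the kPrimNot branch of HandleFieldGet above: the reference load now takes one of two shapes depending on the read-barrier configuration, and the implicit null check and kLoadAny barrier move inside the branch (a sketch only; the real calls are exactly as in the hunk):

    // /* HeapReference<Object> */ out = *(base + offset)
    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
      // Baker: load, gray-object check and implicit null check are folded
      // into the helper; only the volatile barrier is emitted separately.
      codegen_->GenerateFieldLoadWithBakerReadBarrier(
          instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
      if (is_volatile) {
        codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
      }
    } else {
      // Plain load, followed (for non-Baker barriers) by a slow-path read
      // barrier that also unpoisons the reference under heap poisoning.
      __ LoadFromOffset(kLoadWord, out.AsRegister<Register>(), base, offset);
      codegen_->MaybeRecordImplicitNullCheck(instruction);
      if (is_volatile) {
        codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
      }
      codegen_->MaybeGenerateReadBarrierSlow(instruction, out, out, base_loc, offset);
    }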
+ } else { + codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } } } @@ -4340,6 +4411,11 @@ void LocationsBuilderARM::VisitArrayGet(HArrayGet* instruction) { Location::RequiresRegister(), object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap); } + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier. + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->AddTemp(Location::RequiresRegister()); + } } void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { @@ -4347,12 +4423,13 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Location obj_loc = locations->InAt(0); Register obj = obj_loc.AsRegister<Register>(); Location index = locations->InAt(1); - Primitive::Type type = instruction->GetType(); + Location out_loc = locations->Out(); + Primitive::Type type = instruction->GetType(); switch (type) { case Primitive::kPrimBoolean: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value(); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -4366,7 +4443,7 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { case Primitive::kPrimByte: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(int8_t)).Uint32Value(); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -4380,7 +4457,7 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { case Primitive::kPrimShort: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(int16_t)).Uint32Value(); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; @@ -4394,7 +4471,7 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { case Primitive::kPrimChar: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value(); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; @@ -4406,13 +4483,9 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { break; } - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - static_assert( - sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::HeapReference<mirror::Object> and int32_t have different sizes."); + case Primitive::kPrimInt: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -4424,44 +4497,79 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { break; } + case Primitive::kPrimNot: { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + 
"art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + // /* HeapReference<Object> */ out = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier call. + codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true); + } else { + Register out = out_loc.AsRegister<Register>(); + if (index.IsConstant()) { + size_t offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; + __ LoadFromOffset(kLoadWord, out, obj, offset); + codegen_->MaybeRecordImplicitNullCheck(instruction); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); + } else { + __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); + __ LoadFromOffset(kLoadWord, out, IP, data_offset); + codegen_->MaybeRecordImplicitNullCheck(instruction); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow( + instruction, out_loc, out_loc, obj_loc, data_offset, index); + } + } + break; + } + case Primitive::kPrimLong: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value(); - Location out = locations->Out(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), obj, offset); + __ LoadFromOffset(kLoadWordPair, out_loc.AsRegisterPairLow<Register>(), obj, offset); } else { __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8)); - __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), IP, data_offset); + __ LoadFromOffset(kLoadWordPair, out_loc.AsRegisterPairLow<Register>(), IP, data_offset); } break; } case Primitive::kPrimFloat: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value(); - Location out = locations->Out(); - DCHECK(out.IsFpuRegister()); + SRegister out = out_loc.AsFpuRegister<SRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), obj, offset); + __ LoadSFromOffset(out, obj, offset); } else { __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); - __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), IP, data_offset); + __ LoadSFromOffset(out, IP, data_offset); } break; } case Primitive::kPrimDouble: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value(); - Location out = locations->Out(); - DCHECK(out.IsFpuRegisterPair()); + SRegister out = out_loc.AsFpuRegisterPairLow<SRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ LoadDFromOffset(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()), obj, offset); + __ 
LoadDFromOffset(FromLowSToD(out), obj, offset); } else { __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8)); - __ LoadDFromOffset(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()), IP, data_offset); + __ LoadDFromOffset(FromLowSToD(out), IP, data_offset); } break; } @@ -4470,20 +4578,12 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { LOG(FATAL) << "Unreachable type " << type; UNREACHABLE(); } - codegen_->MaybeRecordImplicitNullCheck(instruction); if (type == Primitive::kPrimNot) { - static_assert( - sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); - uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - Location out = locations->Out(); - if (index.IsConstant()) { - uint32_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset); - } else { - codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, data_offset, index); - } + // Potential implicit null checks, in the case of reference + // arrays, are handled in the previous switch statement. + } else { + codegen_->MaybeRecordImplicitNullCheck(instruction); } } @@ -4574,6 +4674,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); __ StoreToOffset(kStoreWord, source, IP, data_offset); } + codegen_->MaybeRecordImplicitNullCheck(instruction); DCHECK(!needs_write_barrier); DCHECK(!may_need_runtime_call_for_type_check); break; @@ -4615,12 +4716,12 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { // __ Mov(temp2, temp1); // // /* HeapReference<Class> */ temp1 = temp1->component_type_ // __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); - // codegen_->GenerateReadBarrier( + // codegen_->GenerateReadBarrierSlow( // instruction, temp1_loc, temp1_loc, temp2_loc, component_offset); // // // /* HeapReference<Class> */ temp2 = value->klass_ // __ LoadFromOffset(kLoadWord, temp2, value, class_offset); - // codegen_->GenerateReadBarrier( + // codegen_->GenerateReadBarrierSlow( // instruction, temp2_loc, temp2_loc, value_loc, class_offset, temp1_loc); // // __ cmp(temp1, ShifterOperand(temp2)); @@ -4717,8 +4818,6 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { __ add(IP, array, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); __ StoreToOffset(kStoreWord, value, IP, data_offset); } - - codegen_->MaybeRecordImplicitNullCheck(instruction); break; } @@ -4770,8 +4869,8 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { UNREACHABLE(); } - // Ints and objects are handled in the switch. - if (value_type != Primitive::kPrimInt && value_type != Primitive::kPrimNot) { + // Objects are handled in the switch. 
+ if (value_type != Primitive::kPrimNot) { codegen_->MaybeRecordImplicitNullCheck(instruction); } } @@ -5140,16 +5239,9 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) { if (cls->IsReferrersClass()) { DCHECK(!cls->CanCallRuntime()); DCHECK(!cls->MustGenerateClinitCheck()); - uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value(); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_) - __ AddConstant(out, current_method, declaring_class_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset); - } + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad( + cls, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); } else { // /* GcRoot<mirror::Class>[] */ out = // current_method.ptr_sized_fields_->dex_cache_resolved_types_ @@ -5157,17 +5249,8 @@ void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) { out, current_method, ArtMethod::DexCacheResolvedTypesOffset(kArmPointerSize).Int32Value()); - - size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex()); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &out[type_index] - __ AddConstant(out, out, cache_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = out[type_index] - __ LoadFromOffset(kLoadWord, out, out, cache_offset); - } + // /* GcRoot<mirror::Class> */ out = out[type_index] + GenerateGcRootFieldLoad(cls, out_loc, out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())); if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) { DCHECK(cls->CanCallRuntime()); @@ -5230,30 +5313,14 @@ void InstructionCodeGeneratorARM::VisitLoadString(HLoadString* load) { Register out = out_loc.AsRegister<Register>(); Register current_method = locations->InAt(0).AsRegister<Register>(); - uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value(); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_) - __ AddConstant(out, current_method, declaring_class_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - __ LoadFromOffset(kLoadWord, out, current_method, declaring_class_offset); - } - + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad( + load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value()); - - size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex()); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::String>* */ out = &out[string_index] - __ AddConstant(out, out, cache_offset); - // /* mirror::String* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc); - } else { - // /* GcRoot<mirror::String> */ out = out[string_index] - __ LoadFromOffset(kLoadWord, out, out, cache_offset); - } + // /* GcRoot<mirror::String> 
*/ out = out[string_index] + GenerateGcRootFieldLoad( + load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex())); if (!load->IsInDexCache()) { SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load); @@ -5300,6 +5367,14 @@ void InstructionCodeGeneratorARM::VisitThrow(HThrow* instruction) { CheckEntrypointTypes<kQuickDeliverException, void, mirror::Object*>(); } +static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) { + return kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck); +} + void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); @@ -5326,21 +5401,22 @@ void LocationsBuilderARM::VisitInstanceOf(HInstanceOf* instruction) { locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); // When read barriers are enabled, we need a temporary register for // some cases. - if (kEmitCompilerReadBarrier && - (type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + if (TypeCheckNeedsATemporary(type_check_kind)) { locations->AddTemp(Location::RequiresRegister()); } } void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); Location obj_loc = locations->InAt(0); Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); Location out_loc = locations->Out(); Register out = out_loc.AsRegister<Register>(); + Location temp_loc = TypeCheckNeedsATemporary(type_check_kind) ? + locations->GetTemp(0) : + Location::NoLocation(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -5355,10 +5431,9 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { } // /* HeapReference<Class> */ out = obj->klass_ - __ LoadFromOffset(kLoadWord, out, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, temp_loc); - switch (instruction->GetTypeCheckKind()) { + switch (type_check_kind) { case TypeCheckKind::kExactCheck: { __ cmp(out, ShifterOperand(cls)); // Classes must be equal for the instanceof to succeed. @@ -5373,17 +5448,8 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { // object to avoid doing a comparison we know will fail. Label loop; __ Bind(&loop); - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. 
- Register temp = temp_loc.AsRegister<Register>(); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadWord, out, out, super_offset); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, temp_loc); // If `out` is null, we use it for the result, and jump to `done`. __ CompareAndBranchIfZero(out, &done); __ cmp(out, ShifterOperand(cls)); @@ -5401,17 +5467,8 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { __ Bind(&loop); __ cmp(out, ShifterOperand(cls)); __ b(&success, EQ); - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp = temp_loc.AsRegister<Register>(); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->super_class_ - __ LoadFromOffset(kLoadWord, out, out, super_offset); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, temp_loc); __ CompareAndBranchIfNonZero(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. __ b(&done); @@ -5429,17 +5486,8 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { __ cmp(out, ShifterOperand(cls)); __ b(&exact_check, EQ); // Otherwise, we need to check that the object's class is a non-primitive array. - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp = temp_loc.AsRegister<Register>(); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->component_type_ - __ LoadFromOffset(kLoadWord, out, out, component_offset); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, temp_loc); // If `out` is null, we use it for the result, and jump to `done`. __ CompareAndBranchIfZero(out, &done); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); @@ -5478,6 +5526,13 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { // HInstanceOf instruction (following the runtime calling // convention), which might be cluttered by the potential first // read barrier emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. 
DCHECK(locations->OnlyCallsOnSlowPath()); slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction, /* is_fatal */ false); @@ -5532,27 +5587,27 @@ void LocationsBuilderARM::VisitCheckCast(HCheckCast* instruction) { locations->AddTemp(Location::RequiresRegister()); // When read barriers are enabled, we need an additional temporary // register for some cases. - if (kEmitCompilerReadBarrier && - (type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + if (TypeCheckNeedsATemporary(type_check_kind)) { locations->AddTemp(Location::RequiresRegister()); } } void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); Location obj_loc = locations->InAt(0); Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); Location temp_loc = locations->GetTemp(0); Register temp = temp_loc.AsRegister<Register>(); + Location temp2_loc = TypeCheckNeedsATemporary(type_check_kind) ? + locations->GetTemp(1) : + Location::NoLocation(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); bool is_type_check_slow_path_fatal = (type_check_kind == TypeCheckKind::kExactCheck || type_check_kind == TypeCheckKind::kAbstractClassCheck || @@ -5571,8 +5626,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { } // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, temp2_loc); switch (type_check_kind) { case TypeCheckKind::kExactCheck: @@ -5589,18 +5643,8 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // object to avoid doing a comparison we know will fail. Label loop, compare_classes; __ Bind(&loop); - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = temp2_loc.AsRegister<Register>(); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadWord, temp, temp, super_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, temp2_loc); // If the class reference currently in `temp` is not null, jump // to the `compare_classes` label to compare it with the checked @@ -5612,8 +5656,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. 
// /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, temp2_loc); __ b(type_check_slow_path->GetEntryLabel()); __ Bind(&compare_classes); @@ -5629,18 +5672,8 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { __ cmp(temp, ShifterOperand(cls)); __ b(&done, EQ); - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = temp2_loc.AsRegister<Register>(); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->super_class_ - __ LoadFromOffset(kLoadWord, temp, temp, super_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, temp2_loc); // If the class reference currently in `temp` is not null, jump // back at the beginning of the loop. @@ -5651,8 +5684,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, temp2_loc); __ b(type_check_slow_path->GetEntryLabel()); break; } @@ -5664,19 +5696,8 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { __ b(&done, EQ); // Otherwise, we need to check that the object's class is a non-primitive array. - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = temp2_loc.AsRegister<Register>(); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->component_type_ - __ LoadFromOffset(kLoadWord, temp, temp, component_offset); - codegen_->MaybeGenerateReadBarrier( - instruction, temp_loc, temp_loc, temp2_loc, component_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, temp2_loc); // If the component type is not null (i.e. the object is indeed // an array), jump to label `check_non_primitive_component_type` @@ -5689,8 +5710,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. 
// /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, temp2_loc); __ b(type_check_slow_path->GetEntryLabel()); __ Bind(&check_non_primitive_component_type); @@ -5699,8 +5719,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { __ CompareAndBranchIfZero(temp, &done); // Same comment as above regarding `temp` and the slow path. // /* HeapReference<Class> */ temp = obj->klass_ - __ LoadFromOffset(kLoadWord, temp, obj, class_offset); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, temp2_loc); __ b(type_check_slow_path->GetEntryLabel()); break; } @@ -5717,6 +5736,13 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // instruction (following the runtime calling convention), which // might be cluttered by the potential first read barrier // emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. __ b(type_check_slow_path->GetEntryLabel()); break; } @@ -5901,14 +5927,249 @@ void InstructionCodeGeneratorARM::HandleBitwiseOperation(HBinaryOperation* instr } } -void CodeGeneratorARM::GenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index) { +void InstructionCodeGeneratorARM::GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location temp) { + Register out_reg = out.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier( + instruction, out, out_reg, offset, temp, /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `temp` before overwriting it + // in the following move operation, as we will need it for the + // read barrier below. + __ Mov(temp.AsRegister<Register>(), out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, temp, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorARM::GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location temp) { + Register out_reg = out.AsRegister<Register>(); + Register obj_reg = obj.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. 
+ // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier( + instruction, out, obj_reg, offset, temp, /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset) { + Register root_reg = root.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barriers are used: + // + // root = obj.field; + // if (Thread::Current()->GetIsGcMarking()) { + // root = ReadBarrier::Mark(root) + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path used to mark the GC root `root`. + SlowPathCode* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root, root); + codegen_->AddSlowPath(slow_path); + + __ LoadFromOffset( + kLoadWord, IP, TR, Thread::IsGcMarkingOffset<kArmWordSize>().Int32Value()); + __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ AddConstant(root_reg, obj, offset); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } + } else { + // Plain GC root load with no read barrier. + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + // Note that GC roots are not affected by heap poisoning, thus we + // do not have to unpoison `root_reg` here.
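The pseudocode in the comment above corresponds to the following C++-level model (a sketch only; `Object` and `Mark` are illustrative stand-ins, not ART APIs):

#include <cstdint>

struct Object;               // stand-in for mirror::Object
Object* Mark(Object* ref);   // stand-in for the ReadBarrier::Mark entry point

// Baker fast path for a GC root load: do the plain load, then call out
// to the mark slow path only while the GC is marking.
Object* LoadGcRoot(Object** root_addr, bool is_gc_marking) {
  Object* root = *root_addr;  // plain root load
  if (is_gc_marking) {        // Thread::Current()->GetIsGcMarking()
    root = Mark(root);        // ReadBarrierMarkSlowPathARM
  }
  return root;
}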
+ } +} + +void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + GenerateReferenceLoadWithBakerReadBarrier( + instruction, ref, obj, offset, no_index, temp, needs_null_check); +} + +void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + GenerateReferenceLoadWithBakerReadBarrier( + instruction, ref, obj, data_offset, index, temp, needs_null_check); +} + +void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as: + // - it implements the load-load fence using a data dependency on + // the high-bits of rb_state, which are expected to be all zeroes; + // - it performs additional checks that we do not do here for + // performance reasons. + + Register ref_reg = ref.AsRegister<Register>(); + Register temp_reg = temp.AsRegister<Register>(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + // /* uint32_t */ rb_state = lock_word.ReadBarrierState() + __ Lsr(temp_reg, temp_reg, LockWord::kReadBarrierStateShift); + __ and_(temp_reg, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); + static_assert( + LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, + "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); + + // Introduce a dependency on the high bits of rb_state, which shall + // be all zeroes, to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). 
+ // IP = rb_state & ~LockWord::kReadBarrierStateMask = 0 + __ bic(IP, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask)); + // obj is unchanged by this operation, but its value now depends on + // IP, which depends on temp_reg. + __ add(obj, obj, ShifterOperand(IP)); + + // The actual reference load. + if (index.IsValid()) { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ ref = + // *(obj + offset + index * sizeof(HeapReference<Object>)) + if (index.IsConstant()) { + size_t computed_offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + offset; + __ LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset); + } else { + __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4)); + __ LoadFromOffset(kLoadWord, ref_reg, IP, offset); + } + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + __ LoadFromOffset(kLoadWord, ref_reg, obj, offset); + } + + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); + + // Slow path used to mark the object `ref` when it is gray. + SlowPathCode* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref, ref); + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::gray_ptr_) + // ref = ReadBarrier::Mark(ref); + __ cmp(temp_reg, ShifterOperand(ReadBarrier::gray_ptr_)); + __ b(slow_path->GetEntryLabel(), EQ); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { DCHECK(kEmitCompilerReadBarrier); + // Insert a slow path based read barrier *after* the reference load. + // // If heap poisoning is enabled, the unpoisoning of the loaded // reference will be carried out by the runtime within the slow // path. @@ -5922,57 +6183,41 @@ void CodeGeneratorARM::GenerateReadBarrier(HInstruction* instruction, ReadBarrierForHeapReferenceSlowPathARM(instruction, out, ref, obj, offset, index); AddSlowPath(slow_path); - // TODO: When read barrier has a fast path, add it here. - /* Currently the read barrier call is inserted after the original load. - * However, if we have a fast path, we need to perform the load of obj.LockWord *before* the - * original load. This load-load ordering is required by the read barrier. - * The fast path/slow path (for Baker's algorithm) should look like: - * - * bool isGray = obj.LockWord & kReadBarrierMask; - * lfence; // load fence or artificial data dependence to prevent load-load reordering - * ref = obj.field; // this is the original load - * if (isGray) { - * ref = Mark(ref); // ideally the slow path just does Mark(ref) - * } - */ - __ b(slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } -void CodeGeneratorARM::MaybeGenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index) { +void CodeGeneratorARM::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier). 
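Taken together, the sequence above behaves like this C++ model (a sketch with illustrative names; the lock word is passed in here, whereas the emitted code loads it from obj->monitor_ first, which is what establishes the required load-load order):

#include <cstdint>

struct Object;               // stand-in for mirror::Object
Object* Mark(Object* ref);   // stand-in for the mark runtime entry point

Object* BakerReferenceLoad(char* obj, uint32_t offset, uint32_t monitor,
                           uint32_t shift, uint32_t mask, uint32_t gray_state) {
  uint32_t rb_state = (monitor >> shift) & mask;  // Lsr + and_
  uintptr_t dep = rb_state & ~mask;               // bic: always 0...
  obj += dep;                                     // ...but add makes the load depend on it
  Object* ref = *reinterpret_cast<Object**>(obj + offset);  // original reference load
  if (rb_state == gray_state) {                   // cmp + b(EQ) into the slow path
    ref = Mark(ref);
  }
  return ref;
}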
+ DCHECK(!kUseBakerReadBarrier); // If heap poisoning is enabled, unpoisoning will be taken care of // by the runtime within the slow path. - GenerateReadBarrier(instruction, out, ref, obj, offset, index); + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); } else if (kPoisonHeapReferences) { __ UnpoisonHeapReference(out.AsRegister<Register>()); } } -void CodeGeneratorARM::GenerateReadBarrierForRoot(HInstruction* instruction, - Location out, - Location root) { +void CodeGeneratorARM::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { DCHECK(kEmitCompilerReadBarrier); + // Insert a slow path based read barrier *after* the GC root load. + // // Note that GC roots are not affected by heap poisoning, so we do // not need to do anything special for this here. SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARM(instruction, out, root); AddSlowPath(slow_path); - // TODO: Implement a fast path for ReadBarrierForRoot, performing - // the following operation (for Baker's algorithm): - // - // if (thread.tls32_.is_gc_marking) { - // root = Mark(root); - // } - __ b(slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } @@ -6304,7 +6549,7 @@ void InstructionCodeGeneratorARM::VisitPackedSwitch(HPackedSwitch* switch_instr) } if (num_entries - last_index == 2) { // The last missing case_value. - GenerateCompareWithImmediate(temp_reg, 1); + __ CmpConstant(temp_reg, 1); __ b(codegen_->GetLabelOf(successors[last_index + 1]), EQ); } @@ -6364,7 +6609,7 @@ void InstructionCodeGeneratorARM::VisitArmDexCacheArraysBase(HArmDexCacheArraysB void CodeGeneratorARM::MoveFromReturnRegister(Location trg, Primitive::Type type) { if (!trg.IsValid()) { - DCHECK(type == Primitive::kPrimVoid); + DCHECK_EQ(type, Primitive::kPrimVoid); return; } diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index b7c58e1248..26d6d63b31 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -188,7 +188,7 @@ class LocationsBuilderARM : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM); }; -class InstructionCodeGeneratorARM : public HGraphVisitor { +class InstructionCodeGeneratorARM : public InstructionCodeGenerator { public: InstructionCodeGeneratorARM(HGraph* graph, CodeGeneratorARM* codegen); @@ -222,24 +222,57 @@ class InstructionCodeGeneratorARM : public HGraphVisitor { void HandleLongRotate(LocationSummary* locations); void HandleRotate(HRor* ror); void HandleShift(HBinaryOperation* operation); - void GenerateMemoryBarrier(MemBarrierKind kind); + void GenerateWideAtomicStore(Register addr, uint32_t offset, Register value_lo, Register value_hi, Register temp1, Register temp2, HInstruction* instruction); void GenerateWideAtomicLoad(Register addr, uint32_t offset, Register out_lo, Register out_hi); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info, bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // Register `temp` is used when generating a read barrier. 
+ void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location temp); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // Register `temp` is used when generating a Baker's read barrier. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location temp); + // Generate a GC root reference load: + // + // root <- *(obj + offset) + // + // while honoring read barriers (if any). + void GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset); + void GenerateImplicitNullCheck(HNullCheck* instruction); void GenerateExplicitNullCheck(HNullCheck* instruction); void GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, Label* true_target, Label* false_target); - void GenerateCompareWithImmediate(Register left, int32_t right); void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); @@ -346,6 +379,8 @@ class CodeGeneratorARM : public CodeGenerator { // Emit a write barrier. void MarkGCCard(Register temp, Register card, Register object, Register value, bool can_be_null); + void GenerateMemoryBarrier(MemBarrierKind kind); + Label* GetLabelOf(HBasicBlock* block) const { return CommonGetLabelOf<Label>(block_labels_, block); } @@ -406,7 +441,26 @@ class CodeGeneratorARM : public CodeGenerator { return &it->second; } - // Generate a read barrier for a heap reference within `instruction`. + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location out, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location out, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. // // A read barrier for an object reference read from the heap is // implemented as a call to the artReadBarrierSlow runtime entry @@ -423,23 +477,25 @@ class CodeGeneratorARM : public CodeGenerator { // When `index` is provided (i.e. for array accesses), the offset // value passed to artReadBarrierSlow is adjusted to take `index` // into account. - void GenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index = Location::NoLocation()); - - // If read barriers are enabled, generate a read barrier for a heap reference. - // If heap poisoning is enabled, also unpoison the reference in `out`. - void MaybeGenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index = Location::NoLocation()); - - // Generate a read barrier for a GC root within `instruction`. 
+ void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. // // A read barrier for an object reference GC root is implemented as // a call to the artReadBarrierForRootSlow runtime entry point, @@ -449,9 +505,19 @@ class CodeGeneratorARM : public CodeGenerator { // // The `out` location contains the value returned by // artReadBarrierForRootSlow. - void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root); + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); private: + // Factored implementation of GenerateFieldLoadWithBakerReadBarrier + // and GenerateArrayLoadWithBakerReadBarrier. + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + Location temp, + bool needs_null_check); + Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); using MethodToLiteralMap = ArenaSafeMap<MethodReference, Literal*, MethodReferenceComparator>; diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index b49f42b6c8..a3150d3d22 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -477,24 +477,24 @@ class TypeCheckSlowPathARM64 : public SlowPathCodeARM64 { class DeoptimizationSlowPathARM64 : public SlowPathCodeARM64 { public: - explicit DeoptimizationSlowPathARM64(HInstruction* instruction) + explicit DeoptimizationSlowPathARM64(HDeoptimize* instruction) : instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); - DCHECK(instruction_->IsDeoptimize()); - HDeoptimize* deoptimize = instruction_->AsDeoptimize(); - uint32_t dex_pc = deoptimize->GetDexPc(); - CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this); + arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), + instruction_, + instruction_->GetDexPc(), + this); CheckEntrypointTypes<kQuickDeoptimize, void, void>(); } const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathARM64"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathARM64); }; @@ -1605,7 +1605,7 @@ void InstructionCodeGeneratorARM64::GenerateSuspendCheck(HSuspendCheck* instruct InstructionCodeGeneratorARM64::InstructionCodeGeneratorARM64(HGraph* graph, CodeGeneratorARM64* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -2534,8 +2534,7 @@ void InstructionCodeGeneratorARM64::DivRemByPowerOfTwo(HBinaryOperation* instruc Register out = 
OutputRegister(instruction); Register dividend = InputRegisterAt(instruction, 0); int64_t imm = Int64FromConstant(second.GetConstant()); - uint64_t abs_imm = static_cast<uint64_t>(std::abs(imm)); - DCHECK(IsPowerOfTwo(abs_imm)); + uint64_t abs_imm = static_cast<uint64_t>(AbsOrMin(imm)); int ctz_imm = CTZ(abs_imm); UseScratchRegisterScope temps(GetVIXLAssembler()); @@ -2627,7 +2626,7 @@ void InstructionCodeGeneratorARM64::GenerateDivRemIntegral(HBinaryOperation* ins // Do not generate anything. DivZeroCheck would prevent any code to be executed. } else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (IsPowerOfTwo(std::abs(imm))) { + } else if (IsPowerOfTwo(AbsOrMin(imm))) { DivRemByPowerOfTwo(instruction); } else { DCHECK(imm <= -2 || imm >= 2); @@ -2940,9 +2939,8 @@ void LocationsBuilderARM64::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorARM64::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) - DeoptimizationSlowPathARM64(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCodeARM64* slow_path = + deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathARM64>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -2954,6 +2952,10 @@ void LocationsBuilderARM64::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorARM64::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. + __ Nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 0e90ac6345..f2ff89488e 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -186,7 +186,7 @@ class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention { DISALLOW_COPY_AND_ASSIGN(FieldAccessCallingConventionARM64); }; -class InstructionCodeGeneratorARM64 : public HGraphVisitor { +class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { public: InstructionCodeGeneratorARM64(HGraph* graph, CodeGeneratorARM64* codegen); diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 4648606da8..322912976e 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -444,19 +444,16 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { class DeoptimizationSlowPathMIPS : public SlowPathCodeMIPS { public: - explicit DeoptimizationSlowPathMIPS(HInstruction* instruction) + explicit DeoptimizationSlowPathMIPS(HDeoptimize* instruction) : instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); - DCHECK(instruction_->IsDeoptimize()); - HDeoptimize* deoptimize = instruction_->AsDeoptimize(); - uint32_t dex_pc = deoptimize->GetDexPc(); - CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, - dex_pc, + instruction_->GetDexPc(), this, IsDirectEntrypoint(kQuickDeoptimize)); CheckEntrypointTypes<kQuickDeoptimize, void, void>(); @@ -465,7 +462,7 @@ class DeoptimizationSlowPathMIPS : public SlowPathCodeMIPS 
{ const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS); }; @@ -608,9 +605,9 @@ void ParallelMoveResolverMIPS::EmitSwap(size_t index) { // then swap the high 32 bits of the same FPR. mtc1 makes the high 32 bits of an FPR // unpredictable and the following mfch1 will fail. __ Mfc1(TMP, f1); - __ Mfhc1(AT, f1); + __ MoveFromFpuHigh(AT, f1); __ Mtc1(r2_l, f1); - __ Mthc1(r2_h, f1); + __ MoveToFpuHigh(r2_h, f1); __ Move(r2_l, TMP); __ Move(r2_h, AT); } else if (loc1.IsStackSlot() && loc2.IsStackSlot()) { @@ -862,7 +859,7 @@ void CodeGeneratorMIPS::Move64(Location destination, Location source) { Register dst_low = destination.AsRegisterPairLow<Register>(); FRegister src = source.AsFpuRegister<FRegister>(); __ Mfc1(dst_low, src); - __ Mfhc1(dst_high, src); + __ MoveFromFpuHigh(dst_high, src); } else { DCHECK(source.IsDoubleStackSlot()) << "Cannot move from " << source << " to " << destination; int32_t off = source.GetStackIndex(); @@ -875,7 +872,7 @@ void CodeGeneratorMIPS::Move64(Location destination, Location source) { Register src_high = source.AsRegisterPairHigh<Register>(); Register src_low = source.AsRegisterPairLow<Register>(); __ Mtc1(src_low, dst); - __ Mthc1(src_high, dst); + __ MoveToFpuHigh(src_high, dst); } else if (source.IsFpuRegister()) { __ MovD(destination.AsFpuRegister<FRegister>(), source.AsFpuRegister<FRegister>()); } else { @@ -1241,7 +1238,7 @@ void InstructionCodeGeneratorMIPS::GenerateSuspendCheck(HSuspendCheck* instructi InstructionCodeGeneratorMIPS::InstructionCodeGeneratorMIPS(HGraph* graph, CodeGeneratorMIPS* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -1511,7 +1508,7 @@ void InstructionCodeGeneratorMIPS::HandleBinaryOp(HBinaryOperation* instruction) } void LocationsBuilderMIPS::HandleShift(HBinaryOperation* instr) { - DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr()); + DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr() || instr->IsRor()); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); Primitive::Type type = instr->GetResultType(); @@ -1534,7 +1531,7 @@ void LocationsBuilderMIPS::HandleShift(HBinaryOperation* instr) { static constexpr size_t kMipsBitsPerWord = kMipsWordSize * kBitsPerByte; void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { - DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr()); + DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr() || instr->IsRor()); LocationSummary* locations = instr->GetLocations(); Primitive::Type type = instr->GetType(); @@ -1542,30 +1539,58 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { bool use_imm = rhs_location.IsConstant(); Register rhs_reg = use_imm ? ZERO : rhs_location.AsRegister<Register>(); int64_t rhs_imm = use_imm ? CodeGenerator::GetInt64ValueOf(rhs_location.GetConstant()) : 0; - uint32_t shift_mask = (type == Primitive::kPrimInt) ? kMaxIntShiftValue : kMaxLongShiftValue; - uint32_t shift_value = rhs_imm & shift_mask; - // Is the INS (Insert Bit Field) instruction supported? - bool has_ins = codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2(); + const uint32_t shift_mask = (type == Primitive::kPrimInt) + ? 
kMaxIntShiftValue + : kMaxLongShiftValue; + const uint32_t shift_value = rhs_imm & shift_mask; + // Are the INS (Insert Bit Field) and ROTR instructions supported? + bool has_ins_rotr = codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2(); switch (type) { case Primitive::kPrimInt: { Register dst = locations->Out().AsRegister<Register>(); Register lhs = locations->InAt(0).AsRegister<Register>(); if (use_imm) { - if (instr->IsShl()) { + if (shift_value == 0) { + if (dst != lhs) { + __ Move(dst, lhs); + } + } else if (instr->IsShl()) { __ Sll(dst, lhs, shift_value); } else if (instr->IsShr()) { __ Sra(dst, lhs, shift_value); - } else { + } else if (instr->IsUShr()) { __ Srl(dst, lhs, shift_value); + } else { + if (has_ins_rotr) { + __ Rotr(dst, lhs, shift_value); + } else { + __ Sll(TMP, lhs, (kMipsBitsPerWord - shift_value) & shift_mask); + __ Srl(dst, lhs, shift_value); + __ Or(dst, dst, TMP); + } } } else { if (instr->IsShl()) { __ Sllv(dst, lhs, rhs_reg); } else if (instr->IsShr()) { __ Srav(dst, lhs, rhs_reg); - } else { + } else if (instr->IsUShr()) { __ Srlv(dst, lhs, rhs_reg); + } else { + if (has_ins_rotr) { + __ Rotrv(dst, lhs, rhs_reg); + } else { + __ Subu(TMP, ZERO, rhs_reg); + // 32-bit shift instructions use the 5 least significant bits of the shift count, so + // shifting by `-rhs_reg` is equivalent to shifting by `(32 - rhs_reg) & 31`. The case + // when `rhs_reg & 31 == 0` is OK even though we don't shift `lhs` left all the way out + // by 32, because the result in this case is computed as `(lhs >> 0) | (lhs << 0)`, + // IOW, the OR'd values are equal. + __ Sllv(TMP, lhs, TMP); + __ Srlv(dst, lhs, rhs_reg); + __ Or(dst, dst, TMP); + } } } break; @@ -1580,7 +1605,7 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { if (shift_value == 0) { codegen_->Move64(locations->Out(), locations->InAt(0)); } else if (shift_value < kMipsBitsPerWord) { - if (has_ins) { + if (has_ins_rotr) { if (instr->IsShl()) { __ Srl(dst_high, lhs_low, kMipsBitsPerWord - shift_value); __ Ins(dst_high, lhs_high, shift_value, kMipsBitsPerWord - shift_value); @@ -1589,10 +1614,15 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { __ Srl(dst_low, lhs_low, shift_value); __ Ins(dst_low, lhs_high, kMipsBitsPerWord - shift_value, shift_value); __ Sra(dst_high, lhs_high, shift_value); + } else if (instr->IsUShr()) { + __ Srl(dst_low, lhs_low, shift_value); + __ Ins(dst_low, lhs_high, kMipsBitsPerWord - shift_value, shift_value); + __ Srl(dst_high, lhs_high, shift_value); } else { __ Srl(dst_low, lhs_low, shift_value); __ Ins(dst_low, lhs_high, kMipsBitsPerWord - shift_value, shift_value); __ Srl(dst_high, lhs_high, shift_value); + __ Ins(dst_high, lhs_low, kMipsBitsPerWord - shift_value, shift_value); } } else { if (instr->IsShl()) { @@ -1605,24 +1635,51 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { __ Sll(TMP, lhs_high, kMipsBitsPerWord - shift_value); __ Srl(dst_low, lhs_low, shift_value); __ Or(dst_low, dst_low, TMP); - } else { + } else if (instr->IsUShr()) { __ Srl(dst_high, lhs_high, shift_value); __ Sll(TMP, lhs_high, kMipsBitsPerWord - shift_value); __ Srl(dst_low, lhs_low, shift_value); __ Or(dst_low, dst_low, TMP); + } else { + __ Srl(TMP, lhs_low, shift_value); + __ Sll(dst_low, lhs_high, kMipsBitsPerWord - shift_value); + __ Or(dst_low, dst_low, TMP); + __ Srl(TMP, lhs_high, shift_value); + __ Sll(dst_high, lhs_low, kMipsBitsPerWord - shift_value); + __ Or(dst_high, dst_high, TMP); } } } else { - 
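The rotate emulation above relies on a standard shift identity; as a self-contained C++ sketch:

#include <cstdint>

// 32-bit rotate right built from shifts and OR, mirroring the
// Subu/Sllv/Srlv/Or sequence above. MIPS shift instructions use only the
// low 5 bits of the count, so the n == 0 case degenerates to
// (x >> 0) | (x << 0) == x, exactly as the comment above argues.
uint32_t RotateRight32(uint32_t x, uint32_t n) {
  n &= 31;
  return (x >> n) | (x << ((32 - n) & 31));
}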
shift_value -= kMipsBitsPerWord; + const uint32_t shift_value_high = shift_value - kMipsBitsPerWord; if (instr->IsShl()) { - __ Sll(dst_high, lhs_low, shift_value); + __ Sll(dst_high, lhs_low, shift_value_high); __ Move(dst_low, ZERO); } else if (instr->IsShr()) { - __ Sra(dst_low, lhs_high, shift_value); + __ Sra(dst_low, lhs_high, shift_value_high); __ Sra(dst_high, dst_low, kMipsBitsPerWord - 1); - } else { - __ Srl(dst_low, lhs_high, shift_value); + } else if (instr->IsUShr()) { + __ Srl(dst_low, lhs_high, shift_value_high); __ Move(dst_high, ZERO); + } else { + if (shift_value == kMipsBitsPerWord) { + // 64-bit rotation by 32 is just a swap. + __ Move(dst_low, lhs_high); + __ Move(dst_high, lhs_low); + } else { + if (has_ins_rotr) { + __ Srl(dst_low, lhs_high, shift_value_high); + __ Ins(dst_low, lhs_low, kMipsBitsPerWord - shift_value_high, shift_value_high); + __ Srl(dst_high, lhs_low, shift_value_high); + __ Ins(dst_high, lhs_high, kMipsBitsPerWord - shift_value_high, shift_value_high); + } else { + __ Sll(TMP, lhs_low, kMipsBitsPerWord - shift_value_high); + __ Srl(dst_low, lhs_high, shift_value_high); + __ Or(dst_low, dst_low, TMP); + __ Sll(TMP, lhs_high, kMipsBitsPerWord - shift_value_high); + __ Srl(dst_high, lhs_low, shift_value_high); + __ Or(dst_high, dst_high, TMP); + } + } } } } else { @@ -1649,7 +1706,7 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { __ Beqz(TMP, &done); __ Move(dst_low, dst_high); __ Sra(dst_high, dst_high, 31); - } else { + } else if (instr->IsUShr()) { __ Srlv(dst_high, lhs_high, rhs_reg); __ Nor(AT, ZERO, rhs_reg); __ Sll(TMP, lhs_high, 1); @@ -1660,6 +1717,21 @@ void InstructionCodeGeneratorMIPS::HandleShift(HBinaryOperation* instr) { __ Beqz(TMP, &done); __ Move(dst_low, dst_high); __ Move(dst_high, ZERO); + } else { + __ Nor(AT, ZERO, rhs_reg); + __ Srlv(TMP, lhs_low, rhs_reg); + __ Sll(dst_low, lhs_high, 1); + __ Sllv(dst_low, dst_low, AT); + __ Or(dst_low, dst_low, TMP); + __ Srlv(TMP, lhs_high, rhs_reg); + __ Sll(dst_high, lhs_low, 1); + __ Sllv(dst_high, dst_high, AT); + __ Or(dst_high, dst_high, TMP); + __ Andi(TMP, rhs_reg, kMipsBitsPerWord); + __ Beqz(TMP, &done); + __ Move(TMP, dst_high); + __ Move(dst_high, dst_low); + __ Move(dst_low, TMP); } __ Bind(&done); } @@ -2314,8 +2386,7 @@ void InstructionCodeGeneratorMIPS::DivRemByPowerOfTwo(HBinaryOperation* instruct Register out = locations->Out().AsRegister<Register>(); Register dividend = locations->InAt(0).AsRegister<Register>(); int32_t imm = second.GetConstant()->AsIntConstant()->GetValue(); - uint32_t abs_imm = static_cast<uint32_t>(std::abs(imm)); - DCHECK(IsPowerOfTwo(abs_imm)); + uint32_t abs_imm = static_cast<uint32_t>(AbsOrMin(imm)); int ctz_imm = CTZ(abs_imm); if (instruction->IsDiv()) { @@ -2418,7 +2489,7 @@ void InstructionCodeGeneratorMIPS::GenerateDivRemIntegral(HBinaryOperation* inst // Do not generate anything. DivZeroCheck would prevent any code to be executed. 
} else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (IsPowerOfTwo(std::abs(imm))) { + } else if (IsPowerOfTwo(AbsOrMin(imm))) { DivRemByPowerOfTwo(instruction); } else { DCHECK(imm <= -2 || imm >= 2); @@ -3358,8 +3429,8 @@ void LocationsBuilderMIPS::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorMIPS::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) DeoptimizationSlowPathMIPS(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCodeMIPS* slow_path = + deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathMIPS>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -3371,6 +3442,10 @@ void LocationsBuilderMIPS::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorMIPS::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. + __ Nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } @@ -3457,8 +3532,8 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, // Need to move to FP regs since FP results are returned in core registers. __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), locations->Out().AsFpuRegister<FRegister>()); - __ Mthc1(locations->GetTemp(2).AsRegister<Register>(), - locations->Out().AsFpuRegister<FRegister>()); + __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + locations->Out().AsFpuRegister<FRegister>()); } } else { if (!Primitive::IsFloatingPointType(type)) { @@ -3578,8 +3653,8 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, // Pass FP parameters in core registers. 
__ Mfc1(locations->GetTemp(1).AsRegister<Register>(), locations->InAt(1).AsFpuRegister<FRegister>()); - __ Mfhc1(locations->GetTemp(2).AsRegister<Register>(), - locations->InAt(1).AsFpuRegister<FRegister>()); + __ MoveFromFpuHigh(locations->GetTemp(2).AsRegister<Register>(), + locations->InAt(1).AsFpuRegister<FRegister>()); } codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pA64Store), instruction, @@ -4536,14 +4611,12 @@ void InstructionCodeGeneratorMIPS::VisitReturnVoid(HReturnVoid* ret ATTRIBUTE_UN codegen_->GenerateFrameExit(); } -void LocationsBuilderMIPS::VisitRor(HRor* ror ATTRIBUTE_UNUSED) { - LOG(FATAL) << "Unreachable"; - UNREACHABLE(); +void LocationsBuilderMIPS::VisitRor(HRor* ror) { + HandleShift(ror); } -void InstructionCodeGeneratorMIPS::VisitRor(HRor* ror ATTRIBUTE_UNUSED) { - LOG(FATAL) << "Unreachable"; - UNREACHABLE(); +void InstructionCodeGeneratorMIPS::VisitRor(HRor* ror) { + HandleShift(ror); } void LocationsBuilderMIPS::VisitShl(HShl* shl) { @@ -4731,6 +4804,7 @@ void LocationsBuilderMIPS::VisitTypeConversion(HTypeConversion* conversion) { Primitive::Type input_type = conversion->GetInputType(); Primitive::Type result_type = conversion->GetResultType(); DCHECK_NE(input_type, result_type); + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); if ((input_type == Primitive::kPrimNot) || (input_type == Primitive::kPrimVoid) || (result_type == Primitive::kPrimNot) || (result_type == Primitive::kPrimVoid)) { @@ -4738,8 +4812,9 @@ void LocationsBuilderMIPS::VisitTypeConversion(HTypeConversion* conversion) { } LocationSummary::CallKind call_kind = LocationSummary::kNoCall; - if ((Primitive::IsFloatingPointType(result_type) && input_type == Primitive::kPrimLong) || - (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type))) { + if (!isR6 && + ((Primitive::IsFloatingPointType(result_type) && input_type == Primitive::kPrimLong) || + (result_type == Primitive::kPrimLong && Primitive::IsFloatingPointType(input_type)))) { call_kind = LocationSummary::kCall; } @@ -4777,6 +4852,8 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi Primitive::Type result_type = conversion->GetResultType(); Primitive::Type input_type = conversion->GetInputType(); bool has_sign_extension = codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2(); + bool isR6 = codegen_->GetInstructionSetFeatures().IsR6(); + bool fpu_32bit = codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint(); DCHECK_NE(input_type, result_type); @@ -4822,7 +4899,37 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi << " to " << result_type; } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsIntegralType(input_type)) { - if (input_type != Primitive::kPrimLong) { + if (input_type == Primitive::kPrimLong) { + if (isR6) { + // cvt.s.l/cvt.d.l requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary + // architecture on top of MIPS64R6, which has FR=1, and therefore can use the instruction. + Register src_high = locations->InAt(0).AsRegisterPairHigh<Register>(); + Register src_low = locations->InAt(0).AsRegisterPairLow<Register>(); + FRegister dst = locations->Out().AsFpuRegister<FRegister>(); + __ Mtc1(src_low, FTMP); + __ Mthc1(src_high, FTMP); + if (result_type == Primitive::kPrimFloat) { + __ Cvtsl(dst, FTMP); + } else { + __ Cvtdl(dst, FTMP); + } + } else { + int32_t entry_offset = (result_type == Primitive::kPrimFloat) ? 
QUICK_ENTRY_POINT(pL2f) + : QUICK_ENTRY_POINT(pL2d); + bool direct = (result_type == Primitive::kPrimFloat) ? IsDirectEntrypoint(kQuickL2f) + : IsDirectEntrypoint(kQuickL2d); + codegen_->InvokeRuntime(entry_offset, + conversion, + conversion->GetDexPc(), + nullptr, + direct); + if (result_type == Primitive::kPrimFloat) { + CheckEntrypointTypes<kQuickL2f, float, int64_t>(); + } else { + CheckEntrypointTypes<kQuickL2d, double, int64_t>(); + } + } + } else { Register src = locations->InAt(0).AsRegister<Register>(); FRegister dst = locations->Out().AsFpuRegister<FRegister>(); __ Mtc1(src, FTMP); @@ -4831,54 +4938,168 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } else { __ Cvtdw(dst, FTMP); } - } else { - int32_t entry_offset = (result_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pL2f) - : QUICK_ENTRY_POINT(pL2d); - bool direct = (result_type == Primitive::kPrimFloat) ? IsDirectEntrypoint(kQuickL2f) - : IsDirectEntrypoint(kQuickL2d); - codegen_->InvokeRuntime(entry_offset, - conversion, - conversion->GetDexPc(), - nullptr, - direct); - if (result_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickL2f, float, int64_t>(); - } else { - CheckEntrypointTypes<kQuickL2d, double, int64_t>(); - } } } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) { CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); - int32_t entry_offset; - bool direct; - if (result_type != Primitive::kPrimLong) { - entry_offset = (input_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pF2iz) - : QUICK_ENTRY_POINT(pD2iz); - direct = (result_type == Primitive::kPrimFloat) ? IsDirectEntrypoint(kQuickF2iz) - : IsDirectEntrypoint(kQuickD2iz); + if (result_type == Primitive::kPrimLong) { + if (isR6) { + // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary + // architecture on top of MIPS64R6, which has FR=1, and therefore can use the instruction. + FRegister src = locations->InAt(0).AsFpuRegister<FRegister>(); + Register dst_high = locations->Out().AsRegisterPairHigh<Register>(); + Register dst_low = locations->Out().AsRegisterPairLow<Register>(); + MipsLabel truncate; + MipsLabel done; + + // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive + // value when the input is either a NaN or is outside of the range of the output type + // after the truncation. IOW, the three special cases (NaN, too small, too big) produce + // the same result. + // + // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum + // value of the output type if the input is outside of the range after the truncation or + // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct + // results. This matches the desired float/double-to-int/long conversion exactly. + // + // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. + // + // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate + // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, + // even though it must be NAN2008=1 on R6. + // + // The code takes care of the different behaviors by first comparing the input to the + // minimum output value (-2**63 for truncating to long, -2**31 for truncating to int).
+ // If the input is greater than or equal to the minimum, it proceeds to the truncate + // instruction, which will handle such an input the same way irrespective of NAN2008. + // Otherwise the input is compared to itself to determine whether it is a NaN or not + // in order to return either zero or the minimum value. + // + // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the + // truncate instruction for MIPS64R6. + if (input_type == Primitive::kPrimFloat) { + uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); + __ CmpLeS(FTMP, FTMP, src); + } else { + uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()); + __ LoadConst32(TMP, High32Bits(min_val)); + __ Mtc1(ZERO, FTMP); + __ Mthc1(TMP, FTMP); + __ CmpLeD(FTMP, FTMP, src); + } + + __ Bc1nez(FTMP, &truncate); + + if (input_type == Primitive::kPrimFloat) { + __ CmpEqS(FTMP, src, src); + } else { + __ CmpEqD(FTMP, src, src); + } + __ Move(dst_low, ZERO); + __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min()); + __ Mfc1(TMP, FTMP); + __ And(dst_high, dst_high, TMP); + + __ B(&done); + + __ Bind(&truncate); + + if (input_type == Primitive::kPrimFloat) { + __ TruncLS(FTMP, src); + } else { + __ TruncLD(FTMP, src); + } + __ Mfc1(dst_low, FTMP); + __ Mfhc1(dst_high, FTMP); + + __ Bind(&done); + } else { + int32_t entry_offset = (input_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pF2l) + : QUICK_ENTRY_POINT(pD2l); + bool direct = (input_type == Primitive::kPrimFloat) ? IsDirectEntrypoint(kQuickF2l) + : IsDirectEntrypoint(kQuickD2l); + codegen_->InvokeRuntime(entry_offset, conversion, conversion->GetDexPc(), nullptr, direct); + if (input_type == Primitive::kPrimFloat) { + CheckEntrypointTypes<kQuickF2l, int64_t, float>(); + } else { + CheckEntrypointTypes<kQuickD2l, int64_t, double>(); + } + } } else { - entry_offset = (input_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pF2l) - : QUICK_ENTRY_POINT(pD2l); - direct = (result_type == Primitive::kPrimFloat) ? IsDirectEntrypoint(kQuickF2l) - : IsDirectEntrypoint(kQuickD2l); - } - codegen_->InvokeRuntime(entry_offset, - conversion, - conversion->GetDexPc(), - nullptr, - direct); - if (result_type != Primitive::kPrimLong) { + FRegister src = locations->InAt(0).AsFpuRegister<FRegister>(); + Register dst = locations->Out().AsRegister<Register>(); + MipsLabel truncate; + MipsLabel done; + + // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate + // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, + // even though it must be NAN2008=1 on R6. + // + // For details see the large comment above for the truncation of float/double to long on R6. + // + // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the + // truncate instruction for MIPS64R6.
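The intended end-to-end semantics of the compare-then-truncate scheme, modeled in C++ for the float-to-long case (a sketch of the behavior being implemented, not of the emitted instructions):

#include <cstdint>
#include <limits>

int64_t FloatToLong(float in) {
  const int64_t kMin = std::numeric_limits<int64_t>::min();
  const int64_t kMax = std::numeric_limits<int64_t>::max();
  // CmpLeS(FTMP, min, src) + Bc1nez: take the truncate path only when
  // in >= -2**63; a NaN fails the comparison and falls through.
  if (static_cast<float>(kMin) <= in) {
    // TruncLS: truncate toward zero; inputs beyond the positive range cap
    // at the maximum under both NAN2008 modes.
    return (in >= static_cast<float>(kMax)) ? kMax : static_cast<int64_t>(in);
  }
  // Otherwise the input is a NaN or below the minimum. CmpEqS(src, src)
  // yields all-zeroes for NaN and all-ones otherwise; ANDing that mask
  // with kMin produces 0 for NaN and kMin for out-of-range negatives.
  return (in == in) ? kMin : 0;
}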
if (input_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickF2iz, int32_t, float>(); + uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); } else { - CheckEntrypointTypes<kQuickD2iz, int32_t, double>(); + uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, High32Bits(min_val)); + __ Mtc1(ZERO, FTMP); + if (fpu_32bit) { + __ Mtc1(TMP, static_cast<FRegister>(FTMP + 1)); + } else { + __ Mthc1(TMP, FTMP); + } } - } else { + + if (isR6) { + if (input_type == Primitive::kPrimFloat) { + __ CmpLeS(FTMP, FTMP, src); + } else { + __ CmpLeD(FTMP, FTMP, src); + } + __ Bc1nez(FTMP, &truncate); + + if (input_type == Primitive::kPrimFloat) { + __ CmpEqS(FTMP, src, src); + } else { + __ CmpEqD(FTMP, src, src); + } + __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); + __ Mfc1(TMP, FTMP); + __ And(dst, dst, TMP); + } else { + if (input_type == Primitive::kPrimFloat) { + __ ColeS(0, FTMP, src); + } else { + __ ColeD(0, FTMP, src); + } + __ Bc1t(0, &truncate); + + if (input_type == Primitive::kPrimFloat) { + __ CeqS(0, src, src); + } else { + __ CeqD(0, src, src); + } + __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); + __ Movf(dst, ZERO, 0); + } + + __ B(&done); + + __ Bind(&truncate); + if (input_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickF2l, int64_t, float>(); + __ TruncWS(FTMP, src); } else { - CheckEntrypointTypes<kQuickD2l, int64_t, double>(); + __ TruncWD(FTMP, src); } + __ Mfc1(dst, FTMP); + + __ Bind(&done); } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 38302ad315..c3d4851ee9 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -197,7 +197,7 @@ class LocationsBuilderMIPS : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderMIPS); }; -class InstructionCodeGeneratorMIPS : public HGraphVisitor { +class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { public: InstructionCodeGeneratorMIPS(HGraph* graph, CodeGeneratorMIPS* codegen); diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 05834ff063..38c32cad06 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -391,24 +391,24 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { class DeoptimizationSlowPathMIPS64 : public SlowPathCodeMIPS64 { public: - explicit DeoptimizationSlowPathMIPS64(HInstruction* instruction) + explicit DeoptimizationSlowPathMIPS64(HDeoptimize* instruction) : instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); - DCHECK(instruction_->IsDeoptimize()); - HDeoptimize* deoptimize = instruction_->AsDeoptimize(); - uint32_t dex_pc = deoptimize->GetDexPc(); - CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); - mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), instruction_, dex_pc, this); + mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), + instruction_, + instruction_->GetDexPc(), + this); CheckEntrypointTypes<kQuickDeoptimize, void, void>(); 
} const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathMIPS64"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS64); }; @@ -1113,7 +1113,7 @@ void InstructionCodeGeneratorMIPS64::GenerateSuspendCheck(HSuspendCheck* instruc InstructionCodeGeneratorMIPS64::InstructionCodeGeneratorMIPS64(HGraph* graph, CodeGeneratorMIPS64* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -1247,7 +1247,7 @@ void InstructionCodeGeneratorMIPS64::HandleBinaryOp(HBinaryOperation* instructio } void LocationsBuilderMIPS64::HandleShift(HBinaryOperation* instr) { - DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr()); + DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr() || instr->IsRor()); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); Primitive::Type type = instr->GetResultType(); @@ -1265,7 +1265,7 @@ void LocationsBuilderMIPS64::HandleShift(HBinaryOperation* instr) { } void InstructionCodeGeneratorMIPS64::HandleShift(HBinaryOperation* instr) { - DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr()); + DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr() || instr->IsRor()); LocationSummary* locations = instr->GetLocations(); Primitive::Type type = instr->GetType(); @@ -1290,13 +1290,19 @@ void InstructionCodeGeneratorMIPS64::HandleShift(HBinaryOperation* instr) { ? static_cast<uint32_t>(rhs_imm & kMaxIntShiftValue) : static_cast<uint32_t>(rhs_imm & kMaxLongShiftValue); - if (type == Primitive::kPrimInt) { + if (shift_value == 0) { + if (dst != lhs) { + __ Move(dst, lhs); + } + } else if (type == Primitive::kPrimInt) { if (instr->IsShl()) { __ Sll(dst, lhs, shift_value); } else if (instr->IsShr()) { __ Sra(dst, lhs, shift_value); - } else { + } else if (instr->IsUShr()) { __ Srl(dst, lhs, shift_value); + } else { + __ Rotr(dst, lhs, shift_value); } } else { if (shift_value < 32) { @@ -1304,8 +1310,10 @@ void InstructionCodeGeneratorMIPS64::HandleShift(HBinaryOperation* instr) { __ Dsll(dst, lhs, shift_value); } else if (instr->IsShr()) { __ Dsra(dst, lhs, shift_value); - } else { + } else if (instr->IsUShr()) { __ Dsrl(dst, lhs, shift_value); + } else { + __ Drotr(dst, lhs, shift_value); } } else { shift_value -= 32; @@ -1313,8 +1321,10 @@ void InstructionCodeGeneratorMIPS64::HandleShift(HBinaryOperation* instr) { __ Dsll32(dst, lhs, shift_value); } else if (instr->IsShr()) { __ Dsra32(dst, lhs, shift_value); - } else { + } else if (instr->IsUShr()) { __ Dsrl32(dst, lhs, shift_value); + } else { + __ Drotr32(dst, lhs, shift_value); } } } @@ -1324,16 +1334,20 @@ void InstructionCodeGeneratorMIPS64::HandleShift(HBinaryOperation* instr) { __ Sllv(dst, lhs, rhs_reg); } else if (instr->IsShr()) { __ Srav(dst, lhs, rhs_reg); - } else { + } else if (instr->IsUShr()) { __ Srlv(dst, lhs, rhs_reg); + } else { + __ Rotrv(dst, lhs, rhs_reg); } } else { if (instr->IsShl()) { __ Dsllv(dst, lhs, rhs_reg); } else if (instr->IsShr()) { __ Dsrav(dst, lhs, rhs_reg); - } else { + } else if (instr->IsUShr()) { __ Dsrlv(dst, lhs, rhs_reg); + } else { + __ Drotrv(dst, lhs, rhs_reg); } } } @@ -1955,8 +1969,7 @@ void InstructionCodeGeneratorMIPS64::DivRemByPowerOfTwo(HBinaryOperation* instru GpuRegister out = locations->Out().AsRegister<GpuRegister>(); GpuRegister dividend = locations->InAt(0).AsRegister<GpuRegister>(); int64_t imm = Int64FromConstant(second.GetConstant()); - 
uint64_t abs_imm = static_cast<uint64_t>(std::abs(imm)); - DCHECK(IsPowerOfTwo(abs_imm)); + uint64_t abs_imm = static_cast<uint64_t>(AbsOrMin(imm)); int ctz_imm = CTZ(abs_imm); if (instruction->IsDiv()) { @@ -2138,7 +2151,7 @@ void InstructionCodeGeneratorMIPS64::GenerateDivRemIntegral(HBinaryOperation* in // Do not generate anything. DivZeroCheck would prevent any code to be executed. } else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (IsPowerOfTwo(std::abs(imm))) { + } else if (IsPowerOfTwo(AbsOrMin(imm))) { DivRemByPowerOfTwo(instruction); } else { DCHECK(imm <= -2 || imm >= 2); @@ -2736,9 +2749,8 @@ void LocationsBuilderMIPS64::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorMIPS64::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) - DeoptimizationSlowPathMIPS64(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCodeMIPS64* slow_path = + deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathMIPS64>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -2750,6 +2762,10 @@ void LocationsBuilderMIPS64::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorMIPS64::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. + __ Nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } @@ -3722,14 +3738,12 @@ void InstructionCodeGeneratorMIPS64::VisitReturnVoid(HReturnVoid* ret ATTRIBUTE_ codegen_->GenerateFrameExit(); } -void LocationsBuilderMIPS64::VisitRor(HRor* ror ATTRIBUTE_UNUSED) { - LOG(FATAL) << "Unreachable"; - UNREACHABLE(); +void LocationsBuilderMIPS64::VisitRor(HRor* ror) { + HandleShift(ror); } -void InstructionCodeGeneratorMIPS64::VisitRor(HRor* ror ATTRIBUTE_UNUSED) { - LOG(FATAL) << "Unreachable"; - UNREACHABLE(); +void InstructionCodeGeneratorMIPS64::VisitRor(HRor* ror) { + HandleShift(ror); } void LocationsBuilderMIPS64::VisitShl(HShl* shl) { @@ -3918,36 +3932,18 @@ void LocationsBuilderMIPS64::VisitTypeConversion(HTypeConversion* conversion) { LOG(FATAL) << "Unexpected type conversion from " << input_type << " to " << result_type; } - LocationSummary::CallKind call_kind = LocationSummary::kNoCall; - if ((Primitive::IsFloatingPointType(result_type) && input_type == Primitive::kPrimLong) || - (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type))) { - call_kind = LocationSummary::kCall; - } - - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion, call_kind); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(conversion); - if (call_kind == LocationSummary::kNoCall) { - if (Primitive::IsFloatingPointType(input_type)) { - locations->SetInAt(0, Location::RequiresFpuRegister()); - } else { - locations->SetInAt(0, Location::RequiresRegister()); - } - - if (Primitive::IsFloatingPointType(result_type)) { - locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); - } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); - } + if (Primitive::IsFloatingPointType(input_type)) { + locations->SetInAt(0, Location::RequiresFpuRegister()); } else { - InvokeRuntimeCallingConvention calling_convention; - - if (Primitive::IsFloatingPointType(input_type)) { - locations->SetInAt(0, 
Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0))); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RequiresRegister()); + } - locations->SetOut(calling_convention.GetReturnLocation(result_type)); + if (Primitive::IsFloatingPointType(result_type)) { + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + } else { + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } } @@ -3992,55 +3988,107 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver << " to " << result_type; } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsIntegralType(input_type)) { - if (input_type != Primitive::kPrimLong) { - FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); - GpuRegister src = locations->InAt(0).AsRegister<GpuRegister>(); - __ Mtc1(src, FTMP); + FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); + GpuRegister src = locations->InAt(0).AsRegister<GpuRegister>(); + if (input_type == Primitive::kPrimLong) { + __ Dmtc1(src, FTMP); if (result_type == Primitive::kPrimFloat) { - __ Cvtsw(dst, FTMP); + __ Cvtsl(dst, FTMP); } else { - __ Cvtdw(dst, FTMP); + __ Cvtdl(dst, FTMP); } } else { - int32_t entry_offset = (result_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pL2f) - : QUICK_ENTRY_POINT(pL2d); - codegen_->InvokeRuntime(entry_offset, - conversion, - conversion->GetDexPc(), - nullptr); + __ Mtc1(src, FTMP); if (result_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickL2f, float, int64_t>(); + __ Cvtsw(dst, FTMP); } else { - CheckEntrypointTypes<kQuickL2d, double, int64_t>(); + __ Cvtdw(dst, FTMP); } } } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) { CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); - int32_t entry_offset; - if (result_type != Primitive::kPrimLong) { - entry_offset = (input_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pF2iz) - : QUICK_ENTRY_POINT(pD2iz); + GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); + FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>(); + Mips64Label truncate; + Mips64Label done; + + // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive + // value when the input is either a NaN or is outside of the range of the output type + // after the truncation. IOW, the three special cases (NaN, too small, too big) produce + // the same result. + // + // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum + // value of the output type if the input is outside of the range after the truncation or + // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct + // results. This matches the desired float/double-to-int/long conversion exactly. + // + // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. + // + // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate + // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, + // even though it must be NAN2008=1 on R6. + // + // The code takes care of the different behaviors by first comparing the input to the + // minimum output value (-2**63 for truncating to long, -2**31 for truncating to int).
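// [Editor's aside: an illustrative sketch, not part of the original change;
// the function name JavaD2L is hypothetical.] For reference, the Java-level
// semantics that the truncation block below must reproduce, in portable C++:
#include <cmath>
#include <cstdint>
#include <limits>
int64_t JavaD2L(double in) {
  if (std::isnan(in)) {
    return 0;  // NaN converts to zero.
  }
  if (in <= static_cast<double>(std::numeric_limits<int64_t>::min())) {
    return std::numeric_limits<int64_t>::min();  // Too small: clamp to min.
  }
  if (in >= static_cast<double>(std::numeric_limits<int64_t>::max())) {
    return std::numeric_limits<int64_t>::max();  // Too big: clamp to max.
  }
  return static_cast<int64_t>(in);  // In range: truncate toward zero.
}
// A NAN2008=1 truncate implements this directly; the comparisons emitted below
// recover the NaN and too-small cases when running under NAN2008=0.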
+ // If the input is greater than or equal to the minimum, it proceeds to the truncate + // instruction, which will handle such an input the same way irrespective of NAN2008. + // Otherwise the input is compared to itself to determine whether it is a NaN or not + // in order to return either zero or the minimum value. + // + // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the + // truncate instruction for MIPS64R6. + if (input_type == Primitive::kPrimFloat) { + uint32_t min_val = (result_type == Primitive::kPrimLong) + ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()) + : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); + __ CmpLeS(FTMP, FTMP, src); } else { - entry_offset = (input_type == Primitive::kPrimFloat) ? QUICK_ENTRY_POINT(pF2l) - : QUICK_ENTRY_POINT(pD2l); + uint64_t min_val = (result_type == Primitive::kPrimLong) + ? bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()) + : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); + __ LoadConst64(TMP, min_val); + __ Dmtc1(TMP, FTMP); + __ CmpLeD(FTMP, FTMP, src); } - codegen_->InvokeRuntime(entry_offset, - conversion, - conversion->GetDexPc(), - nullptr); - if (result_type != Primitive::kPrimLong) { + + __ Bc1nez(FTMP, &truncate); + + if (input_type == Primitive::kPrimFloat) { + __ CmpEqS(FTMP, src, src); + } else { + __ CmpEqD(FTMP, src, src); + } + if (result_type == Primitive::kPrimLong) { + __ LoadConst64(dst, std::numeric_limits<int64_t>::min()); + } else { + __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); + } + __ Mfc1(TMP, FTMP); + __ And(dst, dst, TMP); + + __ Bc(&done); + + __ Bind(&truncate); + + if (result_type == Primitive::kPrimLong) { if (input_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickF2iz, int32_t, float>(); + __ TruncLS(FTMP, src); } else { - CheckEntrypointTypes<kQuickD2iz, int32_t, double>(); + __ TruncLD(FTMP, src); } + __ Dmfc1(dst, FTMP); } else { if (input_type == Primitive::kPrimFloat) { - CheckEntrypointTypes<kQuickF2l, int64_t, float>(); + __ TruncWS(FTMP, src); } else { - CheckEntrypointTypes<kQuickD2l, int64_t, double>(); + __ TruncWD(FTMP, src); } + __ Mfc1(dst, FTMP); } + + __ Bind(&done); } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 60ff96dc43..7182e8e987 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -201,7 +201,7 @@ class LocationsBuilderMIPS64 : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderMIPS64); }; -class InstructionCodeGeneratorMIPS64 : public HGraphVisitor { +class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { public: InstructionCodeGeneratorMIPS64(HGraph* graph, CodeGeneratorMIPS64* codegen); diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 86327fb741..6ab3aaff4b 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -365,11 +365,10 @@ class TypeCheckSlowPathX86 : public SlowPathCode { class DeoptimizationSlowPathX86 : public SlowPathCode { public: - explicit DeoptimizationSlowPathX86(HInstruction* instruction) + explicit DeoptimizationSlowPathX86(HDeoptimize* instruction) :
instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { - DCHECK(instruction_->IsDeoptimize()); CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); @@ -383,7 +382,7 @@ class DeoptimizationSlowPathX86 : public SlowPathCode { const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathX86); }; @@ -892,7 +891,7 @@ void CodeGeneratorX86::UpdateBlockedPairRegisters() const { } InstructionCodeGeneratorX86::InstructionCodeGeneratorX86(HGraph* graph, CodeGeneratorX86* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -1611,9 +1610,7 @@ void LocationsBuilderX86::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorX86::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) - DeoptimizationSlowPathX86(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCode* slow_path = deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathX86>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -1625,6 +1622,10 @@ void LocationsBuilderX86::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorX86::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. + __ nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } @@ -3223,11 +3224,12 @@ void InstructionCodeGeneratorX86::DivByPowerOfTwo(HDiv* instruction) { Register out_register = locations->Out().AsRegister<Register>(); Register input_register = locations->InAt(0).AsRegister<Register>(); int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + DCHECK(IsPowerOfTwo(AbsOrMin(imm))); + uint32_t abs_imm = static_cast<uint32_t>(AbsOrMin(imm)); - DCHECK(IsPowerOfTwo(std::abs(imm))); Register num = locations->GetTemp(0).AsRegister<Register>(); - __ leal(num, Address(input_register, std::abs(imm) - 1)); + __ leal(num, Address(input_register, abs_imm - 1)); __ testl(input_register, input_register); __ cmovl(kGreaterEqual, num, input_register); int shift = CTZ(imm); @@ -3340,7 +3342,7 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr // Do not generate anything for 0. DivZeroCheck would forbid any generated code. 
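// [Editor's aside: an illustrative sketch, not part of the original change;
// AbsOrMin32 spells out the semantics assumed for the AbsOrMin helper used
// above.] std::abs(INT_MIN) is undefined behavior, yet INT_MIN is a valid
// power-of-two divisor (|INT_MIN| == 2^31 once viewed as unsigned), which is
// why these checks switched from std::abs to AbsOrMin:
#include <cstdint>
#include <cstdlib>
#include <limits>
int32_t AbsOrMin32(int32_t v) {
  return v == std::numeric_limits<int32_t>::min()
      ? v               // Keep INT_MIN as-is instead of invoking UB.
      : std::abs(v);
}
// static_cast<uint32_t>(AbsOrMin32(INT32_MIN)) == 0x80000000u, so it still
// passes IsPowerOfTwo() and yields CTZ() == 31 for the shift sequence.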
} else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (is_div && IsPowerOfTwo(std::abs(imm))) { + } else if (is_div && IsPowerOfTwo(AbsOrMin(imm))) { DivByPowerOfTwo(instruction->AsDiv()); } else { DCHECK(imm <= -2 || imm >= 2); diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index df7347658b..c65c423eae 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -178,7 +178,7 @@ class LocationsBuilderX86 : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86); }; -class InstructionCodeGeneratorX86 : public HGraphVisitor { +class InstructionCodeGeneratorX86 : public InstructionCodeGenerator { public: InstructionCodeGeneratorX86(HGraph* graph, CodeGeneratorX86* codegen); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 76a4ce2e93..294b40e3d4 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -387,18 +387,16 @@ class TypeCheckSlowPathX86_64 : public SlowPathCode { class DeoptimizationSlowPathX86_64 : public SlowPathCode { public: - explicit DeoptimizationSlowPathX86_64(HInstruction* instruction) + explicit DeoptimizationSlowPathX86_64(HDeoptimize* instruction) : instruction_(instruction) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, instruction_->GetLocations()); - DCHECK(instruction_->IsDeoptimize()); - HDeoptimize* deoptimize = instruction_->AsDeoptimize(); x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pDeoptimize), - deoptimize, - deoptimize->GetDexPc(), + instruction_, + instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickDeoptimize, void, void>(); } @@ -406,7 +404,7 @@ class DeoptimizationSlowPathX86_64 : public SlowPathCode { const char* GetDescription() const OVERRIDE { return "DeoptimizationSlowPathX86_64"; } private: - HInstruction* const instruction_; + HDeoptimize* const instruction_; DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathX86_64); }; @@ -1000,7 +998,7 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph, CodeGeneratorX86_64* codegen) - : HGraphVisitor(graph), + : InstructionCodeGenerator(graph, codegen), assembler_(codegen->GetAssembler()), codegen_(codegen) {} @@ -1594,9 +1592,7 @@ void LocationsBuilderX86_64::VisitDeoptimize(HDeoptimize* deoptimize) { } void InstructionCodeGeneratorX86_64::VisitDeoptimize(HDeoptimize* deoptimize) { - SlowPathCode* slow_path = new (GetGraph()->GetArena()) - DeoptimizationSlowPathX86_64(deoptimize); - codegen_->AddSlowPath(slow_path); + SlowPathCode* slow_path = deopt_slow_paths_.NewSlowPath<DeoptimizationSlowPathX86_64>(deoptimize); GenerateTestAndBranch(deoptimize, /* condition_input_index */ 0, slow_path->GetEntryLabel(), @@ -1608,6 +1604,10 @@ void LocationsBuilderX86_64::VisitNativeDebugInfo(HNativeDebugInfo* info) { } void InstructionCodeGeneratorX86_64::VisitNativeDebugInfo(HNativeDebugInfo* info) { + if (codegen_->HasStackMapAtCurrentPc()) { + // Ensure that we do not collide with the stack map of the previous instruction. 
+ __ nop(); + } codegen_->RecordPcInfo(info, info->GetDexPc()); } @@ -3350,13 +3350,13 @@ void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HDiv* instruction) { CpuRegister numerator = locations->InAt(0).AsRegister<CpuRegister>(); int64_t imm = Int64FromConstant(second.GetConstant()); - - DCHECK(IsPowerOfTwo(std::abs(imm))); + DCHECK(IsPowerOfTwo(AbsOrMin(imm))); + uint64_t abs_imm = AbsOrMin(imm); CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>(); if (instruction->GetResultType() == Primitive::kPrimInt) { - __ leal(tmp, Address(numerator, std::abs(imm) - 1)); + __ leal(tmp, Address(numerator, abs_imm - 1)); __ testl(numerator, numerator); __ cmov(kGreaterEqual, tmp, numerator); int shift = CTZ(imm); @@ -3371,7 +3371,7 @@ void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HDiv* instruction) { DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong); CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>(); - codegen_->Load64BitValue(rdx, std::abs(imm) - 1); + codegen_->Load64BitValue(rdx, abs_imm - 1); __ addq(rdx, numerator); __ testq(numerator, numerator); __ cmov(kGreaterEqual, rdx, numerator); @@ -3529,7 +3529,7 @@ void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* in // Do not generate anything. DivZeroCheck would prevent any code to be executed. } else if (imm == 1 || imm == -1) { DivRemOneOrMinusOne(instruction); - } else if (instruction->IsDiv() && IsPowerOfTwo(std::abs(imm))) { + } else if (instruction->IsDiv() && IsPowerOfTwo(AbsOrMin(imm))) { DivByPowerOfTwo(instruction->AsDiv()); } else { DCHECK(imm <= -2 || imm >= 2); diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index c5e8a04da6..505c9dcdad 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -183,7 +183,7 @@ class LocationsBuilderX86_64 : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86_64); }; -class InstructionCodeGeneratorX86_64 : public HGraphVisitor { +class InstructionCodeGeneratorX86_64 : public InstructionCodeGenerator { public: InstructionCodeGeneratorX86_64(HGraph* graph, CodeGeneratorX86_64* codegen); diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index c504ded54c..b90afb1d73 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -211,19 +211,6 @@ bool InstructionSimplifierVisitor::ReplaceRotateWithRor(HBinaryOperation* op, // Try to replace a binary operation flanked by one UShr and one Shl with a bitfield rotation. bool InstructionSimplifierVisitor::TryReplaceWithRotate(HBinaryOperation* op) { - // This simplification is currently supported on x86, x86_64, ARM and ARM64. - // TODO: Implement it for MIPS/64. 
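// [Editor's aside: an illustrative sketch, not part of the original change;
// RotateRight32 is a hypothetical name.] The per-ISA whitelist being deleted
// around this point gated TryReplaceWithRotate, which rewrites an Or/Add/Xor
// over a matching UShr/Shl pair into a single HRor. The identity it relies on:
#include <cstdint>
uint32_t RotateRight32(uint32_t x, uint32_t d) {
  d &= 31;  // Rotation distances are taken modulo the register width.
  return (x >> d) | (x << ((32 - d) & 31));  // == (x ushr d) | (x shl (32 - d))
}
// With the MIPS64 backend now emitting Rotr/Drotr (see HandleShift above), the
// simplification is valid on every supported ISA and the gate can go.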
- const InstructionSet instruction_set = GetGraph()->GetInstructionSet(); - switch (instruction_set) { - case kArm: - case kArm64: - case kThumb2: - case kX86: - case kX86_64: - break; - default: - return false; - } DCHECK(op->IsAdd() || op->IsXor() || op->IsOr()); HInstruction* left = op->GetLeft(); HInstruction* right = op->GetRight(); @@ -1261,19 +1248,6 @@ void InstructionSimplifierVisitor::SimplifyStringEquals(HInvoke* instruction) { void InstructionSimplifierVisitor::SimplifyRotate(HInvoke* invoke, bool is_left) { DCHECK(invoke->IsInvokeStaticOrDirect()); DCHECK_EQ(invoke->GetOriginalInvokeType(), InvokeType::kStatic); - // This simplification is currently supported on x86, x86_64, ARM and ARM64. - // TODO: Implement it for MIPS/64. - const InstructionSet instruction_set = GetGraph()->GetInstructionSet(); - switch (instruction_set) { - case kArm: - case kArm64: - case kThumb2: - case kX86: - case kX86_64: - break; - default: - return; - } HInstruction* value = invoke->InputAt(0); HInstruction* distance = invoke->InputAt(1); // Replace the invoke with an HRor. diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 4683aee603..b1fbf28204 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -502,9 +502,6 @@ static void GenUnsafeGet(HInvoke* invoke, bool is_volatile, CodeGeneratorARM* codegen) { LocationSummary* locations = invoke->GetLocations(); - DCHECK((type == Primitive::kPrimInt) || - (type == Primitive::kPrimLong) || - (type == Primitive::kPrimNot)); ArmAssembler* assembler = codegen->GetAssembler(); Location base_loc = locations->InAt(1); Register base = base_loc.AsRegister<Register>(); // Object pointer. @@ -512,30 +509,67 @@ static void GenUnsafeGet(HInvoke* invoke, Register offset = offset_loc.AsRegisterPairLow<Register>(); // Long offset, lo part only. 
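// [Editor's aside: an illustrative sketch, not part of the original change;
// VolatileGet32 is a hypothetical name.] Every volatile path in the rewritten
// GenUnsafeGet below keeps the same shape: a plain load followed by a barrier.
// An approximation of that ordering contract in C++11 atomics:
#include <atomic>
#include <cstdint>
int32_t VolatileGet32(const std::atomic<int32_t>* addr) {
  int32_t value = addr->load(std::memory_order_relaxed);  // __ ldr(trg, ...)
  // DMB ISH is a full barrier; an acquire fence is the minimum that the Java
  // volatile-read semantics require at this point.
  std::atomic_thread_fence(std::memory_order_acquire);    // __ dmb(ISH)
  return value;
}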
Location trg_loc = locations->Out(); - if (type == Primitive::kPrimLong) { - Register trg_lo = trg_loc.AsRegisterPairLow<Register>(); - __ add(IP, base, ShifterOperand(offset)); - if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) { - Register trg_hi = trg_loc.AsRegisterPairHigh<Register>(); - __ ldrexd(trg_lo, trg_hi, IP); - } else { - __ ldrd(trg_lo, Address(IP)); + switch (type) { + case Primitive::kPrimInt: { + Register trg = trg_loc.AsRegister<Register>(); + __ ldr(trg, Address(base, offset)); + if (is_volatile) { + __ dmb(ISH); + } + break; } - } else { - Register trg = trg_loc.AsRegister<Register>(); - __ ldr(trg, Address(base, offset)); - } - if (is_volatile) { - __ dmb(ISH); - } + case Primitive::kPrimNot: { + Register trg = trg_loc.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + codegen->GenerateArrayLoadWithBakerReadBarrier( + invoke, trg_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false); + if (is_volatile) { + __ dmb(ISH); + } + } else { + __ ldr(trg, Address(base, offset)); + if (is_volatile) { + __ dmb(ISH); + } + codegen->GenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc); + } + } else { + __ ldr(trg, Address(base, offset)); + if (is_volatile) { + __ dmb(ISH); + } + __ MaybeUnpoisonHeapReference(trg); + } + break; + } - if (type == Primitive::kPrimNot) { - codegen->MaybeGenerateReadBarrier(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc); + case Primitive::kPrimLong: { + Register trg_lo = trg_loc.AsRegisterPairLow<Register>(); + __ add(IP, base, ShifterOperand(offset)); + if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) { + Register trg_hi = trg_loc.AsRegisterPairHigh<Register>(); + __ ldrexd(trg_lo, trg_hi, IP); + } else { + __ ldrd(trg_lo, Address(IP)); + } + if (is_volatile) { + __ dmb(ISH); + } + break; + } + + default: + LOG(FATAL) << "Unexpected type " << type; + UNREACHABLE(); } } -static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + Primitive::Type type) { bool can_call = kEmitCompilerReadBarrier && (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); @@ -548,25 +582,30 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
+ locations->AddTemp(Location::RequiresRegister()); + } } void IntrinsicLocationsBuilderARM::VisitUnsafeGet(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicLocationsBuilderARM::VisitUnsafeGetVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicLocationsBuilderARM::VisitUnsafeGetLong(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicLocationsBuilderARM::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicLocationsBuilderARM::VisitUnsafeGetObject(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicLocationsBuilderARM::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorARM::VisitUnsafeGet(HInvoke* invoke) { @@ -808,6 +847,9 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat } // Prevent reordering with prior memory operations. + // Emit a DMB ISH instruction instead of a DMB ISHST one, as the + // latter allows a preceding load to be delayed past the STXR + // instruction below. __ dmb(ISH); __ add(tmp_ptr, base, ShifterOperand(offset)); diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index f723940444..81cab86c83 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -1035,7 +1035,11 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat __ Stlxr(tmp_32, value, MemOperand(tmp_ptr)); __ Cbnz(tmp_32, &loop_head); } else { - __ Dmb(InnerShareable, BarrierWrites); + // Emit a `Dmb(InnerShareable, BarrierAll)` (DMB ISH) instruction + // instead of a `Dmb(InnerShareable, BarrierWrites)` (DMB ISHST) + // one, as the latter allows a preceding load to be delayed past + // the STXR instruction below.
+ __ Dmb(InnerShareable, BarrierAll); __ Bind(&loop_head); // TODO: When `type == Primitive::kPrimNot`, add a read barrier for // the reference stored in the object before attempting the CAS, diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 06fab616ad..bc126a2716 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -43,14 +43,18 @@ ArenaAllocator* IntrinsicCodeGeneratorMIPS::GetAllocator() { return codegen_->GetGraph()->GetArena(); } -inline bool IntrinsicCodeGeneratorMIPS::IsR2OrNewer() { +inline bool IntrinsicCodeGeneratorMIPS::IsR2OrNewer() const { return codegen_->GetInstructionSetFeatures().IsMipsIsaRevGreaterThanEqual2(); } -inline bool IntrinsicCodeGeneratorMIPS::IsR6() { +inline bool IntrinsicCodeGeneratorMIPS::IsR6() const { return codegen_->GetInstructionSetFeatures().IsR6(); } +inline bool IntrinsicCodeGeneratorMIPS::Is32BitFPU() const { + return codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint(); +} + #define __ codegen->GetAssembler()-> static void MoveFromReturnRegister(Location trg, @@ -162,7 +166,7 @@ static void MoveFPToInt(LocationSummary* locations, bool is64bit, MipsAssembler* Register out_hi = locations->Out().AsRegisterPairHigh<Register>(); __ Mfc1(out_lo, in); - __ Mfhc1(out_hi, in); + __ MoveFromFpuHigh(out_hi, in); } else { Register out = locations->Out().AsRegister<Register>(); @@ -204,7 +208,7 @@ static void MoveIntToFP(LocationSummary* locations, bool is64bit, MipsAssembler* Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); __ Mtc1(in_lo, out); - __ Mthc1(in_hi, out); + __ MoveToFpuHigh(in_hi, out); } else { Register in = locations->InAt(0).AsRegister<Register>(); diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h index f86b0efe4a..575a7d0a23 100644 --- a/compiler/optimizing/intrinsics_mips.h +++ b/compiler/optimizing/intrinsics_mips.h @@ -67,8 +67,9 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) #undef INTRINSICS_LIST #undef OPTIMIZING_INTRINSICS - bool IsR2OrNewer(void); - bool IsR6(void); + bool IsR2OrNewer() const; + bool IsR6() const; + bool Is32BitFPU() const; private: MipsAssembler* GetAssembler(); diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index f5a7048b01..b80c6bde82 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2183,10 +2183,7 @@ void HInvoke::SetIntrinsic(Intrinsics intrinsic, IntrinsicExceptions exceptions) { intrinsic_ = intrinsic; IntrinsicOptimizations opt(this); - if (needs_env_or_cache == kNoEnvironmentOrCache) { - opt.SetDoesNotNeedDexCache(); - opt.SetDoesNotNeedEnvironment(); - } + // Adjust method's side effects from intrinsic table. switch (side_effects) { case kNoSideEffects: SetSideEffects(SideEffects::None()); break; @@ -2194,6 +2191,14 @@ void HInvoke::SetIntrinsic(Intrinsics intrinsic, case kWriteSideEffects: SetSideEffects(SideEffects::AllWrites()); break; case kAllSideEffects: SetSideEffects(SideEffects::AllExceptGCDependency()); break; } + + if (needs_env_or_cache == kNoEnvironmentOrCache) { + opt.SetDoesNotNeedDexCache(); + opt.SetDoesNotNeedEnvironment(); + } else { + // If we need an environment, that means there will be a call, which can trigger GC. + SetSideEffects(GetSideEffects().Union(SideEffects::CanTriggerGC())); + } // Adjust method's exception status from intrinsic table. 
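// [Editor's aside: an illustrative sketch, not part of the original change;
// kCanTriggerGC and WidenIntrinsicEffects are stand-ins, not ART names.] The
// reordering in SetIntrinsic above matters: the table's side effects are
// installed first, then widened when an environment is kept, because keeping
// an environment implies a possible runtime call, and a runtime call can
// allocate, i.e. trigger GC:
#include <cstdint>
enum : uint64_t { kCanTriggerGC = 1u << 0 };
uint64_t WidenIntrinsicEffects(bool needs_env_or_cache, uint64_t table_effects) {
  // Intrinsics that keep their environment must be modeled as GC points.
  return needs_env_or_cache ? (table_effects | kCanTriggerGC) : table_effects;
}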
switch (exceptions) { case kNoThrow: SetCanThrow(false); break; @@ -2325,4 +2330,19 @@ HInstruction* HGraph::InsertOppositeCondition(HInstruction* cond, HInstruction* } } +std::ostream& operator<<(std::ostream& os, const MoveOperands& rhs) { + os << "[" + << " source=" << rhs.GetSource() + << " destination=" << rhs.GetDestination() + << " type=" << rhs.GetType() + << " instruction="; + if (rhs.GetInstruction() != nullptr) { + os << rhs.GetInstruction()->DebugName() << ' ' << rhs.GetInstruction()->GetId(); + } else { + os << "null"; + } + os << " ]"; + return os; +} + } // namespace art diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 59c07690b1..23132308f0 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1881,6 +1881,10 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { return false; } + virtual bool IsActualObject() const { + return GetType() == Primitive::kPrimNot; + } + void SetReferenceTypeInfo(ReferenceTypeInfo rti); ReferenceTypeInfo GetReferenceTypeInfo() const { @@ -2500,8 +2504,10 @@ class HTryBoundary : public HTemplateInstruction<0> { // Deoptimize to interpreter, upon checking a condition. class HDeoptimize : public HTemplateInstruction<1> { public: + // We set CanTriggerGC to prevent any intermediate address to be live + // at the point of the `HDeoptimize`. HDeoptimize(HInstruction* cond, uint32_t dex_pc) - : HTemplateInstruction(SideEffects::None(), dex_pc) { + : HTemplateInstruction(SideEffects::CanTriggerGC(), dex_pc) { SetRawInputAt(0, cond); } @@ -4017,8 +4023,10 @@ class HRem : public HBinaryOperation { class HDivZeroCheck : public HExpression<1> { public: + // `HDivZeroCheck` can trigger GC, as it may call the `ArithmeticException` + // constructor. HDivZeroCheck(HInstruction* value, uint32_t dex_pc) - : HExpression(value->GetType(), SideEffects::None(), dex_pc) { + : HExpression(value->GetType(), SideEffects::CanTriggerGC(), dex_pc) { SetRawInputAt(0, value); } @@ -4539,8 +4547,10 @@ class HPhi : public HInstruction { class HNullCheck : public HExpression<1> { public: + // `HNullCheck` can trigger GC, as it may call the `NullPointerException` + // constructor. HNullCheck(HInstruction* value, uint32_t dex_pc) - : HExpression(value->GetType(), SideEffects::None(), dex_pc) { + : HExpression(value->GetType(), SideEffects::CanTriggerGC(), dex_pc) { SetRawInputAt(0, value); } @@ -4861,8 +4871,10 @@ class HArrayLength : public HExpression<1> { class HBoundsCheck : public HExpression<2> { public: + // `HBoundsCheck` can trigger GC, as it may call the `IndexOutOfBoundsException` + // constructor. HBoundsCheck(HInstruction* index, HInstruction* length, uint32_t dex_pc) - : HExpression(index->GetType(), SideEffects::None(), dex_pc) { + : HExpression(index->GetType(), SideEffects::CanTriggerGC(), dex_pc) { DCHECK(index->GetType() == Primitive::kPrimInt); SetRawInputAt(0, index); SetRawInputAt(1, length); @@ -5626,8 +5638,8 @@ class MoveOperands : public ArenaObject<kArenaAllocMoveOperands> { } bool IsPending() const { - DCHECK(!source_.IsInvalid() || destination_.IsInvalid()); - return destination_.IsInvalid() && !source_.IsInvalid(); + DCHECK(source_.IsValid() || destination_.IsInvalid()); + return destination_.IsInvalid() && source_.IsValid(); } // True if this blocks a move from the given location. 
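// [Editor's aside: an illustrative sketch, not part of the original change;
// IsPendingSketch models the predicate on plain bools.] The IsPending rewrite
// above is behavior-preserving: since Location::IsValid() == !IsInvalid(), the
// old form (!source invalid || destination invalid) and the new form
// (source valid || destination invalid) assert the same invariant, namely that
// an invalid source implies an invalid destination (a fully eliminated move):
#include <cassert>
bool IsPendingSketch(bool source_valid, bool destination_valid) {
  assert(source_valid || !destination_valid);  // The DCHECKed invariant.
  return !destination_valid && source_valid;   // Pending: dst claimed, src live.
}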
@@ -5671,6 +5683,8 @@ class MoveOperands : public ArenaObject<kArenaAllocMoveOperands> { HInstruction* instruction_; }; +std::ostream& operator<<(std::ostream& os, const MoveOperands& rhs); + static constexpr size_t kDefaultNumberOfMoves = 4; class HParallelMove : public HTemplateInstruction<0> { diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h index 18405f2623..445cdab191 100644 --- a/compiler/optimizing/nodes_arm64.h +++ b/compiler/optimizing/nodes_arm64.h @@ -107,6 +107,7 @@ class HArm64IntermediateAddress : public HExpression<2> { bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { return true; } + bool IsActualObject() const OVERRIDE { return false; } HInstruction* GetBaseAddress() const { return InputAt(0); } HInstruction* GetOffset() const { return InputAt(1); } diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index cafc6c5440..bb840eabdd 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -17,6 +17,7 @@ #include "optimizing_compiler.h" #include <fstream> +#include <memory> #include <stdint.h> #ifdef ART_ENABLE_CODEGEN_arm64 @@ -52,6 +53,8 @@ #include "driver/compiler_driver-inl.h" #include "driver/compiler_options.h" #include "driver/dex_compilation_unit.h" +#include "dwarf/method_debug_info.h" +#include "elf_writer_debug.h" #include "elf_writer_quick.h" #include "graph_checker.h" #include "graph_visualizer.h" @@ -60,6 +63,7 @@ #include "inliner.h" #include "instruction_simplifier.h" #include "intrinsics.h" +#include "jit/debugger_interface.h" #include "jit/jit_code_cache.h" #include "licm.h" #include "jni/quick/jni_compiler.h" @@ -68,6 +72,7 @@ #include "prepare_for_register_allocation.h" #include "reference_type_propagation.h" #include "register_allocator.h" +#include "oat_quick_method_header.h" #include "sharpening.h" #include "side_effects_analysis.h" #include "ssa_builder.h" @@ -965,6 +970,39 @@ bool OptimizingCompiler::JitCompile(Thread* self, return false; } + if (GetCompilerDriver()->GetCompilerOptions().GetGenerateDebugInfo()) { + const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code); + const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode()); + CompiledMethod compiled_method( + GetCompilerDriver(), + codegen->GetInstructionSet(), + ArrayRef<const uint8_t>(code_allocator.GetMemory()), + codegen->HasEmptyFrame() ? 0 : codegen->GetFrameSize(), + codegen->GetCoreSpillMask(), + codegen->GetFpuSpillMask(), + ArrayRef<const SrcMapElem>(), + ArrayRef<const uint8_t>(), // mapping_table. + ArrayRef<const uint8_t>(stack_map_data, stack_map_size), + ArrayRef<const uint8_t>(), // native_gc_map. + ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), + ArrayRef<const LinkerPatch>()); + dwarf::MethodDebugInfo method_debug_info { + dex_file, + class_def_idx, + method_idx, + access_flags, + code_item, + false, // deduped. 
+ code_address, + code_address + code_allocator.GetSize(), + &compiled_method + }; + ArrayRef<const uint8_t> elf_file = dwarf::WriteDebugElfFileForMethod(method_debug_info); + CreateJITCodeEntryForAddress(code_address, + std::unique_ptr<const uint8_t[]>(elf_file.data()), + elf_file.size()); + } + return true; } diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc index 176c50ce21..9d136f3ae6 100644 --- a/compiler/optimizing/parallel_move_resolver.cc +++ b/compiler/optimizing/parallel_move_resolver.cc @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include <iostream> #include "parallel_move_resolver.h" @@ -172,7 +171,7 @@ MoveOperands* ParallelMoveResolverWithSwap::PerformMove(size_t index) { i = -1; } else if (required_swap != nullptr) { // A move is required to swap. We walk back the cycle to find the - // move by just returning from this `PerforrmMove`. + // move by just returning from this `PerformMove`. moves_[index]->ClearPending(destination); return required_swap; } @@ -201,7 +200,7 @@ MoveOperands* ParallelMoveResolverWithSwap::PerformMove(size_t index) { } else { for (MoveOperands* other_move : moves_) { if (other_move->Blocks(destination)) { - DCHECK(other_move->IsPending()); + DCHECK(other_move->IsPending()) << "move=" << *move << " other_move=" << *other_move; if (!move->Is64BitMove() && other_move->Is64BitMove()) { // We swap 64bits moves before swapping 32bits moves. Go back from the // cycle by returning the move that must be swapped. diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc index d1770b75ab..63ef600756 100644 --- a/compiler/optimizing/prepare_for_register_allocation.cc +++ b/compiler/optimizing/prepare_for_register_allocation.cc @@ -96,7 +96,7 @@ void PrepareForRegisterAllocation::VisitClinitCheck(HClinitCheck* check) { if (can_merge_with_load_class && !load_class->HasUses()) { load_class->GetBlock()->RemoveInstruction(load_class); } - } else if (can_merge_with_load_class) { + } else if (can_merge_with_load_class && !load_class->NeedsAccessCheck()) { // Pass the initialization duty to the `HLoadClass` instruction, // and remove the instruction from the graph. 
load_class->SetMustGenerateClinitCheck(true); diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index 5ab4547e22..2bae4bc5c8 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -1679,6 +1679,9 @@ void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { LocationSummary* locations = safepoint_position->GetLocations(); if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize); } @@ -1691,6 +1694,9 @@ void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { maximum_number_of_live_fp_registers_); } if (current->GetType() == Primitive::kPrimNot) { + DCHECK(interval->GetDefinedBy()->IsActualObject()) + << interval->GetDefinedBy()->DebugName() + << "@" << safepoint_position->GetInstruction()->DebugName(); locations->SetRegisterBit(source.reg()); } break; diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc index c60a4eacaa..4784de1380 100644 --- a/compiler/optimizing/stack_map_stream.cc +++ b/compiler/optimizing/stack_map_stream.cc @@ -270,7 +270,7 @@ void StackMapStream::FillIn(MemoryRegion region) { stack_map.SetStackMask(stack_map_encoding_, *entry.sp_mask); } - if (entry.num_dex_registers == 0) { + if (entry.num_dex_registers == 0 || (entry.live_dex_registers_mask->NumSetBits() == 0)) { // No dex map available. stack_map.SetDexRegisterMapOffset(stack_map_encoding_, StackMap::kNoDexRegisterMap); } else { diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc index 560502fde6..604787fd92 100644 --- a/compiler/optimizing/stack_map_test.cc +++ b/compiler/optimizing/stack_map_test.cc @@ -614,6 +614,10 @@ TEST(StackMapTest, TestNoDexRegisterMap) { stream.BeginStackMapEntry(0, 64, 0x3, &sp_mask, number_of_dex_registers, 0); stream.EndStackMapEntry(); + number_of_dex_registers = 1; + stream.BeginStackMapEntry(1, 67, 0x4, &sp_mask, number_of_dex_registers, 0); + stream.EndStackMapEntry(); + size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); @@ -622,7 +626,7 @@ TEST(StackMapTest, TestNoDexRegisterMap) { CodeInfo code_info(region); StackMapEncoding encoding = code_info.ExtractEncoding(); ASSERT_EQ(0u, encoding.NumberOfBytesForStackMask()); - ASSERT_EQ(1u, code_info.GetNumberOfStackMaps()); + ASSERT_EQ(2u, code_info.GetNumberOfStackMaps()); uint32_t number_of_location_catalog_entries = code_info.GetNumberOfLocationCatalogEntries(); ASSERT_EQ(0u, number_of_location_catalog_entries); @@ -638,6 +642,16 @@ TEST(StackMapTest, TestNoDexRegisterMap) { ASSERT_FALSE(stack_map.HasDexRegisterMap(encoding)); ASSERT_FALSE(stack_map.HasInlineInfo(encoding)); + + stack_map = code_info.GetStackMapAt(1, encoding); + ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForDexPc(1, encoding))); + ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(67, encoding))); + ASSERT_EQ(1u, stack_map.GetDexPc(encoding)); + ASSERT_EQ(67u, stack_map.GetNativePcOffset(encoding)); + ASSERT_EQ(0x4u, stack_map.GetRegisterMask(encoding)); + + ASSERT_FALSE(stack_map.HasDexRegisterMap(encoding)); + ASSERT_FALSE(stack_map.HasInlineInfo(encoding)); } TEST(StackMapTest, InlineTest) { |