Diffstat (limited to 'compiler/optimizing')
22 files changed, 915 insertions, 741 deletions
diff --git a/compiler/optimizing/bounds_check_elimination.h b/compiler/optimizing/bounds_check_elimination.h index 6dc53207ea..b9df686ffd 100644 --- a/compiler/optimizing/bounds_check_elimination.h +++ b/compiler/optimizing/bounds_check_elimination.h @@ -29,13 +29,13 @@ class BoundsCheckElimination : public HOptimization { BoundsCheckElimination(HGraph* graph, const SideEffectsAnalysis& side_effects, HInductionVarAnalysis* induction_analysis) - : HOptimization(graph, kBoundsCheckEliminiationPassName), + : HOptimization(graph, kBoundsCheckEliminationPassName), side_effects_(side_effects), induction_analysis_(induction_analysis) {} void Run() OVERRIDE; - static constexpr const char* kBoundsCheckEliminiationPassName = "BCE"; + static constexpr const char* kBoundsCheckEliminationPassName = "BCE"; private: const SideEffectsAnalysis& side_effects_; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index f265a0c7d3..272579219f 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -6449,33 +6449,6 @@ Literal* CodeGeneratorARM::DeduplicateMethodCodeLiteral(MethodReference target_m return DeduplicateMethodLiteral(target_method, &call_patches_); } -void LocationsBuilderARM::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); - locations->SetInAt(HMultiplyAccumulate::kInputAccumulatorIndex, - Location::RequiresRegister()); - locations->SetInAt(HMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresRegister()); - locations->SetInAt(HMultiplyAccumulate::kInputMulRightIndex, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -void InstructionCodeGeneratorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { - LocationSummary* locations = instr->GetLocations(); - Register res = locations->Out().AsRegister<Register>(); - Register accumulator = locations->InAt(HMultiplyAccumulate::kInputAccumulatorIndex) - .AsRegister<Register>(); - Register mul_left = locations->InAt(HMultiplyAccumulate::kInputMulLeftIndex) - .AsRegister<Register>(); - Register mul_right = locations->InAt(HMultiplyAccumulate::kInputMulRightIndex) - .AsRegister<Register>(); - - if (instr->GetOpKind() == HInstruction::kAdd) { - __ mla(res, mul_left, mul_right, accumulator); - } else { - __ mls(res, mul_left, mul_right, accumulator); - } -} - void LocationsBuilderARM::VisitBoundType(HBoundType* instruction ATTRIBUTE_UNUSED) { // Nothing to do, this should be removed during prepare for register allocator.
LOG(FATAL) << "Unreachable"; diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index df2126c653..d45ea973f9 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -159,7 +159,6 @@ class LocationsBuilderARM : public HGraphVisitor { FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION @@ -198,7 +197,6 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 3fdd7186d1..c0e3959933 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -584,6 +584,56 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { } } +// Slow path marking an object during a read barrier. +class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { + public: + ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location out, Location obj) + : instruction_(instruction), out_(out), obj_(obj) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARM64"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Primitive::Type type = Primitive::kPrimNot; + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg())); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + arm64_codegen->MoveLocation(LocationFrom(calling_convention.GetRegisterAt(0)), obj_, type); + arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark), + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>(); + arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); + + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + private: + HInstruction* const instruction_; + const Location out_; + const Location obj_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); +}; + // Slow path generating a read barrier for a heap reference. 
class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { public: @@ -605,7 +655,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { // to be instrumented, e.g.: // // __ Ldr(out, HeapOperand(out, class_offset)); - // codegen_->GenerateReadBarrier(instruction, out_loc, out_loc, out_loc, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); // // In that case, we have lost the information about the original // object, and the emitted read barrier cannot work properly. @@ -621,7 +671,9 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg())); DCHECK(!instruction_->IsInvoke() || (instruction_->IsInvokeStaticOrDirect() && - instruction_->GetLocations()->Intrinsified())); + instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); // The read barrier instrumentation does not support the // HArm64IntermediateAddress instruction yet. DCHECK(!(instruction_->IsArrayGet() && @@ -769,14 +821,18 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { class ReadBarrierForRootSlowPathARM64 : public SlowPathCodeARM64 { public: ReadBarrierForRootSlowPathARM64(HInstruction* instruction, Location out, Location root) - : instruction_(instruction), out_(out), root_(root) {} + : instruction_(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); Primitive::Type type = Primitive::kPrimNot; DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg())); - DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); @@ -1338,7 +1394,8 @@ void CodeGeneratorARM64::Load(Primitive::Type type, void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, CPURegister dst, - const MemOperand& src) { + const MemOperand& src, + bool needs_null_check) { MacroAssembler* masm = GetVIXLAssembler(); BlockPoolsScope block_pools(masm); UseScratchRegisterScope temps(masm); @@ -1354,20 +1411,28 @@ void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, switch (type) { case Primitive::kPrimBoolean: __ Ldarb(Register(dst), base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } break; case Primitive::kPrimByte: __ Ldarb(Register(dst), base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } __ Sbfx(Register(dst), Register(dst), 0, Primitive::ComponentSize(type) * kBitsPerByte); break; case Primitive::kPrimChar: __ Ldarh(Register(dst), base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } break; case Primitive::kPrimShort: __ Ldarh(Register(dst), base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } __ Sbfx(Register(dst), Register(dst), 0, Primitive::ComponentSize(type) * kBitsPerByte); break; case Primitive::kPrimInt: @@
-1375,7 +1440,9 @@ void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, case Primitive::kPrimLong: DCHECK_EQ(dst.Is64Bits(), Primitive::Is64BitType(type)); __ Ldar(Register(dst), base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: { @@ -1384,7 +1451,9 @@ void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction, Register temp = dst.Is64Bits() ? temps.AcquireX() : temps.AcquireW(); __ Ldar(temp, base); - MaybeRecordImplicitNullCheck(instruction); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } __ Fmov(FPRegister(dst), temp); break; } @@ -1505,7 +1574,7 @@ void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCod __ Bind(slow_path->GetExitLabel()); } -void InstructionCodeGeneratorARM64::GenerateMemoryBarrier(MemBarrierKind kind) { +void CodeGeneratorARM64::GenerateMemoryBarrier(MemBarrierKind kind) { BarrierType type = BarrierAll; switch (kind) { @@ -1641,33 +1710,62 @@ void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) { void InstructionCodeGeneratorARM64::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) { DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + LocationSummary* locations = instruction->GetLocations(); + Location base_loc = locations->InAt(0); + Location out = locations->Out(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); Primitive::Type field_type = field_info.GetFieldType(); BlockPoolsScope block_pools(GetVIXLAssembler()); MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), field_info.GetFieldOffset()); bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease(); - if (field_info.IsVolatile()) { - if (use_acquire_release) { - // NB: LoadAcquire will record the pc info if needed. - codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field); + if (field_type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Object FieldGet with Baker's read barrier case. + MacroAssembler* masm = GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + // /* HeapReference<Object> */ out = *(base + offset) + Register base = RegisterFrom(base_loc, Primitive::kPrimNot); + Register temp = temps.AcquireW(); + // Note that potential implicit null checks are handled in this + // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier call. + codegen_->GenerateFieldLoadWithBakerReadBarrier( + instruction, + out, + base, + offset, + temp, + /* needs_null_check */ true, + field_info.IsVolatile() && use_acquire_release); + if (field_info.IsVolatile() && !use_acquire_release) { + // For IRIW sequential consistency kLoadAny is not sufficient. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } + } else { + // General case. + if (field_info.IsVolatile()) { + if (use_acquire_release) { + // Note that a potential implicit null check is handled in this + // CodeGeneratorARM64::LoadAcquire call. + // NB: LoadAcquire will record the pc info if needed. + codegen_->LoadAcquire( + instruction, OutputCPURegister(instruction), field, /* needs_null_check */ true); + } else { + codegen_->Load(field_type, OutputCPURegister(instruction), field); + codegen_->MaybeRecordImplicitNullCheck(instruction); + // For IRIW sequential consistency kLoadAny is not sufficient. 
+ codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } } else { codegen_->Load(field_type, OutputCPURegister(instruction), field); codegen_->MaybeRecordImplicitNullCheck(instruction); - // For IRIW sequential consistency kLoadAny is not sufficient. - GenerateMemoryBarrier(MemBarrierKind::kAnyAny); } - } else { - codegen_->Load(field_type, OutputCPURegister(instruction), field); - codegen_->MaybeRecordImplicitNullCheck(instruction); - } - - if (field_type == Primitive::kPrimNot) { - LocationSummary* locations = instruction->GetLocations(); - Location base = locations->InAt(0); - Location out = locations->Out(); - uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - codegen_->MaybeGenerateReadBarrier(instruction, out, out, base, offset); + if (field_type == Primitive::kPrimNot) { + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out, out, base_loc, offset); + } } } @@ -1713,10 +1811,10 @@ void InstructionCodeGeneratorARM64::HandleFieldSet(HInstruction* instruction, codegen_->StoreRelease(field_type, source, HeapOperand(obj, offset)); codegen_->MaybeRecordImplicitNullCheck(instruction); } else { - GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyStore); codegen_->Store(field_type, source, HeapOperand(obj, offset)); codegen_->MaybeRecordImplicitNullCheck(instruction); - GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + codegen_->GenerateMemoryBarrier(MemBarrierKind::kAnyAny); } } else { codegen_->Store(field_type, source, HeapOperand(obj, offset)); @@ -1952,27 +2050,21 @@ void InstructionCodeGeneratorARM64::VisitArm64IntermediateAddress( Operand(InputOperandAt(instruction, 1))); } -void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { +void LocationsBuilderARM64::VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); - HInstruction* accumulator = instr->InputAt(HMultiplyAccumulate::kInputAccumulatorIndex); - if (instr->GetOpKind() == HInstruction::kSub && - accumulator->IsConstant() && - accumulator->AsConstant()->IsZero()) { - // Don't allocate register for Mneg instruction. 
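The field-access hunks above guard volatile accesses with kAnyStore/kAnyAny barriers when Load-Acquire/Store-Release instructions are not preferred; the IRIW comments refer to the independent-reads-of-independent-writes litmus test, which plain acquire loads do not satisfy. A rough C++ analogy, assuming kAnyStore maps to a release fence and kAnyAny to a full fence (the generated code emits DMB barriers via GenerateMemoryBarrier, not C++ fences):

  #include <atomic>

  std::atomic<int> payload{0};

  void StoreVolatileAnalogy(int value) {
    std::atomic_thread_fence(std::memory_order_release);  // kAnyStore before the store
    payload.store(value, std::memory_order_relaxed);      // the store itself
    std::atomic_thread_fence(std::memory_order_seq_cst);  // kAnyAny: full barrier after it
  }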
- } else { - locations->SetInAt(HMultiplyAccumulate::kInputAccumulatorIndex, - Location::RequiresRegister()); - } - locations->SetInAt(HMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresRegister()); - locations->SetInAt(HMultiplyAccumulate::kInputMulRightIndex, Location::RequiresRegister()); + locations->SetInAt(HArm64MultiplyAccumulate::kInputAccumulatorIndex, + Location::RequiresRegister()); + locations->SetInAt(HArm64MultiplyAccumulate::kInputMulLeftIndex, Location::RequiresRegister()); + locations->SetInAt(HArm64MultiplyAccumulate::kInputMulRightIndex, Location::RequiresRegister()); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } -void InstructionCodeGeneratorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { +void InstructionCodeGeneratorARM64::VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instr) { Register res = OutputRegister(instr); - Register mul_left = InputRegisterAt(instr, HMultiplyAccumulate::kInputMulLeftIndex); - Register mul_right = InputRegisterAt(instr, HMultiplyAccumulate::kInputMulRightIndex); + Register accumulator = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputAccumulatorIndex); + Register mul_left = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputMulLeftIndex); + Register mul_right = InputRegisterAt(instr, HArm64MultiplyAccumulate::kInputMulRightIndex); // Avoid emitting code that could trigger Cortex A53's erratum 835769. // This fixup should be carried out for all multiply-accumulate instructions: @@ -1992,18 +2084,10 @@ void InstructionCodeGeneratorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* } if (instr->GetOpKind() == HInstruction::kAdd) { - Register accumulator = InputRegisterAt(instr, HMultiplyAccumulate::kInputAccumulatorIndex); __ Madd(res, mul_left, mul_right, accumulator); } else { DCHECK(instr->GetOpKind() == HInstruction::kSub); - HInstruction* accum_instr = instr->InputAt(HMultiplyAccumulate::kInputAccumulatorIndex); - if (accum_instr->IsConstant() && accum_instr->AsConstant()->IsZero()) { - __ Mneg(res, mul_left, mul_right); - } else { - Register accumulator = InputRegisterAt(instr, - HMultiplyAccumulate::kInputAccumulatorIndex); - __ Msub(res, mul_left, mul_right, accumulator); - } + __ Msub(res, mul_left, mul_right, accumulator); } } @@ -2035,50 +2119,62 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { LocationSummary* locations = instruction->GetLocations(); Location index = locations->InAt(1); uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value(); - MemOperand source = HeapOperand(obj); - CPURegister dest = OutputCPURegister(instruction); + Location out = locations->Out(); MacroAssembler* masm = GetVIXLAssembler(); UseScratchRegisterScope temps(masm); // Block pools between `Load` and `MaybeRecordImplicitNullCheck`. BlockPoolsScope block_pools(masm); - if (index.IsConstant()) { - offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type); - source = HeapOperand(obj, offset); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // Object ArrayGet with Baker's read barrier case. + Register temp = temps.AcquireW(); + // The read barrier instrumentation does not support the + // HArm64IntermediateAddress instruction yet. + DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress()); + // Note that a potential implicit null check is handled in the + // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. 
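The multiply-accumulate hunk above returns to the ARM64-specific HArm64MultiplyAccumulate and selects MADD or MSUB; the Cortex-A53 erratum 835769 fixup it references inserts a nop between a 64-bit memory operation and a following multiply-accumulate. A scalar sketch of the selected instructions' semantics (standard A64 behavior, not ART code):

  #include <cstdint>

  // MADD: res = accumulator + mul_left * mul_right (AArch32 counterpart: MLA)
  int64_t Madd(int64_t acc, int64_t left, int64_t right) { return acc + left * right; }

  // MSUB: res = accumulator - mul_left * mul_right (AArch32 counterpart: MLS)
  int64_t Msub(int64_t acc, int64_t left, int64_t right) { return acc - left * right; }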
+ codegen_->GenerateArrayLoadWithBakerReadBarrier( + instruction, out, obj.W(), offset, index, temp, /* needs_null_check */ true); } else { - Register temp = temps.AcquireSameSizeAs(obj); - if (instruction->GetArray()->IsArm64IntermediateAddress()) { - // The read barrier instrumentation does not support the - // HArm64IntermediateAddress instruction yet. - DCHECK(!kEmitCompilerReadBarrier); - // We do not need to compute the intermediate address from the array: the - // input instruction has done it already. See the comment in - // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. - if (kIsDebugBuild) { - HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); - DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == offset); - } - temp = obj; + // General case. + MemOperand source = HeapOperand(obj); + if (index.IsConstant()) { + offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type); + source = HeapOperand(obj, offset); } else { - __ Add(temp, obj, offset); + Register temp = temps.AcquireSameSizeAs(obj); + if (instruction->GetArray()->IsArm64IntermediateAddress()) { + // The read barrier instrumentation does not support the + // HArm64IntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + // We do not need to compute the intermediate address from the array: the + // input instruction has done it already. See the comment in + // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`. + if (kIsDebugBuild) { + HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress(); + DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), offset); + } + temp = obj; + } else { + __ Add(temp, obj, offset); + } + source = HeapOperand(temp, XRegisterFrom(index), LSL, Primitive::ComponentSizeShift(type)); } - source = HeapOperand(temp, XRegisterFrom(index), LSL, Primitive::ComponentSizeShift(type)); - } - codegen_->Load(type, dest, source); - codegen_->MaybeRecordImplicitNullCheck(instruction); + codegen_->Load(type, OutputCPURegister(instruction), source); + codegen_->MaybeRecordImplicitNullCheck(instruction); - if (type == Primitive::kPrimNot) { - static_assert( - sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), - "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); - Location obj_loc = locations->InAt(0); - Location out = locations->Out(); - if (index.IsConstant()) { - codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset); - } else { - codegen_->MaybeGenerateReadBarrier(instruction, out, out, obj_loc, offset, index); + if (type == Primitive::kPrimNot) { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + Location obj_loc = locations->InAt(0); + if (index.IsConstant()) { + codegen_->MaybeGenerateReadBarrierSlow(instruction, out, out, obj_loc, offset); + } else { + codegen_->MaybeGenerateReadBarrierSlow(instruction, out, out, obj_loc, offset, index); + } } } } @@ -2208,12 +2304,12 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { // __ Mov(temp2, temp); // // /* HeapReference<Class> */ temp = temp->component_type_ // __ Ldr(temp, HeapOperand(temp, component_offset)); - // codegen_->GenerateReadBarrier( + // codegen_->GenerateReadBarrierSlow( // instruction, temp_loc, temp_loc, temp2_loc, component_offset); // // // /* HeapReference<Class> */ temp2 = 
value->klass_ // __ Ldr(temp2, HeapOperand(Register(value), class_offset)); - // codegen_->GenerateReadBarrier( + // codegen_->GenerateReadBarrierSlow( // instruction, temp2_loc, temp2_loc, value_loc, class_offset, temp_loc); // // __ Cmp(temp, temp2); @@ -2936,6 +3032,14 @@ void InstructionCodeGeneratorARM64::VisitInstanceFieldSet(HInstanceFieldSet* ins HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull()); } +static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) { + return kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck); +} + void LocationsBuilderARM64::VisitInstanceOf(HInstanceOf* instruction) { LocationSummary::CallKind call_kind = LocationSummary::kNoCall; TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); @@ -2962,21 +3066,22 @@ void LocationsBuilderARM64::VisitInstanceOf(HInstanceOf* instruction) { locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); // When read barriers are enabled, we need a temporary register for // some cases. - if (kEmitCompilerReadBarrier && - (type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + if (TypeCheckNeedsATemporary(type_check_kind)) { locations->AddTemp(Location::RequiresRegister()); } } void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); Location obj_loc = locations->InAt(0); Register obj = InputRegisterAt(instruction, 0); Register cls = InputRegisterAt(instruction, 1); Location out_loc = locations->Out(); Register out = OutputRegister(instruction); + Location maybe_temp_loc = TypeCheckNeedsATemporary(type_check_kind) ? + locations->GetTemp(0) : + Location::NoLocation(); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); @@ -2992,10 +3097,9 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { } // /* HeapReference<Class> */ out = obj->klass_ - __ Ldr(out, HeapOperand(obj.W(), class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc); - switch (instruction->GetTypeCheckKind()) { + switch (type_check_kind) { case TypeCheckKind::kExactCheck: { __ Cmp(out, cls); __ Cset(out, eq); @@ -3010,17 +3114,8 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { // object to avoid doing a comparison we know will fail. vixl::Label loop, success; __ Bind(&loop); - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. 
- Register temp = WRegisterFrom(temp_loc); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->super_class_ - __ Ldr(out, HeapOperand(out, super_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc); // If `out` is null, we use it for the result, and jump to `done`. __ Cbz(out, &done); __ Cmp(out, cls); @@ -3038,17 +3133,8 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { __ Bind(&loop); __ Cmp(out, cls); __ B(eq, &success); - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp = WRegisterFrom(temp_loc); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->super_class_ - __ Ldr(out, HeapOperand(out, super_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc); __ Cbnz(out, &loop); // If `out` is null, we use it for the result, and jump to `done`. __ B(&done); @@ -3066,17 +3152,8 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { __ Cmp(out, cls); __ B(eq, &exact_check); // Otherwise, we need to check that the object's class is a non-primitive array. - Location temp_loc = kEmitCompilerReadBarrier ? locations->GetTemp(0) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `out` into `temp` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp = WRegisterFrom(temp_loc); - __ Mov(temp, out); - } // /* HeapReference<Class> */ out = out->component_type_ - __ Ldr(out, HeapOperand(out, component_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, out_loc, out_loc, temp_loc, component_offset); + GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc); // If `out` is null, we use it for the result, and jump to `done`. __ Cbz(out, &done); __ Ldrh(out, HeapOperand(out, primitive_offset)); @@ -3115,6 +3192,13 @@ void InstructionCodeGeneratorARM64::VisitInstanceOf(HInstanceOf* instruction) { // HInstanceOf instruction (following the runtime calling // convention), which might be cluttered by the potential first // read barrier emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. DCHECK(locations->OnlyCallsOnSlowPath()); slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction, /* is_fatal */ false); @@ -3167,30 +3251,29 @@ void LocationsBuilderARM64::VisitCheckCast(HCheckCast* instruction) { locations->SetInAt(1, Location::RequiresRegister()); // Note that TypeCheckSlowPathARM64 uses this "temp" register too. 
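The TODO above proposes a runtime entry point that takes the object rather than its class, so the runtime can handle read barriers itself. A hypothetical shape for such an entry point (it does not exist in the runtime; the name and signature are invented for illustration):

  // Hypothetical only: would let the runtime follow the object's class
  // pointer itself, read barrier included.
  extern "C" uint32_t artInstanceOfFromObject(mirror::Object* obj, mirror::Class* klass) {
    return (obj != nullptr && obj->InstanceOf(klass)) ? 1u : 0u;
  }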
locations->AddTemp(Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); // When read barriers are enabled, we need an additional temporary // register for some cases. - if (kEmitCompilerReadBarrier && - (type_check_kind == TypeCheckKind::kAbstractClassCheck || - type_check_kind == TypeCheckKind::kClassHierarchyCheck || - type_check_kind == TypeCheckKind::kArrayObjectCheck)) { - locations->AddTemp(Location::RequiresRegister()); + if (TypeCheckNeedsATemporary(type_check_kind)) { + locations->AddTemp(Location::RequiresRegister()); } } void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); Location obj_loc = locations->InAt(0); Register obj = InputRegisterAt(instruction, 0); Register cls = InputRegisterAt(instruction, 1); Location temp_loc = locations->GetTemp(0); + Location maybe_temp2_loc = TypeCheckNeedsATemporary(type_check_kind) ? + locations->GetTemp(1) : + Location::NoLocation(); Register temp = WRegisterFrom(temp_loc); uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); bool is_type_check_slow_path_fatal = (type_check_kind == TypeCheckKind::kExactCheck || type_check_kind == TypeCheckKind::kAbstractClassCheck || @@ -3209,8 +3292,7 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { } // /* HeapReference<Class> */ temp = obj->klass_ - __ Ldr(temp, HeapOperand(obj, class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc); switch (type_check_kind) { case TypeCheckKind::kExactCheck: @@ -3227,18 +3309,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { // object to avoid doing a comparison we know will fail. vixl::Label loop, compare_classes; __ Bind(&loop); - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = WRegisterFrom(temp2_loc); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->super_class_ - __ Ldr(temp, HeapOperand(temp, super_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc); // If the class reference currently in `temp` is not null, jump // to the `compare_classes` label to compare it with the checked @@ -3250,8 +3322,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. 
// /* HeapReference<Class> */ temp = obj->klass_ - __ Ldr(temp, HeapOperand(obj, class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters( + instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc); __ B(type_check_slow_path->GetEntryLabel()); __ Bind(&compare_classes); @@ -3267,18 +3339,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { __ Cmp(temp, cls); __ B(eq, &done); - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = WRegisterFrom(temp2_loc); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->super_class_ - __ Ldr(temp, HeapOperand(temp, super_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, temp2_loc, super_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc); // If the class reference currently in `temp` is not null, jump // back at the beginning of the loop. @@ -3289,8 +3351,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. // /* HeapReference<Class> */ temp = obj->klass_ - __ Ldr(temp, HeapOperand(obj, class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters( + instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc); __ B(type_check_slow_path->GetEntryLabel()); break; } @@ -3302,19 +3364,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { __ B(eq, &done); // Otherwise, we need to check that the object's class is a non-primitive array. - Location temp2_loc = - kEmitCompilerReadBarrier ? locations->GetTemp(1) : Location::NoLocation(); - if (kEmitCompilerReadBarrier) { - // Save the value of `temp` into `temp2` before overwriting it - // in the following move operation, as we will need it for the - // read barrier below. - Register temp2 = WRegisterFrom(temp2_loc); - __ Mov(temp2, temp); - } // /* HeapReference<Class> */ temp = temp->component_type_ - __ Ldr(temp, HeapOperand(temp, component_offset)); - codegen_->MaybeGenerateReadBarrier( - instruction, temp_loc, temp_loc, temp2_loc, component_offset); + GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc); // If the component type is not null (i.e. the object is indeed // an array), jump to label `check_non_primitive_component_type` @@ -3327,8 +3378,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { // going into the slow path, as it has been overwritten in the // meantime. // /* HeapReference<Class> */ temp = obj->klass_ - __ Ldr(temp, HeapOperand(obj, class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters( + instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc); __ B(type_check_slow_path->GetEntryLabel()); __ Bind(&check_non_primitive_component_type); @@ -3337,8 +3388,8 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { __ Cbz(temp, &done); // Same comment as above regarding `temp` and the slow path. 
// /* HeapReference<Class> */ temp = obj->klass_ - __ Ldr(temp, HeapOperand(obj, class_offset)); - codegen_->MaybeGenerateReadBarrier(instruction, temp_loc, temp_loc, obj_loc, class_offset); + GenerateReferenceLoadTwoRegisters( + instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc); __ B(type_check_slow_path->GetEntryLabel()); break; } @@ -3355,6 +3406,13 @@ void InstructionCodeGeneratorARM64::VisitCheckCast(HCheckCast* instruction) { // instruction (following the runtime calling convention), which // might be cluttered by the potential first read barrier // emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. __ B(type_check_slow_path->GetEntryLabel()); break; } @@ -3480,7 +3538,7 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARM64* codege HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARM64::GetSupportedInvokeStaticOrDirectDispatch( const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info, MethodReference target_method ATTRIBUTE_UNUSED) { - // On arm64 we support all dispatch types. + // On ARM64 we support all dispatch types. return desired_dispatch_info; } @@ -3757,32 +3815,17 @@ void InstructionCodeGeneratorARM64::VisitLoadClass(HLoadClass* cls) { if (cls->IsReferrersClass()) { DCHECK(!cls->CanCallRuntime()); DCHECK(!cls->MustGenerateClinitCheck()); - uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value(); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_) - __ Add(out.X(), current_method.X(), declaring_class_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - __ Ldr(out, MemOperand(current_method, declaring_class_offset)); - } + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad( + cls, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); } else { MemberOffset resolved_types_offset = ArtMethod::DexCacheResolvedTypesOffset(kArm64PointerSize); // /* GcRoot<mirror::Class>[] */ out = // current_method.ptr_sized_fields_->dex_cache_resolved_types_ __ Ldr(out.X(), MemOperand(current_method, resolved_types_offset.Int32Value())); - - size_t cache_offset = CodeGenerator::GetCacheOffset(cls->GetTypeIndex()); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &out[type_index] - __ Add(out.X(), out.X(), cache_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(cls, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = out[type_index] - __ Ldr(out, MemOperand(out.X(), cache_offset)); - } + // /* GcRoot<mirror::Class> */ out = out[type_index] + GenerateGcRootFieldLoad( + cls, out_loc, out.X(), CodeGenerator::GetCacheOffset(cls->GetTypeIndex())); if (!cls->IsInDexCache() || cls->MustGenerateClinitCheck()) { DCHECK(cls->CanCallRuntime()); @@ -3845,30 +3888,14 @@ void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) { Register out = OutputRegister(load); Register current_method = 
InputRegisterAt(load, 0); - uint32_t declaring_class_offset = ArtMethod::DeclaringClassOffset().Int32Value(); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::Class>* */ out = &(current_method->declaring_class_) - __ Add(out.X(), current_method.X(), declaring_class_offset); - // /* mirror::Class* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc); - } else { - // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ - __ Ldr(out, MemOperand(current_method, declaring_class_offset)); - } - + // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_ + GenerateGcRootFieldLoad( + load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value()); // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_ __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value())); - - size_t cache_offset = CodeGenerator::GetCacheOffset(load->GetStringIndex()); - if (kEmitCompilerReadBarrier) { - // /* GcRoot<mirror::String>* */ out = &out[string_index] - __ Add(out.X(), out.X(), cache_offset); - // /* mirror::String* */ out = out->Read() - codegen_->GenerateReadBarrierForRoot(load, out_loc, out_loc); - } else { - // /* GcRoot<mirror::String> */ out = out[string_index] - __ Ldr(out, MemOperand(out.X(), cache_offset)); - } + // /* GcRoot<mirror::String> */ out = out[string_index] + GenerateGcRootFieldLoad( + load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex())); if (!load->IsInDexCache()) { SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load); @@ -4237,7 +4264,7 @@ void LocationsBuilderARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { } void InstructionCodeGeneratorARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) { - GenerateMemoryBarrier(memory_barrier->GetBarrierKind()); + codegen_->GenerateMemoryBarrier(memory_barrier->GetBarrierKind()); } void LocationsBuilderARM64::VisitReturn(HReturn* instruction) { @@ -4622,14 +4649,288 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst } } -void CodeGeneratorARM64::GenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index) { +void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp) { + Primitive::Type type = Primitive::kPrimNot; + Register out_reg = RegisterFrom(out, type); + if (kEmitCompilerReadBarrier) { + Register temp_reg = RegisterFrom(maybe_temp, type); + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + out_reg, + offset, + temp_reg, + /* needs_null_check */ false, + /* use_load_acquire */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `maybe_temp` before overwriting it + // in the following move operation, as we will need it for the + // read barrier below. + __ Mov(temp_reg, out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ Ldr(out_reg, HeapOperand(out_reg, offset)); + codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset); + } + } else { + // Plain load with no read barrier. 
+ // /* HeapReference<Object> */ out = *(out + offset) + __ Ldr(out_reg, HeapOperand(out_reg, offset)); + GetAssembler()->MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorARM64::GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp) { + Primitive::Type type = Primitive::kPrimNot; + Register out_reg = RegisterFrom(out, type); + Register obj_reg = RegisterFrom(obj, type); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + Register temp_reg = RegisterFrom(maybe_temp, type); + // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj_reg, + offset, + temp_reg, + /* needs_null_check */ false, + /* use_load_acquire */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ Ldr(out_reg, HeapOperand(obj_reg, offset)); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ Ldr(out_reg, HeapOperand(obj_reg, offset)); + GetAssembler()->MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + vixl::Register obj, + uint32_t offset) { + Register root_reg = RegisterFrom(root, Primitive::kPrimNot); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barriers are used: + // + // root = obj.field; + // if (Thread::Current()->GetIsGcMarking()) { + // root = ReadBarrier::Mark(root); + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ Ldr(root_reg, MemOperand(obj, offset)); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path used to mark the GC root `root`. + SlowPathCodeARM64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root, root); + codegen_->AddSlowPath(slow_path); + + MacroAssembler* masm = GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + Register temp = temps.AcquireW(); + // temp = Thread::Current()->GetIsGcMarking() + __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset<kArm64WordSize>().Int32Value())); + __ Cbnz(temp, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ Add(root_reg.X(), obj.X(), offset); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } + } else { + // Plain GC root load with no read barrier. + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ Ldr(root_reg, MemOperand(obj, offset)); + // Note that GC roots are not affected by heap poisoning, thus we + // do not have to unpoison `root_reg` here.
+ } +} + +void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t offset, + Register temp, + bool needs_null_check, + bool use_load_acquire) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + GenerateReferenceLoadWithBakerReadBarrier( + instruction, ref, obj, offset, no_index, temp, needs_null_check, use_load_acquire); +} + +void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t data_offset, + Location index, + Register temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // Array cells are never volatile variables, therefore array loads + // never use Load-Acquire instructions on ARM64. + const bool use_load_acquire = false; + + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + GenerateReferenceLoadWithBakerReadBarrier( + instruction, ref, obj, data_offset, index, temp, needs_null_check, use_load_acquire); +} + +void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t offset, + Location index, + Register temp, + bool needs_null_check, + bool use_load_acquire) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + // If `index` is a valid location, then we are emitting an array + // load, so we shouldn't be using a Load Acquire instruction. + // In other words: `index.IsValid()` => `!use_load_acquire`. + DCHECK(!index.IsValid() || !use_load_acquire); + + MacroAssembler* masm = GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::gray_ptr_); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
+ + Primitive::Type type = Primitive::kPrimNot; + Register ref_reg = RegisterFrom(ref, type); + DCHECK(obj.IsW()); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ Ldr(temp, HeapOperand(obj, monitor_offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + // /* uint32_t */ rb_state = lock_word.ReadBarrierState() + __ Lsr(temp, temp, LockWord::kReadBarrierStateShift); + __ And(temp, temp, Operand(LockWord::kReadBarrierStateMask)); + static_assert( + LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, + "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); + + // Introduce a dependency on the high bits of rb_state, which shall + // be all zeroes, to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // temp2 = rb_state & ~LockWord::kReadBarrierStateMask = 0 + Register temp2 = temps.AcquireW(); + __ Bic(temp2, temp, Operand(LockWord::kReadBarrierStateMask)); + // obj is unchanged by this operation, but its value now depends on + // temp2, which depends on temp. + __ Add(obj, obj, Operand(temp2)); + temps.Release(temp2); + + // The actual reference load. + if (index.IsValid()) { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + temp2 = temps.AcquireW(); + // /* HeapReference<Object> */ ref = + // *(obj + offset + index * sizeof(HeapReference<Object>)) + MemOperand source = HeapOperand(obj); + if (index.IsConstant()) { + uint32_t computed_offset = + offset + (Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type)); + source = HeapOperand(obj, computed_offset); + } else { + __ Add(temp2, obj, offset); + source = HeapOperand(temp2, XRegisterFrom(index), LSL, Primitive::ComponentSizeShift(type)); + } + Load(type, ref_reg, source); + temps.Release(temp2); + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + MemOperand field = HeapOperand(obj, offset); + if (use_load_acquire) { + LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false); + } else { + Load(type, ref_reg, field); + } + } + + // Object* ref = ref_addr->AsMirrorPtr() + GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); + + // Slow path used to mark the object `ref` when it is gray. + SlowPathCodeARM64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref, ref); + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::gray_ptr_) + // ref = ReadBarrier::Mark(ref); + __ Cmp(temp, ReadBarrier::gray_ptr_); + __ B(eq, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM64::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the reference load. + // // If heap poisoning is enabled, the unpoisoning of the loaded // reference will be carried out by the runtime within the slow // path. 
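In the sequence above, the Lsr/And pair extracts the read-barrier state from the lock word, and the Bic/Add pair fabricates a value that is always zero yet data-dependent on the monitor load, so the reference load cannot be hoisted above it without a memory barrier. The extraction in plain C++ (LockWord constants as used in this file):

  #include <cstdint>

  // Mirrors the emitted Lsr + And pair.
  uint32_t ReadBarrierStateOf(uint32_t monitor) {
    return (monitor >> LockWord::kReadBarrierStateShift) & LockWord::kReadBarrierStateMask;
  }

  // temp2 = rb_state & ~kReadBarrierStateMask is always zero, so adding it
  // to the base register leaves the address unchanged while making the
  // subsequent load depend on the monitor load.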
@@ -4643,57 +4944,41 @@ void CodeGeneratorARM64::GenerateReadBarrier(HInstruction* instruction, ReadBarrierForHeapReferenceSlowPathARM64(instruction, out, ref, obj, offset, index); AddSlowPath(slow_path); - // TODO: When read barrier has a fast path, add it here. - /* Currently the read barrier call is inserted after the original load. - * However, if we have a fast path, we need to perform the load of obj.LockWord *before* the - * original load. This load-load ordering is required by the read barrier. - * The fast path/slow path (for Baker's algorithm) should look like: - * - * bool isGray = obj.LockWord & kReadBarrierMask; - * lfence; // load fence or artificial data dependence to prevent load-load reordering - * ref = obj.field; // this is the original load - * if (isGray) { - * ref = Mark(ref); // ideally the slow path just does Mark(ref) - * } - */ - __ B(slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } -void CodeGeneratorARM64::MaybeGenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index) { +void CodeGeneratorARM64::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier). + DCHECK(!kUseBakerReadBarrier); // If heap poisoning is enabled, unpoisoning will be taken care of // by the runtime within the slow path. - GenerateReadBarrier(instruction, out, ref, obj, offset, index); + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); } else if (kPoisonHeapReferences) { GetAssembler()->UnpoisonHeapReference(WRegisterFrom(out)); } } -void CodeGeneratorARM64::GenerateReadBarrierForRoot(HInstruction* instruction, - Location out, - Location root) { +void CodeGeneratorARM64::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { DCHECK(kEmitCompilerReadBarrier); + // Insert a slow path based read barrier *after* the GC root load. + // // Note that GC roots are not affected by heap poisoning, so we do // not need to do anything special for this here. 
SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARM64(instruction, out, root); AddSlowPath(slow_path); - // TODO: Implement a fast path for ReadBarrierForRoot, performing - // the following operation (for Baker's algorithm): - // - // if (thread.tls32_.is_gc_marking) { - // root = Mark(root); - // } - __ B(slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 98303f67ad..a9d1bbde98 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -195,7 +195,6 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION @@ -209,14 +208,53 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { private: void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, vixl::Register class_reg); - void GenerateMemoryBarrier(MemBarrierKind kind); void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor); void HandleBinaryOp(HBinaryOperation* instr); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info, bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); void HandleCondition(HCondition* instruction); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a read barrier and + // shall be a register in that case; it may be an invalid location + // otherwise. + void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a Baker's (fast + // path) read barrier and shall be a register in that case; it may + // be an invalid location otherwise. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp); + // Generate a GC root reference load: + // + // root <- *(obj + offset) + // + // while honoring read barriers (if any). + void GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + vixl::Register obj, + uint32_t offset); + void HandleShift(HBinaryOperation* instr); void GenerateImplicitNullCheck(HNullCheck* instruction); void GenerateExplicitNullCheck(HNullCheck* instruction); @@ -246,7 +284,6 @@ class LocationsBuilderARM64 : public HGraphVisitor { FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) - FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) #undef DECLARE_VISIT_INSTRUCTION @@ -339,6 +376,8 @@ class CodeGeneratorARM64 : public CodeGenerator { // Emit a write barrier. void MarkGCCard(vixl::Register object, vixl::Register value, bool value_can_be_null); + void GenerateMemoryBarrier(MemBarrierKind kind); + // Register allocation. 
void SetupBlockedRegisters() const OVERRIDE; @@ -388,9 +427,12 @@ class CodeGeneratorARM64 : public CodeGenerator { void AddLocationAsTemp(Location location, LocationSummary* locations) OVERRIDE; void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src); - void Store(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst); - void LoadAcquire(HInstruction* instruction, vixl::CPURegister dst, const vixl::MemOperand& src); - void StoreRelease(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst); + void Store(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); + void LoadAcquire(HInstruction* instruction, + vixl::CPURegister dst, + const vixl::MemOperand& src, + bool needs_null_check); + void StoreRelease(Primitive::Type type, vixl::CPURegister src, const vixl::MemOperand& dst); // Generate code to invoke a runtime entry point. void InvokeRuntime(QuickEntrypointEnum entrypoint, @@ -425,7 +467,27 @@ class CodeGeneratorARM64 : public CodeGenerator { void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; - // Generate a read barrier for a heap reference within `instruction`. + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t offset, + vixl::Register temp, + bool needs_null_check, + bool use_load_acquire); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t data_offset, + Location index, + vixl::Register temp, + bool needs_null_check); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. // // A read barrier for an object reference read from the heap is // implemented as a call to the artReadBarrierSlow runtime entry @@ -442,23 +504,25 @@ class CodeGeneratorARM64 : public CodeGenerator { // When `index` is provided (i.e. for array accesses), the offset // value passed to artReadBarrierSlow is adjusted to take `index` // into account. - void GenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index = Location::NoLocation()); - - // If read barriers are enabled, generate a read barrier for a heap reference. - // If heap poisoning is enabled, also unpoison the reference in `out`. - void MaybeGenerateReadBarrier(HInstruction* instruction, - Location out, - Location ref, - Location obj, - uint32_t offset, - Location index = Location::NoLocation()); - - // Generate a read barrier for a GC root within `instruction`. + void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. 
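+ // For illustration, a caller first emits the GC root load itself and then + // invokes this helper with `out` and `root` describing the loaded value.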
// // A read barrier for an object reference GC root is implemented as // a call to the artReadBarrierForRootSlow runtime entry point, @@ -468,9 +532,20 @@ class CodeGeneratorARM64 : public CodeGenerator { // // The `out` location contains the value returned by // artReadBarrierForRootSlow. - void GenerateReadBarrierForRoot(HInstruction* instruction, Location out, Location root); + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); private: + // Factored implementation of GenerateFieldLoadWithBakerReadBarrier + // and GenerateArrayLoadWithBakerReadBarrier. + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + vixl::Register obj, + uint32_t offset, + Location index, + vixl::Register temp, + bool needs_null_check, + bool use_load_acquire); + using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>; using MethodToLiteralMap = ArenaSafeMap<MethodReference, vixl::Literal<uint64_t>*, diff --git a/compiler/optimizing/dominator_test.cc b/compiler/optimizing/dominator_test.cc index 91e4a997fd..feb8b2092a 100644 --- a/compiler/optimizing/dominator_test.cc +++ b/compiler/optimizing/dominator_test.cc @@ -133,8 +133,9 @@ TEST(OptimizerTest, CFG4) { const uint32_t dominators[] = { kInvalidBlockId, - 0, - kInvalidBlockId + 3, + kInvalidBlockId, + 0 }; TestCode(data1, dominators, sizeof(dominators) / sizeof(int)); diff --git a/compiler/optimizing/graph_test.cc b/compiler/optimizing/graph_test.cc index d4b9b71952..d5305646a8 100644 --- a/compiler/optimizing/graph_test.cc +++ b/compiler/optimizing/graph_test.cc @@ -164,7 +164,7 @@ TEST(GraphTest, IfSuccessorMultipleBackEdges1) { // Ensure there is only one back edge. ASSERT_EQ(if_block->GetPredecessors().size(), 2u); - ASSERT_EQ(if_block->GetPredecessors()[0], entry_block); + ASSERT_EQ(if_block->GetPredecessors()[0], entry_block->GetSingleSuccessor()); ASSERT_NE(if_block->GetPredecessors()[1], if_block); // Ensure the new block is the back edge. @@ -199,7 +199,7 @@ TEST(GraphTest, IfSuccessorMultipleBackEdges2) { // Ensure there is only one back edge. ASSERT_EQ(if_block->GetPredecessors().size(), 2u); - ASSERT_EQ(if_block->GetPredecessors()[0], entry_block); + ASSERT_EQ(if_block->GetPredecessors()[0], entry_block->GetSingleSuccessor()); ASSERT_NE(if_block->GetPredecessors()[1], if_block); // Ensure the new block is the back edge. diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 6b8f61a5b4..32c3a925e0 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -21,7 +21,6 @@ #include <cctype> #include <sstream> -#include "bounds_check_elimination.h" #include "code_generator.h" #include "dead_code_elimination.h" #include "disassembler.h" @@ -427,12 +426,6 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("kind") << (try_boundary->IsEntry() ? 
"entry" : "exit"); } -#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) - void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE { - StartAttributeStream("kind") << instruction->GetOpKind(); - } -#endif - #ifdef ART_ENABLE_CODEGEN_arm64 void VisitArm64DataProcWithShifterOp(HArm64DataProcWithShifterOp* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetInstrKind() << "+" << instruction->GetOpKind(); @@ -440,6 +433,10 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("shift") << instruction->GetShiftAmount(); } } + + void VisitArm64MultiplyAccumulate(HArm64MultiplyAccumulate* instruction) OVERRIDE { + StartAttributeStream("kind") << instruction->GetOpKind(); + } #endif bool IsPass(const char* name) { @@ -508,7 +505,6 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { if (IsPass(LICM::kLoopInvariantCodeMotionPassName) || IsPass(HDeadCodeElimination::kFinalDeadCodeEliminationPassName) || IsPass(HDeadCodeElimination::kInitialDeadCodeEliminationPassName) - || IsPass(BoundsCheckElimination::kBoundsCheckEliminationPassName) || IsPass(SsaBuilder::kSsaBuilderPassName)) { HLoopInformation* info = instruction->GetBlock()->GetLoopInformation(); if (info == nullptr) { diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index 20c4f1f698..2e79df1b84 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -419,7 +419,10 @@ bool HInliner::TryInline(HInvoke* invoke_instruction, ArtMethod* method, bool do size_t inline_max_code_units = compiler_driver_->GetCompilerOptions().GetInlineMaxCodeUnits(); if (code_item->insns_size_in_code_units_ > inline_max_code_units) { VLOG(compiler) << "Method " << PrettyMethod(method) - << " is too big to inline"; + << " is too big to inline: " + << code_item->insns_size_in_code_units_ + << " > " + << inline_max_code_units; return false; } @@ -639,9 +642,12 @@ bool HInliner::TryBuildAndInline(ArtMethod* resolved_method, for (; !it.Done(); it.Advance()) { HBasicBlock* block = it.Current(); - if (block->IsLoopHeader()) { + + if (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible()) { + // Don't inline methods with irreducible loops, they could prevent some + // optimizations to run. VLOG(compiler) << "Method " << PrettyMethod(method_index, callee_dex_file) - << " could not be inlined because it contains a loop"; + << " could not be inlined because it contains an irreducible loop"; return false; } diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc deleted file mode 100644 index db1f9a79aa..0000000000 --- a/compiler/optimizing/instruction_simplifier_arm.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2015 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "instruction_simplifier_arm.h" -#include "instruction_simplifier_shared.h" - -namespace art { -namespace arm { - -void InstructionSimplifierArmVisitor::VisitMul(HMul* instruction) { - if (TryCombineMultiplyAccumulate(instruction, kArm)) { - RecordSimplification(); - } -} - -} // namespace arm -} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm.h b/compiler/optimizing/instruction_simplifier_arm.h deleted file mode 100644 index 379b95d6ae..0000000000 --- a/compiler/optimizing/instruction_simplifier_arm.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2015 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_ARM_H_ -#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_ARM_H_ - -#include "nodes.h" -#include "optimization.h" - -namespace art { -namespace arm { - -class InstructionSimplifierArmVisitor : public HGraphVisitor { - public: - InstructionSimplifierArmVisitor(HGraph* graph, OptimizingCompilerStats* stats) - : HGraphVisitor(graph), stats_(stats) {} - - private: - void RecordSimplification() { - if (stats_ != nullptr) { - stats_->RecordStat(kInstructionSimplificationsArch); - } - } - - void VisitMul(HMul* instruction) OVERRIDE; - - OptimizingCompilerStats* stats_; -}; - - -class InstructionSimplifierArm : public HOptimization { - public: - InstructionSimplifierArm(HGraph* graph, OptimizingCompilerStats* stats) - : HOptimization(graph, "instruction_simplifier_arm", stats) {} - - void Run() OVERRIDE { - InstructionSimplifierArmVisitor visitor(graph_, stats_); - visitor.VisitReversePostOrder(); - } -}; - -} // namespace arm -} // namespace art - -#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_ARM_H_ diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index 83126a5c4d..4bcfc54791 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -17,7 +17,6 @@ #include "instruction_simplifier_arm64.h" #include "common_arm64.h" -#include "instruction_simplifier_shared.h" #include "mirror/array-inl.h" namespace art { @@ -180,6 +179,67 @@ bool InstructionSimplifierArm64Visitor::TryMergeIntoUsersShifterOperand(HInstruc return true; } +bool InstructionSimplifierArm64Visitor::TrySimpleMultiplyAccumulatePatterns( + HMul* mul, HBinaryOperation* input_binop, HInstruction* input_other) { + DCHECK(Primitive::IsIntOrLongType(mul->GetType())); + DCHECK(input_binop->IsAdd() || input_binop->IsSub()); + DCHECK_NE(input_binop, input_other); + if (!input_binop->HasOnlyOneNonEnvironmentUse()) { + return false; + } + + // Try to interpret patterns like + // a * (b <+/-> 1) + // as + // (a * b) <+/-> a + HInstruction* input_a = input_other; + HInstruction* input_b = nullptr; // Set to a non-null value if we found a pattern to optimize. 
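+ // For example (a sketch for the int case), + // a * (b + 1) + // becomes + // HArm64MultiplyAccumulate(kAdd, /* acc */ a, /* mul_left */ a, /* mul_right */ b), + // which the code generator can emit as a single madd res, a, b, a.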
+ HInstruction::InstructionKind op_kind; + + if (input_binop->IsAdd()) { + if ((input_binop->GetConstantRight() != nullptr) && input_binop->GetConstantRight()->IsOne()) { + // Interpret + // a * (b + 1) + // as + // (a * b) + a + input_b = input_binop->GetLeastConstantLeft(); + op_kind = HInstruction::kAdd; + } + } else { + DCHECK(input_binop->IsSub()); + if (input_binop->GetRight()->IsConstant() && + input_binop->GetRight()->AsConstant()->IsMinusOne()) { + // Interpret + // a * (b - (-1)) + // as + // a + (a * b) + input_b = input_binop->GetLeft(); + op_kind = HInstruction::kAdd; + } else if (input_binop->GetLeft()->IsConstant() && + input_binop->GetLeft()->AsConstant()->IsOne()) { + // Interpret + // a * (1 - b) + // as + // a - (a * b) + input_b = input_binop->GetRight(); + op_kind = HInstruction::kSub; + } + } + + if (input_b == nullptr) { + // We did not find a pattern we can optimize. + return false; + } + + HArm64MultiplyAccumulate* mulacc = new(GetGraph()->GetArena()) HArm64MultiplyAccumulate( + mul->GetType(), op_kind, input_a, input_a, input_b, mul->GetDexPc()); + + mul->GetBlock()->ReplaceAndRemoveInstructionWith(mul, mulacc); + input_binop->GetBlock()->RemoveInstruction(input_binop); + + return true; +} + void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) { TryExtractArrayAccessAddress(instruction, instruction->GetArray(), @@ -195,8 +255,75 @@ void InstructionSimplifierArm64Visitor::VisitArraySet(HArraySet* instruction) { } void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) { - if (TryCombineMultiplyAccumulate(instruction, kArm64)) { - RecordSimplification(); + Primitive::Type type = instruction->GetType(); + if (!Primitive::IsIntOrLongType(type)) { + return; + } + + HInstruction* use = instruction->HasNonEnvironmentUses() + ? instruction->GetUses().GetFirst()->GetUser() + : nullptr; + + if (instruction->HasOnlyOneNonEnvironmentUse() && (use->IsAdd() || use->IsSub())) { + // Replace code looking like + // MUL tmp, x, y + // SUB dst, acc, tmp + // with + // MULSUB dst, acc, x, y + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HBinaryOperation* binop = use->AsBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + // Be careful after GVN. This should not happen since the `HMul` has only + // one use. + DCHECK_NE(binop_left, binop_right); + if (binop_right == instruction) { + accumulator = binop_left; + } else if (use->IsAdd()) { + DCHECK_EQ(binop_left, instruction); + accumulator = binop_right; + } + + if (accumulator != nullptr) { + HArm64MultiplyAccumulate* mulacc = + new (GetGraph()->GetArena()) HArm64MultiplyAccumulate(type, + binop->GetKind(), + accumulator, + instruction->GetLeft(), + instruction->GetRight()); + + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!instruction->HasUses()); + instruction->GetBlock()->RemoveInstruction(instruction); + RecordSimplification(); + return; + } + } + + // Use multiply accumulate instruction for a few simple patterns.
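+ // The patterns handled below are a * (b + 1), a * (b - (-1)) and a * (1 - b); + // see TrySimpleMultiplyAccumulatePatterns above.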
+ // We prefer not applying the following transformations if the left and + // right inputs perform the same operation. + // We rely on GVN having squashed the inputs if appropriate. However the + // results are still correct even if that did not happen. + if (instruction->GetLeft() == instruction->GetRight()) { + return; + } + + HInstruction* left = instruction->GetLeft(); + HInstruction* right = instruction->GetRight(); + if ((right->IsAdd() || right->IsSub()) && + TrySimpleMultiplyAccumulatePatterns(instruction, right->AsBinaryOperation(), left)) { + return; + } + if ((left->IsAdd() || left->IsSub()) && + TrySimpleMultiplyAccumulatePatterns(instruction, left->AsBinaryOperation(), right)) { + return; } } diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index 37a34c0373..b7f490bb8c 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -51,6 +51,10 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { return TryMergeIntoShifterOperand(use, bitfield_op, true); } + bool TrySimpleMultiplyAccumulatePatterns(HMul* mul, + HBinaryOperation* input_binop, + HInstruction* input_other); + // HInstruction visitors, sorted alphabetically. void VisitArrayGet(HArrayGet* instruction) OVERRIDE; void VisitArraySet(HArraySet* instruction) OVERRIDE; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc deleted file mode 100644 index 45d196fa6d..0000000000 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (C) 2015 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "instruction_simplifier_shared.h" - -namespace art { - -namespace { - -bool TrySimpleMultiplyAccumulatePatterns(HMul* mul, - HBinaryOperation* input_binop, - HInstruction* input_other) { - DCHECK(Primitive::IsIntOrLongType(mul->GetType())); - DCHECK(input_binop->IsAdd() || input_binop->IsSub()); - DCHECK_NE(input_binop, input_other); - if (!input_binop->HasOnlyOneNonEnvironmentUse()) { - return false; - } - - // Try to interpret patterns like - // a * (b <+/-> 1) - // as - // (a * b) <+/-> a - HInstruction* input_a = input_other; - HInstruction* input_b = nullptr; // Set to a non-null value if we found a pattern to optimize. 
- HInstruction::InstructionKind op_kind; - - if (input_binop->IsAdd()) { - if ((input_binop->GetConstantRight() != nullptr) && input_binop->GetConstantRight()->IsOne()) { - // Interpret - // a * (b + 1) - // as - // (a * b) + a - input_b = input_binop->GetLeastConstantLeft(); - op_kind = HInstruction::kAdd; - } - } else { - DCHECK(input_binop->IsSub()); - if (input_binop->GetRight()->IsConstant() && - input_binop->GetRight()->AsConstant()->IsMinusOne()) { - // Interpret - // a * (b - (-1)) - // as - // a + (a * b) - input_b = input_binop->GetLeft(); - op_kind = HInstruction::kAdd; - } else if (input_binop->GetLeft()->IsConstant() && - input_binop->GetLeft()->AsConstant()->IsOne()) { - // Interpret - // a * (1 - b) - // as - // a - (a * b) - input_b = input_binop->GetRight(); - op_kind = HInstruction::kSub; - } - } - - if (input_b == nullptr) { - // We did not find a pattern we can optimize. - return false; - } - - ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena(); - HMultiplyAccumulate* mulacc = new(arena) HMultiplyAccumulate( - mul->GetType(), op_kind, input_a, input_a, input_b, mul->GetDexPc()); - - mul->GetBlock()->ReplaceAndRemoveInstructionWith(mul, mulacc); - input_binop->GetBlock()->RemoveInstruction(input_binop); - - return true; -} - -} // namespace - -bool TryCombineMultiplyAccumulate(HMul* mul, InstructionSet isa) { - Primitive::Type type = mul->GetType(); - switch (isa) { - case kArm: - case kThumb2: - if (type != Primitive::kPrimInt) { - return false; - } - break; - case kArm64: - if (!Primitive::IsIntOrLongType(type)) { - return false; - } - break; - default: - return false; - } - - HInstruction* use = mul->HasNonEnvironmentUses() - ? mul->GetUses().GetFirst()->GetUser() - : nullptr; - - ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena(); - - if (mul->HasOnlyOneNonEnvironmentUse()) { - if (use->IsAdd() || use->IsSub()) { - // Replace code looking like - // MUL tmp, x, y - // SUB dst, acc, tmp - // with - // MULSUB dst, acc, x, y - // Note that we do not want to (unconditionally) perform the merge when the - // multiplication has multiple uses and it can be merged in all of them. - // Multiple uses could happen on the same control-flow path, and we would - // then increase the amount of work. In the future we could try to evaluate - // whether all uses are on different control-flow paths (using dominance and - // reverse-dominance information) and only perform the merge when they are. - HInstruction* accumulator = nullptr; - HBinaryOperation* binop = use->AsBinaryOperation(); - HInstruction* binop_left = binop->GetLeft(); - HInstruction* binop_right = binop->GetRight(); - // Be careful after GVN. This should not happen since the `HMul` has only - // one use. 
- DCHECK_NE(binop_left, binop_right); - if (binop_right == mul) { - accumulator = binop_left; - } else if (use->IsAdd()) { - DCHECK_EQ(binop_left, mul); - accumulator = binop_right; - } - - if (accumulator != nullptr) { - HMultiplyAccumulate* mulacc = - new (arena) HMultiplyAccumulate(type, - binop->GetKind(), - accumulator, - mul->GetLeft(), - mul->GetRight()); - - binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); - DCHECK(!mul->HasUses()); - mul->GetBlock()->RemoveInstruction(mul); - return true; - } - } else if (use->IsNeg() && isa != kArm) { - HMultiplyAccumulate* mulacc = - new (arena) HMultiplyAccumulate(type, - HInstruction::kSub, - mul->GetBlock()->GetGraph()->GetConstant(type, 0), - mul->GetLeft(), - mul->GetRight()); - - use->GetBlock()->ReplaceAndRemoveInstructionWith(use, mulacc); - DCHECK(!mul->HasUses()); - mul->GetBlock()->RemoveInstruction(mul); - return true; - } - } - - // Use multiply accumulate instruction for a few simple patterns. - // We prefer not applying the following transformations if the left and - // right inputs perform the same operation. - // We rely on GVN having squashed the inputs if appropriate. However the - // results are still correct even if that did not happen. - if (mul->GetLeft() == mul->GetRight()) { - return false; - } - - HInstruction* left = mul->GetLeft(); - HInstruction* right = mul->GetRight(); - if ((right->IsAdd() || right->IsSub()) && - TrySimpleMultiplyAccumulatePatterns(mul, right->AsBinaryOperation(), left)) { - return true; - } - if ((left->IsAdd() || left->IsSub()) && - TrySimpleMultiplyAccumulatePatterns(mul, left->AsBinaryOperation(), right)) { - return true; - } - return false; -} - -} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h deleted file mode 100644 index 9832ecc058..0000000000 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2015 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ -#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ - -#include "nodes.h" - -namespace art { - -bool TryCombineMultiplyAccumulate(HMul* mul, InstructionSet isa); - -} // namespace art - -#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index c5688a3ea2..8cf2d4f393 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -752,21 +752,33 @@ static void GenUnsafeGet(HInvoke* invoke, Register trg = RegisterFrom(trg_loc, type); bool use_acquire_release = codegen->GetInstructionSetFeatures().PreferAcquireRelease(); - MemOperand mem_op(base.X(), offset); - if (is_volatile) { - if (use_acquire_release) { - codegen->LoadAcquire(invoke, trg, mem_op); - } else { - codegen->Load(type, trg, mem_op); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case. + UseScratchRegisterScope temps(masm); + Register temp = temps.AcquireW(); + codegen->GenerateArrayLoadWithBakerReadBarrier( + invoke, trg_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false); + if (is_volatile && !use_acquire_release) { __ Dmb(InnerShareable, BarrierReads); } } else { - codegen->Load(type, trg, mem_op); - } + // Other cases. + MemOperand mem_op(base.X(), offset); + if (is_volatile) { + if (use_acquire_release) { + codegen->LoadAcquire(invoke, trg, mem_op, /* needs_null_check */ true); + } else { + codegen->Load(type, trg, mem_op); + __ Dmb(InnerShareable, BarrierReads); + } + } else { + codegen->Load(type, trg, mem_op); + } - if (type == Primitive::kPrimNot) { - DCHECK(trg.IsW()); - codegen->MaybeGenerateReadBarrier(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc); + if (type == Primitive::kPrimNot) { + DCHECK(trg.IsW()); + codegen->MaybeGenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc); + } } } @@ -1026,10 +1038,15 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat vixl::Label loop_head, exit_loop; if (use_acquire_release) { __ Bind(&loop_head); - __ Ldaxr(tmp_value, MemOperand(tmp_ptr)); - // TODO: Do we need a read barrier here when `type == Primitive::kPrimNot`? + // TODO: When `type == Primitive::kPrimNot`, add a read barrier for + // the reference stored in the object before attempting the CAS, + // similar to the one in the art::Unsafe_compareAndSwapObject JNI + // implementation. + // // Note that this code is not (yet) used when read barriers are // enabled (see IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject). + DCHECK(!(type == Primitive::kPrimNot && kEmitCompilerReadBarrier)); + __ Ldaxr(tmp_value, MemOperand(tmp_ptr)); __ Cmp(tmp_value, expected); __ B(&exit_loop, ne); __ Stlxr(tmp_32, value, MemOperand(tmp_ptr)); diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index cb7bc58b0c..adf8734214 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -288,9 +288,10 @@ void HGraph::SimplifyLoop(HBasicBlock* header) { // Make sure the loop has only one pre header. This simplifies SSA building by having // to just look at the pre header to know which locals are initialized at entry of the - // loop. + // loop. Also, don't allow the entry block to be a pre header: this simplifies inlining + // this graph. 
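+ // For instance, a method whose code starts directly in a loop goes from + // entry -> header (loop) + // to + // entry -> pre_header -> header (loop).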
size_t number_of_incomings = header->GetPredecessors().size() - info->NumberOfBackEdges(); - if (number_of_incomings != 1) { + if (number_of_incomings != 1 || (GetEntryBlock()->GetSingleSuccessor() == header)) { HBasicBlock* pre_header = new (arena_) HBasicBlock(this, header->GetDexPc()); AddBlock(pre_header); pre_header->AddInstruction(new (arena_) HGoto(header->GetDexPc())); @@ -1837,6 +1838,7 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { DCHECK(GetBlocks()[0]->IsEntryBlock()); DCHECK(GetBlocks()[2]->IsExitBlock()); DCHECK(!body->IsExitBlock()); + DCHECK(!body->IsInLoop()); HInstruction* last = body->GetLastInstruction(); invoke->GetBlock()->instructions_.AddAfter(invoke, body->GetInstructions()); @@ -1895,7 +1897,7 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { // Update the meta information surrounding blocks: // (1) the graph they are now in, // (2) the reverse post order of that graph, - // (3) the potential loop information they are now in, + // (3) their potential loop information, inner and outer, // (4) try block membership. // Note that we do not need to update catch phi inputs because they // correspond to the register file of the outer method which the inlinee @@ -1924,15 +1926,24 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { for (HReversePostOrderIterator it(*this); !it.Done(); it.Advance()) { HBasicBlock* current = it.Current(); if (current != exit_block_ && current != entry_block_ && current != first) { - DCHECK(!current->IsInLoop()); DCHECK(current->GetTryCatchInformation() == nullptr); DCHECK(current->GetGraph() == this); current->SetGraph(outer_graph); outer_graph->AddBlock(current); outer_graph->reverse_post_order_[++index_of_at] = current; - if (loop_info != nullptr) { + if (!current->IsInLoop()) { current->SetLoopInformation(loop_info); - for (HLoopInformationOutwardIterator loop_it(*at); !loop_it.Done(); loop_it.Advance()) { + } else if (current->IsLoopHeader()) { + // Clear the information of which blocks are contained in that loop. Since the + // information is stored as a bit vector based on block ids, we have to update + // it, as those block ids were specific to the callee graph and we are now adding + // these blocks to the caller graph. + current->GetLoopInformation()->ClearAllBlocks(); + } + if (current->IsInLoop()) { + for (HLoopInformationOutwardIterator loop_it(*current); + !loop_it.Done(); + loop_it.Advance()) { loop_it.Current()->Add(current); } } @@ -1945,7 +1956,9 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { outer_graph->AddBlock(to); outer_graph->reverse_post_order_[++index_of_at] = to; if (loop_info != nullptr) { - to->SetLoopInformation(loop_info); + if (!to->IsInLoop()) { + to->SetLoopInformation(loop_info); + } for (HLoopInformationOutwardIterator loop_it(*at); !loop_it.Done(); loop_it.Advance()) { loop_it.Current()->Add(to); } @@ -2208,7 +2221,10 @@ void HInvoke::SetIntrinsic(Intrinsics intrinsic, SetSideEffects(GetSideEffects().Union(SideEffects::CanTriggerGC())); } // Adjust method's exception status from intrinsic table. 
- SetCanThrow(exceptions == kCanThrow); + switch (exceptions) { + case kNoThrow: SetCanThrow(false); break; + case kCanThrow: SetCanThrow(true); break; + } } bool HNewInstance::IsStringAlloc() const { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 57fa558129..5246fd1f05 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -689,6 +689,10 @@ class HLoopInformation : public ArenaObject<kArenaAllocLoopInfo> { void Add(HBasicBlock* block); void Remove(HBasicBlock* block); + void ClearAllBlocks() { + blocks_.ClearAllBits(); + } + private: // Internal recursive implementation of `Populate`. void PopulateRecursive(HBasicBlock* block); @@ -1226,16 +1230,6 @@ class HLoopInformationOutwardIterator : public ValueObject { M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ -/* - * Instructions, shared across several (not all) architectures. - */ -#if !defined(ART_ENABLE_CODEGEN_arm) && !defined(ART_ENABLE_CODEGEN_arm64) -#define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M) -#else -#define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M) \ - M(MultiplyAccumulate, Instruction) -#endif - #ifndef ART_ENABLE_CODEGEN_arm #define FOR_EACH_CONCRETE_INSTRUCTION_ARM(M) #else @@ -1248,7 +1242,8 @@ class HLoopInformationOutwardIterator : public ValueObject { #else #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) \ M(Arm64DataProcWithShifterOp, Instruction) \ - M(Arm64IntermediateAddress, Instruction) + M(Arm64IntermediateAddress, Instruction) \ + M(Arm64MultiplyAccumulate, Instruction) #endif #define FOR_EACH_CONCRETE_INSTRUCTION_MIPS(M) @@ -1268,7 +1263,6 @@ class HLoopInformationOutwardIterator : public ValueObject { #define FOR_EACH_CONCRETE_INSTRUCTION(M) \ FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ - FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M) \ FOR_EACH_CONCRETE_INSTRUCTION_ARM(M) \ FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M) \ FOR_EACH_CONCRETE_INSTRUCTION_MIPS(M) \ @@ -5728,9 +5722,6 @@ class HParallelMove : public HTemplateInstruction<0> { } // namespace art -#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) -#include "nodes_shared.h" -#endif #ifdef ART_ENABLE_CODEGEN_arm #include "nodes_arm.h" #endif diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h index 173852a55d..445cdab191 100644 --- a/compiler/optimizing/nodes_arm64.h +++ b/compiler/optimizing/nodes_arm64.h @@ -118,6 +118,40 @@ class HArm64IntermediateAddress : public HExpression<2> { DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress); }; +class HArm64MultiplyAccumulate : public HExpression<3> { + public: + HArm64MultiplyAccumulate(Primitive::Type type, + InstructionKind op, + HInstruction* accumulator, + HInstruction* mul_left, + HInstruction* mul_right, + uint32_t dex_pc = kNoDexPc) + : HExpression(type, SideEffects::None(), dex_pc), op_kind_(op) { + SetRawInputAt(kInputAccumulatorIndex, accumulator); + SetRawInputAt(kInputMulLeftIndex, mul_left); + SetRawInputAt(kInputMulRightIndex, mul_right); + } + + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(HInstruction* other) const OVERRIDE { + return op_kind_ == other->AsArm64MultiplyAccumulate()->op_kind_; + } + + InstructionKind GetOpKind() const { return op_kind_; } + + DECLARE_INSTRUCTION(Arm64MultiplyAccumulate); + + private: + // Indicates if this is a MADD or MSUB. 
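+ // (HInstruction::kAdd selects MADD, HInstruction::kSub selects MSUB.)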
+ InstructionKind op_kind_; + + DISALLOW_COPY_AND_ASSIGN(HArm64MultiplyAccumulate); +}; + } // namespace art #endif // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_ diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h deleted file mode 100644 index b04b622838..0000000000 --- a/compiler/optimizing/nodes_shared.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2015 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ART_COMPILER_OPTIMIZING_NODES_SHARED_H_ -#define ART_COMPILER_OPTIMIZING_NODES_SHARED_H_ - -namespace art { - -class HMultiplyAccumulate : public HExpression<3> { - public: - HMultiplyAccumulate(Primitive::Type type, - InstructionKind op, - HInstruction* accumulator, - HInstruction* mul_left, - HInstruction* mul_right, - uint32_t dex_pc = kNoDexPc) - : HExpression(type, SideEffects::None(), dex_pc), op_kind_(op) { - SetRawInputAt(kInputAccumulatorIndex, accumulator); - SetRawInputAt(kInputMulLeftIndex, mul_left); - SetRawInputAt(kInputMulRightIndex, mul_right); - } - - static constexpr int kInputAccumulatorIndex = 0; - static constexpr int kInputMulLeftIndex = 1; - static constexpr int kInputMulRightIndex = 2; - - bool CanBeMoved() const OVERRIDE { return true; } - bool InstructionDataEquals(HInstruction* other) const OVERRIDE { - return op_kind_ == other->AsMultiplyAccumulate()->op_kind_; - } - - InstructionKind GetOpKind() const { return op_kind_; } - - DECLARE_INSTRUCTION(MultiplyAccumulate); - - private: - // Indicates if this is a MADD or MSUB. 
- const InstructionKind op_kind_; - - DISALLOW_COPY_AND_ASSIGN(HMultiplyAccumulate); -}; - -} // namespace art - -#endif // ART_COMPILER_OPTIMIZING_NODES_SHARED_H_ diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 4da48bdfc3..fffd00535c 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -62,7 +62,6 @@ #include "induction_var_analysis.h" #include "inliner.h" #include "instruction_simplifier.h" -#include "instruction_simplifier_arm.h" #include "intrinsics.h" #include "jit/debugger_interface.h" #include "jit/jit_code_cache.h" @@ -446,11 +445,8 @@ static void RunArchOptimizations(InstructionSet instruction_set, case kThumb2: case kArm: { arm::DexCacheArrayFixups* fixups = new (arena) arm::DexCacheArrayFixups(graph, stats); - arm::InstructionSimplifierArm* simplifier = - new (arena) arm::InstructionSimplifierArm(graph, stats); HOptimization* arm_optimizations[] = { - fixups, - simplifier + fixups }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); break; diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index a966b62b4f..d77639d608 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -1734,6 +1734,12 @@ void RegisterAllocator::ConnectSiblings(LiveInterval* interval) { } } +static bool IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop( + HInstruction* instruction) { + return instruction->GetBlock()->GetGraph()->HasIrreducibleLoops() && + (instruction->IsConstant() || instruction->IsCurrentMethod()); +} + void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval, HBasicBlock* from, HBasicBlock* to) const { @@ -1750,7 +1756,19 @@ void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval, // Interval was not split. return; } - DCHECK(destination != nullptr && source != nullptr); + + LiveInterval* parent = interval->GetParent(); + HInstruction* defined_by = parent->GetDefinedBy(); + if (destination == nullptr) { + // Our live_in fixed point calculation has found that the instruction is live + // in the `to` block because it will eventually enter an irreducible loop. Our + // live interval computation however does not compute a fixed point, and + // therefore will not have a location for that instruction for `to`. + // Because the instruction is a constant or the ArtMethod, we don't need to + // do anything: it will be materialized in the irreducible loop. + DCHECK(IsMaterializableEntryBlockInstructionOfGraphWithIrreducibleLoop(defined_by)); + return; + } if (!destination->HasRegister()) { // Values are eagerly spilled. Spill slot already contains appropriate value. @@ -1761,13 +1779,13 @@ void RegisterAllocator::ConnectSplitSiblings(LiveInterval* interval, // we need to put the moves at the entry of `to`. if (from->GetNormalSuccessors().size() == 1) { InsertParallelMoveAtExitOf(from, - interval->GetParent()->GetDefinedBy(), + defined_by, source->ToLocation(), destination->ToLocation()); } else { DCHECK_EQ(to->GetPredecessors().size(), 1u); InsertParallelMoveAtEntryOf(to, - interval->GetParent()->GetDefinedBy(), + defined_by, source->ToLocation(), destination->ToLocation()); }