Add support for Baker read barriers in SystemArrayCopy intrinsics.

Benchmark score variations (ARM64) on Nexus 5X with CPU
cores clamped at 960000 Hz (aosp_bullhead-userdebug build):
- Ritzperf - average (lower is better):       -3.03% (slightly better)
- CaffeineMark - average (higher is better):  +1.26% (slightly better)
- DeltaBlue (lower is better):               -10.50% (better)
- Richards - average (lower is better):       -3.36% (slightly better)
- SciMark2 - average (higher is better):      +0.26% (virtually unchanged)

Details about Ritzperf benchmarks with meaningful variations
(lower is better):
- FormulaEvaluationActions.EvaluateAndApplyChanges: -13.26% (better)
- FormulaEvaluationActions.EvaluateCascadingSums:   -10.94% (better)
- FormulaEvaluationActions.EvaluateComplexFormulas: -15.50% (better)
- FormulaEvaluationActions.EvaluateFibonacci:       -10.41% (better)
- FormulaEvaluationActions.EvaluateLargeSums:        +6.02% (worse)

Boot image code size variation on Nexus 5X
(aosp_bullhead-userdebug build):
- total ARM64 framework Oat files size change:
  107047632 bytes -> 107154128 bytes (+0.10%)
- total ARM framework Oat files size change:
  90932028 bytes -> 91009852 bytes (+0.09%)

Test: ART host and target (ARM, ARM64) tests + Nexus 5X boot.
Bug: 29516905
Bug: 29506760
Bug: 12687968
Change-Id: I85431368d09965687a0301ae2eb3c991f276ce5d
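
For context, the generated fast path guards the raw copy with a
gray check on the source object's lock word. A minimal C++ sketch
of the equivalent test (illustrative only; it mirrors the
SHR/carry-flag sequence emitted by the x86 intrinsic below and
assumes ReadBarrier::gray_ptr_ == 1, as asserted in the code):

    // Illustrative only: the intrinsic implements this check by
    // shifting the lock word right by (kReadBarrierStateShift + 1)
    // and branching on the carry flag.
    static bool SourceIsGray(uint32_t monitor) {
      return ((monitor >> LockWord::kReadBarrierStateShift) & 1u) != 0;
    }

If the source is gray, the copy is redirected to a slow path that
marks each copied reference via the read barrier entry point;
otherwise the elements are copied raw.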
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 22f4181..49d6c19 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -70,6 +70,105 @@
 
 using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>;
 
+// NOLINT on __ macro to suppress a spurious warning/fix (misc-macro-parentheses) from clang-tidy.
+#define __ down_cast<X86Assembler*>(codegen->GetAssembler())->  // NOLINT
+
+// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
+class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode {
+ public:
+  explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction)
+      : SlowPathCode(instruction) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(instruction_->IsInvokeStaticOrDirect())
+        << "Unexpected instruction in read barrier arraycopy slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
+
+    int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+    uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
+
+    Register src = locations->InAt(0).AsRegister<Register>();
+    Location src_pos = locations->InAt(1);
+    Register dest = locations->InAt(2).AsRegister<Register>();
+    Location dest_pos = locations->InAt(3);
+    Location length = locations->InAt(4);
+    Location temp1_loc = locations->GetTemp(0);
+    Register temp1 = temp1_loc.AsRegister<Register>();
+    Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+    Register temp3 = locations->GetTemp(2).AsRegister<Register>();
+
+    __ Bind(GetEntryLabel());
+    // In this code path, registers `temp1`, `temp2`, and `temp3` are
+    // not used for the base source address, the base destination
+    // address, and the end source address (respectively), as they are
+    // in other SystemArrayCopy intrinsic code paths.  Instead they
+    // are used (respectively) for:
+    // - the loop index (`i`);
+    // - the source index (`src_index`) and the loaded (source)
+    //   reference (`value`); and
+    // - the destination index (`dest_index`).
+
+    // i = 0
+    __ xorl(temp1, temp1);
+    NearLabel loop;
+    __ Bind(&loop);
+    // value = src_array[i + src_pos]
+    if (src_pos.IsConstant()) {
+      int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
+      int32_t adjusted_offset = offset + constant * element_size;
+      __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset));
+    } else {
+      __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
+      __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset));
+    }
+    __ MaybeUnpoisonHeapReference(temp2);
+    // TODO: Inline the mark bit check before calling the runtime?
+    // value = ReadBarrier::Mark(value)
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more
+    // explanations.)
+    DCHECK_NE(temp2, ESP);
+    DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2;
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2);
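+    // Baker read barrier mark entry points are per-register:
+    // ReadBarrierMarkReg<N> both takes and returns the reference in
+    // register N, hence `temp2` is used here to select the
+    // corresponding thread-local entry point.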
+    // This runtime call does not require a stack map.
+    x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    __ MaybePoisonHeapReference(temp2);
+    // dest_array[i + dest_pos] = value
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      int32_t adjusted_offset = offset + constant * element_size;
+      __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2);
+    } else {
+      __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
+      __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2);
+    }
+    // ++i
+    __ addl(temp1, Immediate(1));
+    // if (i != length) goto loop
+    x86_codegen->GenerateIntCompare(temp1_loc, length);
+    __ j(kNotEqual, &loop);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86"; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86);
+};
+
+#undef __
+
 #define __ assembler->
 
 static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) {
@@ -2678,9 +2777,9 @@
 }
 
 void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  if (kEmitCompilerReadBarrier) {
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barrier.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2710,9 +2809,9 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  DCHECK(!kEmitCompilerReadBarrier);
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barrier.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   X86Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -2721,17 +2820,21 @@
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
   Register src = locations->InAt(0).AsRegister<Register>();
   Location src_pos = locations->InAt(1);
   Register dest = locations->InAt(2).AsRegister<Register>();
   Location dest_pos = locations->InAt(3);
-  Location length = locations->InAt(4);
-  Register temp1 = locations->GetTemp(0).AsRegister<Register>();
-  Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+  Location length_arg = locations->InAt(4);
+  Location length = length_arg;
+  Location temp1_loc = locations->GetTemp(0);
+  Register temp1 = temp1_loc.AsRegister<Register>();
+  Location temp2_loc = locations->GetTemp(1);
+  Register temp2 = temp2_loc.AsRegister<Register>();
 
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen_->AddSlowPath(intrinsic_slow_path);
 
   NearLabel conditions_on_positions_validated;
   SystemArrayCopyOptimizations optimizations(invoke);
@@ -2747,7 +2850,7 @@
         DCHECK_GE(src_pos_constant, dest_pos_constant);
       } else if (src_pos_constant < dest_pos_constant) {
         __ cmpl(src, dest);
-        __ j(kEqual, slow_path->GetEntryLabel());
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
       }
     } else {
       if (!optimizations.GetDestinationIsSource()) {
@@ -2755,7 +2858,7 @@
         __ j(kNotEqual, &conditions_on_positions_validated);
       }
       __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant));
-      __ j(kGreater, slow_path->GetEntryLabel());
+      __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
     }
   } else {
     if (!optimizations.GetDestinationIsSource()) {
@@ -2765,10 +2868,10 @@
     if (dest_pos.IsConstant()) {
       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
       __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant));
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     } else {
       __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>());
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     }
   }
 
@@ -2777,16 +2880,17 @@
   if (!optimizations.GetSourceIsNotNull()) {
     // Bail out if the source is null.
     __ testl(src, src);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
     // Bail out if the destination is null.
     __ testl(dest, dest);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
-  Register temp3 = locations->GetTemp(2).AsRegister<Register>();
+  Location temp3_loc = locations->GetTemp(2);
+  Register temp3 = temp3_loc.AsRegister<Register>();
   if (length.IsStackSlot()) {
     __ movl(temp3, Address(ESP, length.GetStackIndex()));
     length = Location::RegisterLocation(temp3);
@@ -2798,7 +2902,7 @@
       !optimizations.GetCountIsSourceLength() &&
       !optimizations.GetCountIsDestinationLength()) {
     __ testl(length.AsRegister<Register>(), length.AsRegister<Register>());
-    __ j(kLess, slow_path->GetEntryLabel());
+    __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   }
 
   // Validity checks: source.
@@ -2806,7 +2910,7 @@
                 src_pos,
                 src,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsSourceLength());
 
@@ -2815,7 +2919,7 @@
                 dest_pos,
                 dest,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsDestinationLength());
 
@@ -2824,72 +2928,159 @@
     // type of the destination array. We do two checks: the classes are the same,
     // or the destination is Object[]. If none of these checks succeed, we go to the
     // slow path.
+
     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(src, class_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      // Bail out if the source is not a non primitive array.
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ movl(temp1, Address(temp1, component_offset));
-      __ testl(temp1, temp1);
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp1);
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+        __ testl(temp1, temp1);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp1` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+      } else {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        __ movl(temp1, Address(src, class_offset));
+        __ MaybeUnpoisonHeapReference(temp1);
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ movl(temp1, Address(temp1, component_offset));
+        __ testl(temp1, temp1);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp1);
+      }
       __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
 
-    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(dest, class_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      // Bail out if the destination is not a non primitive array.
-      // /* HeapReference<Class> */ temp2 = temp1->component_type_
-      __ movl(temp2, Address(temp1, component_offset));
-      __ testl(temp2, temp2);
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp2);
-      __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
-      // Re-poison the heap reference to make the compare instruction below
-      // compare two poisoned references.
-      __ PoisonHeapReference(temp1);
-    } else {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(dest, class_offset));
-    }
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (length.Equals(Location::RegisterLocation(temp3))) {
+        // When Baker read barriers are enabled, register `temp3`,
+        // which in the present case contains the `length` parameter,
+        // will be overwritten below.  Make the `length` location
+        // reference the original stack location; it will be moved
+        // back to `temp3` later if necessary.
+        DCHECK(length_arg.IsStackSlot());
+        length = length_arg;
+      }
 
-    // Note: if poisoning is on, we are here comparing two poisoned references.
-    __ cmpl(temp1, Address(src, class_offset));
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false);
 
-    if (optimizations.GetDestinationIsTypedObjectArray()) {
-      NearLabel do_copy;
-      __ j(kEqual, &do_copy);
-      __ MaybeUnpoisonHeapReference(temp1);
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ movl(temp1, Address(temp1, component_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      __ cmpl(Address(temp1, super_offset), Immediate(0));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
-      __ Bind(&do_copy);
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        // Bail out if the destination is not a non primitive array.
+        //
+        // Register `temp1` is not trashed by the read barrier emitted
+        // by GenerateFieldLoadWithBakerReadBarrier below, as that
+        // method produces a call to a ReadBarrierMarkRegX entry point,
+        // which saves all potentially live registers, including
+        // temporaries such as `temp1`.
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+        __ testl(temp2, temp2);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp2` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+        __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
+
+      // For the same reason given earlier, `temp1` is not trashed by the
+      // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
+      // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
+      __ cmpl(temp1, temp2);
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        NearLabel do_copy;
+        __ j(kEqual, &do_copy);
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+        // We do not need to emit a read barrier for the following
+        // heap reference load, as `temp1` is only used in a
+        // comparison with null below, and this reference is not
+        // kept afterwards.
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
     } else {
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      // Non read barrier code.
+
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      __ movl(temp1, Address(dest, class_offset));
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        __ MaybeUnpoisonHeapReference(temp1);
+        // Bail out if the destination is not a non primitive array.
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        __ movl(temp2, Address(temp1, component_offset));
+        __ testl(temp2, temp2);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp2);
+        __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        // Re-poison the heap reference to make the compare instruction below
+        // compare two poisoned references.
+        __ PoisonHeapReference(temp1);
+      }
+
+      // Note: if heap poisoning is on, we are comparing two poisoned references here.
+      __ cmpl(temp1, Address(src, class_offset));
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        NearLabel do_copy;
+        __ j(kEqual, &do_copy);
+        __ MaybeUnpoisonHeapReference(temp1);
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ movl(temp1, Address(temp1, component_offset));
+        __ MaybeUnpoisonHeapReference(temp1);
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
     }
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    // /* HeapReference<Class> */ temp1 = src->klass_
-    __ movl(temp1, Address(src, class_offset));
-    __ MaybeUnpoisonHeapReference(temp1);
-    // /* HeapReference<Class> */ temp1 = temp1->component_type_
-    __ movl(temp1, Address(temp1, component_offset));
-    __ testl(temp1, temp1);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    __ MaybeUnpoisonHeapReference(temp1);
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+      // /* HeapReference<Class> */ temp1 = temp1->component_type_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+      __ testl(temp1, temp1);
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+      // If heap poisoning is enabled, `temp1` has been unpoisoned
+      // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+    } else {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      __ movl(temp1, Address(src, class_offset));
+      __ MaybeUnpoisonHeapReference(temp1);
+      // /* HeapReference<Class> */ temp1 = temp1->component_type_
+      __ movl(temp1, Address(temp1, component_offset));
+      __ testl(temp1, temp1);
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+      __ MaybeUnpoisonHeapReference(temp1);
+    }
     __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
-  // Compute base source address, base destination address, and end source address.
+  // Compute the base source address in `temp1`.
   int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
   DCHECK_EQ(element_size, 4);
   uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
@@ -2900,35 +3091,136 @@
     __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
   }
 
-  if (dest_pos.IsConstant()) {
-    int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(temp2, Address(dest, element_size * constant + offset));
-  } else {
-    __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
-  }
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // If it is needed (in the case of the fast-path loop), the base
+    // destination address is computed later, as `temp2` is used for
+    // intermediate computations.
 
-  if (length.IsConstant()) {
-    int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(temp3, Address(temp1, element_size * constant));
-  } else {
-    __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
-  }
+    // Compute the end source address in `temp3`.
+    if (length.IsConstant()) {
+      int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp3, Address(temp1, element_size * constant));
+    } else {
+      if (length.IsStackSlot()) {
+        // Location `length` is again pointing at a stack slot, as
+        // register `temp3` (which held the length parameter earlier)
+        // has been overwritten; reload the length into `temp3` now.
+        DCHECK(length.Equals(length_arg));
+        __ movl(temp3, Address(ESP, length.GetStackIndex()));
+        length = Location::RegisterLocation(temp3);
+      }
+      __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
+    }
 
-  // Iterate over the arrays and do a raw copy of the objects. We don't need to
-  // poison/unpoison.
-  NearLabel loop, done;
-  __ cmpl(temp1, temp3);
-  __ j(kEqual, &done);
-  __ Bind(&loop);
-  __ pushl(Address(temp1, 0));
-  __ cfi().AdjustCFAOffset(4);
-  __ popl(Address(temp2, 0));
-  __ cfi().AdjustCFAOffset(-4);
-  __ addl(temp1, Immediate(element_size));
-  __ addl(temp2, Immediate(element_size));
-  __ cmpl(temp1, temp3);
-  __ j(kNotEqual, &loop);
-  __ Bind(&done);
+    // SystemArrayCopy implementation for Baker read barriers (see
+    // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier):
+    //
+    //   if (src_ptr != end_ptr) {
+    //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
+    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+    //     if (is_gray) {
+    //       // Slow-path copy.
+    //       for (size_t i = 0; i != length; ++i) {
+    //         dest_array[dest_pos + i] =
+    //             MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i])));
+    //       }
+    //     } else {
+    //       // Fast-path copy.
+    //       do {
+    //         *dest_ptr++ = *src_ptr++;
+    //       } while (src_ptr != end_ptr);
+    //     }
+    //   }
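+    //
+    // A non-gray source object's reference fields do not need to be
+    // processed by the read barrier, so they can be copied raw.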
+
+    NearLabel loop, done;
+
+    // Don't enter copy loop if `length == 0`.
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+
+    // /* int32_t */ monitor = src->monitor_
+    __ movl(temp2, Address(src, monitor_offset));
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Load fence to prevent load-load reordering.
+    // Note that this is a no-op, thanks to the x86 memory model.
+    codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
+
+    // Slow path used to copy array when `src` is gray.
+    SlowPathCode* read_barrier_slow_path =
+        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke);
+    codegen_->AddSlowPath(read_barrier_slow_path);
+
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with SHR.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    __ shrl(temp2, Immediate(LockWord::kReadBarrierStateShift + 1));
+    __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel());
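+    // The SHR above moved the least significant bit of the read
+    // barrier state into the carry flag; the branch is thus taken
+    // only when `src` is gray (rb_state == 1).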
+
+    // Fast-path copy.
+
+    // Set the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp2, Address(dest, element_size * constant + offset));
+    } else {
+      __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    __ Bind(&loop);
+    __ pushl(Address(temp1, 0));
+    __ cfi().AdjustCFAOffset(4);
+    __ popl(Address(temp2, 0));
+    __ cfi().AdjustCFAOffset(-4);
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+
+    __ Bind(read_barrier_slow_path->GetExitLabel());
+    __ Bind(&done);
+  } else {
+    // Non read barrier code.
+
+    // Compute the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp2, Address(dest, element_size * constant + offset));
+    } else {
+      __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
+    }
+
+    // Compute the end source address in `temp3`.
+    if (length.IsConstant()) {
+      int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp3, Address(temp1, element_size * constant));
+    } else {
+      __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    NearLabel loop, done;
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+    __ Bind(&loop);
+    __ pushl(Address(temp1, 0));
+    __ cfi().AdjustCFAOffset(4);
+    __ popl(Address(temp2, 0));
+    __ cfi().AdjustCFAOffset(-4);
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+    __ Bind(&done);
+  }
 
   // We only need one card marking on the destination array.
   codegen_->MarkGCCard(temp1,
@@ -2937,7 +3229,7 @@
                        Register(kNoRegister),
                        /* value_can_be_null */ false);
 
-  __ Bind(slow_path->GetExitLabel());
+  __ Bind(intrinsic_slow_path->GetExitLabel());
 }
 
 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)