Fix the artificial dependency in ARM/ARM64 SystemArrayCopy intrinsics.

Ensure that the base source address register (`src_curr_addr`) is
assigned from the `src` register after `src` is made dependent on the
lock word of `src`.

Before this CL, the artificial (or "fake") dependency of `src` on
`tmp` was present, but because `src_curr_addr` was computed from `src`
*before* that artificial dependency was introduced, the dependency was
basically useless. This could explain some stale reference bugs.

After this CL, `src_curr_addr` is computed from `src` *after* the
artificial dependency.

Test: m test-art-target on Baker read barrier configuration.
Bug: 12687968
Change-Id: If30a2e35ca04b0b6d054e2a6c6b1e9c6879cf4a9
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 987a2af..98b80f5 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1967,101 +1967,108 @@
     __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   }
 
-  const Primitive::Type type = Primitive::kPrimNot;
-  const int32_t element_size = Primitive::ComponentSize(type);
-
-  // Compute the base source address in `temp1`.
-  GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
-  // Compute the end source address in `temp3`.
-  GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
-  // The base destination address is computed later, as `temp2` is
-  // used for intermediate computations.
-
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
-    // SystemArrayCopy implementation for Baker read barriers (see
-    // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
-    //
-    //   if (src_ptr != end_ptr) {
-    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
-    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-    //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-    //     if (is_gray) {
-    //       // Slow-path copy.
-    //       do {
-    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
-    //       } while (src_ptr != end_ptr)
-    //     } else {
-    //       // Fast-path copy.
-    //       do {
-    //         *dest_ptr++ = *src_ptr++;
-    //       } while (src_ptr != end_ptr)
-    //     }
-    //   }
-
-    Label loop, done;
-
-    // Don't enter copy loop if `length == 0`.
-    __ cmp(temp1, ShifterOperand(temp3));
-    __ b(&done, EQ);
-
-    // /* int32_t */ monitor = src->monitor_
-    __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset);
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
-
-    // Introduce a dependency on the lock_word including the rb_state,
-    // which shall prevent load-load reordering without using
-    // a memory barrier (which would be more expensive).
-    // `src` is unchanged by this operation, but its value now depends
-    // on `temp2`.
-    __ add(src, src, ShifterOperand(temp2, LSR, 32));
-
-    // Slow path used to copy array when `src` is gray.
-    // Note that the base destination address is computed in `temp2`
-    // by the slow path code.
-    SlowPathCode* read_barrier_slow_path =
-        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke);
-    codegen_->AddSlowPath(read_barrier_slow_path);
-
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with LSRS
-    // which can be a 16-bit instruction unlike the TST immediate.
-    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-    __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
-    // Carry flag is the last bit shifted out by LSRS.
-    __ b(read_barrier_slow_path->GetEntryLabel(), CS);
-
-    // Fast-path copy.
-    // Compute the base destination address in `temp2`.
-    GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
-    // Iterate over the arrays and do a raw copy of the objects. We don't need to
-    // poison/unpoison.
-    __ Bind(&loop);
-    __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
-    __ str(IP, Address(temp2, element_size, Address::PostIndex));
-    __ cmp(temp1, ShifterOperand(temp3));
-    __ b(&loop, NE);
-
-    __ Bind(read_barrier_slow_path->GetExitLabel());
-    __ Bind(&done);
+  if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) {
+    // Null constant length: no need to emit the loop code at all.
   } else {
-    // Non read barrier code.
-    // Compute the base destination address in `temp2`.
-    GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
-    // Iterate over the arrays and do a raw copy of the objects. We don't need to
-    // poison/unpoison.
-    Label loop, done;
-    __ cmp(temp1, ShifterOperand(temp3));
-    __ b(&done, EQ);
-    __ Bind(&loop);
-    __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
-    __ str(IP, Address(temp2, element_size, Address::PostIndex));
-    __ cmp(temp1, ShifterOperand(temp3));
-    __ b(&loop, NE);
+    Label done;
+    const Primitive::Type type = Primitive::kPrimNot;
+    const int32_t element_size = Primitive::ComponentSize(type);
+
+    if (length.IsRegister()) {
+      // Don't enter the copy loop if the length is null.
+      __ CompareAndBranchIfZero(length.AsRegister<Register>(), &done);
+    }
+
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
+      // SystemArrayCopy implementation for Baker read barriers (see
+      // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
+      //
+      //   uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+      //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+      //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+      //   if (is_gray) {
+      //     // Slow-path copy.
+      //     do {
+      //       *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+      //     } while (src_ptr != end_ptr)
+      //   } else {
+      //     // Fast-path copy.
+      //     do {
+      //       *dest_ptr++ = *src_ptr++;
+      //     } while (src_ptr != end_ptr)
+      //   }
+
+      // /* int32_t */ monitor = src->monitor_
+      __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset);
+      // /* LockWord */ lock_word = LockWord(monitor)
+      static_assert(sizeof(LockWord) == sizeof(int32_t),
+                    "art::LockWord and int32_t have different sizes.");
+
+      // Introduce a dependency on the lock_word including the rb_state,
+      // which shall prevent load-load reordering without using
+      // a memory barrier (which would be more expensive).
+      // `src` is unchanged by this operation, but its value now depends
+      // on `temp2`.
+      __ add(src, src, ShifterOperand(temp2, LSR, 32));
+
+      // Compute the base source address in `temp1`.
+      // Note that `temp1` (the base source address) is computed from
+      // `src` (and `src_pos`) here, and thus honors the artificial
+      // dependency of `src` on `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
+      // Compute the end source address in `temp3`.
+      GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
+      // The base destination address is computed later, as `temp2` is
+      // used for intermediate computations.
+
+      // Slow path used to copy array when `src` is gray.
+      // Note that the base destination address is computed in `temp2`
+      // by the slow path code.
+      SlowPathCode* read_barrier_slow_path =
+          new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke);
+      codegen_->AddSlowPath(read_barrier_slow_path);
+
+      // Given the numeric representation, it's enough to check the low bit of the
+      // rb_state. We do that by shifting the bit out of the lock word with LSRS
+      // which can be a 16-bit instruction unlike the TST immediate.
+      static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+      static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+      __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
+      // Carry flag is the last bit shifted out by LSRS.
+      __ b(read_barrier_slow_path->GetEntryLabel(), CS);
+
+      // Fast-path copy.
+      // Compute the base destination address in `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      Label loop;
+      __ Bind(&loop);
+      __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
+      __ str(IP, Address(temp2, element_size, Address::PostIndex));
+      __ cmp(temp1, ShifterOperand(temp3));
+      __ b(&loop, NE);
+
+      __ Bind(read_barrier_slow_path->GetExitLabel());
+    } else {
+      // Non read barrier code.
+      // Compute the base source address in `temp1`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
+      // Compute the base destination address in `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
+      // Compute the end source address in `temp3`.
+      GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      Label loop;
+      __ Bind(&loop);
+      __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
+      __ str(IP, Address(temp2, element_size, Address::PostIndex));
+      __ cmp(temp1, ShifterOperand(temp3));
+      __ b(&loop, NE);
+    }
     __ Bind(&done);
   }
 
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index b012608..423fd3c 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2716,111 +2716,127 @@
       __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
     }
 
-    const Primitive::Type type = Primitive::kPrimNot;
-    const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
-
-    Register src_curr_addr = temp1.X();
-    Register dst_curr_addr = temp2.X();
-    Register src_stop_addr = temp3.X();
-
-    // Compute base source address, base destination address, and end
-    // source address in `src_curr_addr`, `dst_curr_addr` and
-    // `src_stop_addr` respectively.
-    GenSystemArrayCopyAddresses(masm,
-                                type,
-                                src,
-                                src_pos,
-                                dest,
-                                dest_pos,
-                                length,
-                                src_curr_addr,
-                                dst_curr_addr,
-                                src_stop_addr);
-
-    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-      // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
-      // SystemArrayCopy implementation for Baker read barriers (see
-      // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
-      //
-      //   if (src_ptr != end_ptr) {
-      //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
-      //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-      //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-      //     if (is_gray) {
-      //       // Slow-path copy.
-      //       do {
-      //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
-      //       } while (src_ptr != end_ptr)
-      //     } else {
-      //       // Fast-path copy.
-      //       do {
-      //         *dest_ptr++ = *src_ptr++;
-      //       } while (src_ptr != end_ptr)
-      //     }
-      //   }
-
-      vixl::aarch64::Label loop, done;
-
-      // Don't enter copy loop if `length == 0`.
-      __ Cmp(src_curr_addr, src_stop_addr);
-      __ B(&done, eq);
-
-      // Make sure `tmp` is not IP0, as it is clobbered by
-      // ReadBarrierMarkRegX entry points in
-      // ReadBarrierSystemArrayCopySlowPathARM64.
-      temps.Exclude(ip0);
-      Register tmp = temps.AcquireW();
-      DCHECK_NE(LocationFrom(tmp).reg(), IP0);
-
-      // /* int32_t */ monitor = src->monitor_
-      __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
-      // /* LockWord */ lock_word = LockWord(monitor)
-      static_assert(sizeof(LockWord) == sizeof(int32_t),
-                    "art::LockWord and int32_t have different sizes.");
-
-      // Introduce a dependency on the lock_word including rb_state,
-      // to prevent load-load reordering, and without using
-      // a memory barrier (which would be more expensive).
-      // `src` is unchanged by this operation, but its value now depends
-      // on `tmp`.
-      __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
-
-      // Slow path used to copy array when `src` is gray.
-      SlowPathCodeARM64* read_barrier_slow_path =
-          new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp));
-      codegen_->AddSlowPath(read_barrier_slow_path);
-
-      // Given the numeric representation, it's enough to check the low bit of the rb_state.
-      static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-      static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-      __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
-
-      // Fast-path copy.
-      // Iterate over the arrays and do a raw copy of the objects. We don't need to
-      // poison/unpoison.
-      __ Bind(&loop);
-      __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
-      __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
-      __ Cmp(src_curr_addr, src_stop_addr);
-      __ B(&loop, ne);
-
-      __ Bind(read_barrier_slow_path->GetExitLabel());
-      __ Bind(&done);
+    if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) {
+      // Null constant length: no need to emit the loop code at all.
     } else {
-      // Non read barrier code.
-      // Iterate over the arrays and do a raw copy of the objects. We don't need to
-      // poison/unpoison.
-      vixl::aarch64::Label loop, done;
-      __ Bind(&loop);
-      __ Cmp(src_curr_addr, src_stop_addr);
-      __ B(&done, eq);
-      {
+      Register src_curr_addr = temp1.X();
+      Register dst_curr_addr = temp2.X();
+      Register src_stop_addr = temp3.X();
+      vixl::aarch64::Label done;
+      const Primitive::Type type = Primitive::kPrimNot;
+      const int32_t element_size = Primitive::ComponentSize(type);
+
+      if (length.IsRegister()) {
+        // Don't enter the copy loop if the length is null.
+        __ Cbz(WRegisterFrom(length), &done);
+      }
+
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
+        // SystemArrayCopy implementation for Baker read barriers (see
+        // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
+        //
+        //   uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+        //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+        //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+        //   if (is_gray) {
+        //     // Slow-path copy.
+        //     do {
+        //       *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+        //     } while (src_ptr != end_ptr)
+        //   } else {
+        //     // Fast-path copy.
+        //     do {
+        //       *dest_ptr++ = *src_ptr++;
+        //     } while (src_ptr != end_ptr)
+        //   }
+
+        // Make sure `tmp` is not IP0, as it is clobbered by
+        // ReadBarrierMarkRegX entry points in
+        // ReadBarrierSystemArrayCopySlowPathARM64.
+        temps.Exclude(ip0);
         Register tmp = temps.AcquireW();
+        DCHECK_NE(LocationFrom(tmp).reg(), IP0);
+
+        // /* int32_t */ monitor = src->monitor_
+        __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
+        // /* LockWord */ lock_word = LockWord(monitor)
+        static_assert(sizeof(LockWord) == sizeof(int32_t),
+                      "art::LockWord and int32_t have different sizes.");
+
+        // Introduce a dependency on the lock_word including rb_state,
+        // to prevent load-load reordering, and without using
+        // a memory barrier (which would be more expensive).
+        // `src` is unchanged by this operation, but its value now depends
+        // on `tmp`.
+        __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
+
+        // Compute base source address, base destination address, and end
+        // source address for System.arraycopy* intrinsics in `src_base`,
+        // `dst_base` and `src_end` respectively.
+        // Note that `src_curr_addr` is computed from `src` (and
+        // `src_pos`) here, and thus honors the artificial dependency
+        // of `src` on `tmp`.
+        GenSystemArrayCopyAddresses(masm,
+                                    type,
+                                    src,
+                                    src_pos,
+                                    dest,
+                                    dest_pos,
+                                    length,
+                                    src_curr_addr,
+                                    dst_curr_addr,
+                                    src_stop_addr);
+
+        // Slow path used to copy array when `src` is gray.
+        SlowPathCodeARM64* read_barrier_slow_path =
+            new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp));
+        codegen_->AddSlowPath(read_barrier_slow_path);
+
+        // Given the numeric representation, it's enough to check the low bit of the rb_state.
+        static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+        static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+        __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
+
+        // Fast-path copy.
+        // Iterate over the arrays and do a raw copy of the objects. We don't need to
+        // poison/unpoison.
+        vixl::aarch64::Label loop;
+        __ Bind(&loop);
         __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
         __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
+        __ Cmp(src_curr_addr, src_stop_addr);
+        __ B(&loop, ne);
+
+        __ Bind(read_barrier_slow_path->GetExitLabel());
+      } else {
+        // Non read barrier code.
+        // Compute base source address, base destination address, and end
+        // source address for System.arraycopy* intrinsics in `src_base`,
+        // `dst_base` and `src_end` respectively.
+        GenSystemArrayCopyAddresses(masm,
+                                    type,
+                                    src,
+                                    src_pos,
+                                    dest,
+                                    dest_pos,
+                                    length,
+                                    src_curr_addr,
+                                    dst_curr_addr,
+                                    src_stop_addr);
+        // Iterate over the arrays and do a raw copy of the objects. We don't need to
+        // poison/unpoison.
+        vixl::aarch64::Label loop;
+        __ Bind(&loop);
+        {
+          Register tmp = temps.AcquireW();
+          __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
+          __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
+        }
+        __ Cmp(src_curr_addr, src_stop_addr);
+        __ B(&loop, ne);
       }
-      __ B(&loop);
       __ Bind(&done);
     }
   }
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 0167891..19ff49c 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -2280,109 +2280,116 @@
     __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   }
 
-  const Primitive::Type type = Primitive::kPrimNot;
-  const int32_t element_size = Primitive::ComponentSize(type);
-
-  // Compute the base source address in `temp1`.
-  GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
-  // Compute the end source address in `temp3`.
-  GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
-  // The base destination address is computed later, as `temp2` is
-  // used for intermediate computations.
-
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
-    // SystemArrayCopy implementation for Baker read barriers (see
-    // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
-    //
-    //   if (src_ptr != end_ptr) {
-    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
-    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-    //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-    //     if (is_gray) {
-    //       // Slow-path copy.
-    //       do {
-    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
-    //       } while (src_ptr != end_ptr)
-    //     } else {
-    //       // Fast-path copy.
-    //       do {
-    //         *dest_ptr++ = *src_ptr++;
-    //       } while (src_ptr != end_ptr)
-    //     }
-    //   }
-
-    vixl32::Label loop, done;
-
-    // Don't enter copy loop if `length == 0`.
-    __ Cmp(temp1, temp3);
-    __ B(eq, &done, /* far_target */ false);
-
-    // /* int32_t */ monitor = src->monitor_
-    __ Ldr(temp2, MemOperand(src, monitor_offset));
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
-
-    // Introduce a dependency on the lock_word including the rb_state,
-    // which shall prevent load-load reordering without using
-    // a memory barrier (which would be more expensive).
-    // `src` is unchanged by this operation, but its value now depends
-    // on `temp2`.
-    __ Add(src, src, Operand(temp2, vixl32::LSR, 32));
-
-    // Slow path used to copy array when `src` is gray.
-    // Note that the base destination address is computed in `temp2`
-    // by the slow path code.
-    SlowPathCodeARMVIXL* read_barrier_slow_path =
-        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARMVIXL(invoke);
-    codegen_->AddSlowPath(read_barrier_slow_path);
-
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with LSRS
-    // which can be a 16-bit instruction unlike the TST immediate.
-    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-    __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
-    // Carry flag is the last bit shifted out by LSRS.
-    __ B(cs, read_barrier_slow_path->GetEntryLabel());
-
-    // Fast-path copy.
-    // Compute the base destination address in `temp2`.
-    GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
-    // Iterate over the arrays and do a raw copy of the objects. We don't need to
-    // poison/unpoison.
-    __ Bind(&loop);
-    {
-      UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
-      const vixl32::Register temp_reg = temps.Acquire();
-      __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
-      __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
-    }
-    __ Cmp(temp1, temp3);
-    __ B(ne, &loop, /* far_target */ false);
-
-    __ Bind(read_barrier_slow_path->GetExitLabel());
-    __ Bind(&done);
+  if (length.IsConstant() && Int32ConstantFrom(length) == 0) {
+    // Null constant length: no need to emit the loop code at all.
   } else {
-    // Non read barrier code.
-    // Compute the base destination address in `temp2`.
-    GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
-    // Iterate over the arrays and do a raw copy of the objects. We don't need to
-    // poison/unpoison.
-    vixl32::Label loop, done;
-    __ Cmp(temp1, temp3);
-    __ B(eq, &done, /* far_target */ false);
-    __ Bind(&loop);
-    {
-      UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
-      const vixl32::Register temp_reg = temps.Acquire();
-      __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
-      __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
+    vixl32::Label done;
+    const Primitive::Type type = Primitive::kPrimNot;
+    const int32_t element_size = Primitive::ComponentSize(type);
+
+    if (length.IsRegister()) {
+      // Don't enter the copy loop if the length is null.
+      __ CompareAndBranchIfZero(RegisterFrom(length), &done, /* is_far_target */ false);
     }
-    __ Cmp(temp1, temp3);
-    __ B(ne, &loop, /* far_target */ false);
+
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // TODO: Also convert this intrinsic to the IsGcMarking strategy?
+
+      // SystemArrayCopy implementation for Baker read barriers (see
+      // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
+      //
+      //   uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+      //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+      //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+      //   if (is_gray) {
+      //     // Slow-path copy.
+      //     do {
+      //       *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+      //     } while (src_ptr != end_ptr)
+      //   } else {
+      //     // Fast-path copy.
+      //     do {
+      //       *dest_ptr++ = *src_ptr++;
+      //     } while (src_ptr != end_ptr)
+      //   }
+
+      // /* int32_t */ monitor = src->monitor_
+      __ Ldr(temp2, MemOperand(src, monitor_offset));
+      // /* LockWord */ lock_word = LockWord(monitor)
+      static_assert(sizeof(LockWord) == sizeof(int32_t),
+                    "art::LockWord and int32_t have different sizes.");
+
+      // Introduce a dependency on the lock_word including the rb_state,
+      // which shall prevent load-load reordering without using
+      // a memory barrier (which would be more expensive).
+      // `src` is unchanged by this operation, but its value now depends
+      // on `temp2`.
+      __ Add(src, src, Operand(temp2, vixl32::LSR, 32));
+
+      // Compute the base source address in `temp1`.
+      // Note that `temp1` (the base source address) is computed from
+      // `src` (and `src_pos`) here, and thus honors the artificial
+      // dependency of `src` on `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
+      // Compute the end source address in `temp3`.
+      GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
+      // The base destination address is computed later, as `temp2` is
+      // used for intermediate computations.
+
+      // Slow path used to copy array when `src` is gray.
+      // Note that the base destination address is computed in `temp2`
+      // by the slow path code.
+      SlowPathCodeARMVIXL* read_barrier_slow_path =
+          new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARMVIXL(invoke);
+      codegen_->AddSlowPath(read_barrier_slow_path);
+
+      // Given the numeric representation, it's enough to check the low bit of the
+      // rb_state. We do that by shifting the bit out of the lock word with LSRS
+      // which can be a 16-bit instruction unlike the TST immediate.
+      static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+      static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+      __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
+      // Carry flag is the last bit shifted out by LSRS.
+      __ B(cs, read_barrier_slow_path->GetEntryLabel());
+
+      // Fast-path copy.
+      // Compute the base destination address in `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      vixl32::Label loop;
+      __ Bind(&loop);
+      {
+        UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+        const vixl32::Register temp_reg = temps.Acquire();
+        __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
+        __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
+      }
+      __ Cmp(temp1, temp3);
+      __ B(ne, &loop, /* far_target */ false);
+
+      __ Bind(read_barrier_slow_path->GetExitLabel());
+    } else {
+      // Non read barrier code.
+      // Compute the base source address in `temp1`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
+      // Compute the base destination address in `temp2`.
+      GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
+      // Compute the end source address in `temp3`.
+      GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      vixl32::Label loop;
+      __ Bind(&loop);
+      {
+        UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+        const vixl32::Register temp_reg = temps.Acquire();
+        __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
+        __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
+      }
+      __ Cmp(temp1, temp3);
+      __ B(ne, &loop, /* far_target */ false);
+    }
     __ Bind(&done);
   }