Avoid read barriers for inlined check cast

Avoiding read barriers improves speed and reduces code size.

Doing this can never result in false positives, only false negatives.
These false negatives are handled correcly by rechecking in the
entrypoint.

Ritzperf code size for CC:
arm32: 13439400->13300136 (-1.04%)
arm64: 16405120->16253568 (-0.92%)

Perf: TODO

Bug: 29516974
Bug: 12687968

Test: test-art-host, run ritzperf both with CC

Change-Id: Ie024e0b1e8ee415781fb73e8029e87e8a5318f86
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 08227fc..8d0f203 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -6033,7 +6033,7 @@
   return 0;
 }
 
-// InteraceCheck has 3 temps, one for holding the number of interfaces, one for the current
+// Interface case has 3 temps, one for holding the number of interfaces, one for the current
 // interface pointer, one for loading the current interface.
 // The other checks have one temp for loading the object's class.
 static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
@@ -6123,7 +6123,11 @@
       Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ cmp(out, ShifterOperand(cls));
@@ -6142,7 +6146,11 @@
       __ cmp(out, ShifterOperand(cls));
       __ b(&success, EQ);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ b(&done);
@@ -6161,7 +6169,11 @@
       __ b(&exact_check, EQ);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done);
       __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
@@ -6281,12 +6293,15 @@
   const uint32_t object_array_data_offset =
       mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
-  bool is_type_check_slow_path_fatal =
-      (type_check_kind == TypeCheckKind::kExactCheck ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
-      !instruction->CanThrowIntoCatchBlock();
+  bool is_type_check_slow_path_fatal = false;
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
   SlowPathCodeARM* type_check_slow_path =
       new (GetGraph()->GetArena()) TypeCheckSlowPathARM(instruction,
                                                         is_type_check_slow_path_fatal);
@@ -6307,7 +6322,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       __ cmp(temp, ShifterOperand(cls));
       // Jump to slow path for throwing the exception or doing a
@@ -6323,14 +6338,18 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6349,7 +6368,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Walk over the class hierarchy to find a match.
       Label loop;
@@ -6358,7 +6377,11 @@
       __ b(&done, EQ);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6375,7 +6398,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Do an exact check.
       __ cmp(temp, ShifterOperand(cls));
@@ -6383,7 +6406,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
       // If the component type is null, jump to the slow path to throw the exception.
       __ CompareAndBranchIfZero(temp, type_check_slow_path->GetEntryLabel());
       // Otherwise,the object is indeed an array, jump to label `check_non_primitive_component_type`
@@ -6727,9 +6754,11 @@
 void InstructionCodeGeneratorARM::GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                                                    Location out,
                                                                    uint32_t offset,
-                                                                   Location maybe_temp) {
+                                                                   Location maybe_temp,
+                                                                   bool emit_read_barrier) {
   Register out_reg = out.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (emit_read_barrier) {
+    CHECK(kEmitCompilerReadBarrier);
     DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
@@ -6763,7 +6792,7 @@
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
   if (emit_read_barrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 6561984..e953df8 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -263,7 +263,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        bool emit_read_barrier);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index aef46c8..3d81635 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3333,7 +3333,7 @@
   return 0;
 }
 
-// InteraceCheck has 3 temps, one for holding the number of interfaces, one for the current
+// Interface case has 3 temps, one for holding the number of interfaces, one for the current
 // interface pointer, one for loading the current interface.
 // The other checks have one temp for loading the object's class.
 static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
@@ -3425,7 +3425,11 @@
       vixl::aarch64::Label loop, success;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Cmp(out, cls);
@@ -3444,7 +3448,11 @@
       __ Cmp(out, cls);
       __ B(eq, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ Cbnz(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ B(&done);
@@ -3463,7 +3471,11 @@
       __ B(eq, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ Cbz(out, &done);
       __ Ldrh(out, HeapOperand(out, primitive_offset));
@@ -3585,12 +3597,15 @@
   const uint32_t object_array_data_offset =
       mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
 
-  bool is_type_check_slow_path_fatal =
-      (type_check_kind == TypeCheckKind::kExactCheck ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
-       type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
-       type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
-      !instruction->CanThrowIntoCatchBlock();
+  bool is_type_check_slow_path_fatal = false;
+  if (!kEmitCompilerReadBarrier) {
+    is_type_check_slow_path_fatal =
+        (type_check_kind == TypeCheckKind::kExactCheck ||
+         type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+         type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
+         type_check_kind == TypeCheckKind::kArrayObjectCheck) &&
+        !instruction->CanThrowIntoCatchBlock();
+  }
   SlowPathCodeARM64* type_check_slow_path =
       new (GetGraph()->GetArena()) TypeCheckSlowPathARM64(instruction,
                                                           is_type_check_slow_path_fatal);
@@ -3611,7 +3626,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       __ Cmp(temp, cls);
       // Jump to slow path for throwing the exception or doing a
@@ -3627,14 +3642,18 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       vixl::aarch64::Label loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -3652,7 +3671,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Walk over the class hierarchy to find a match.
       vixl::aarch64::Label loop;
@@ -3661,7 +3680,11 @@
       __ B(eq, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is not null, jump
       // back at the beginning of the loop.
@@ -3678,7 +3701,7 @@
                                         obj_loc,
                                         class_offset,
                                         maybe_temp2_loc,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Do an exact check.
       __ Cmp(temp, cls);
@@ -3686,7 +3709,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the component type is null, jump to the slow path to throw the exception.
       __ Cbz(temp, type_check_slow_path->GetEntryLabel());
@@ -5247,10 +5274,12 @@
 void InstructionCodeGeneratorARM64::GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                                                      Location out,
                                                                      uint32_t offset,
-                                                                     Location maybe_temp) {
+                                                                     Location maybe_temp,
+                                                                     bool emit_read_barrier) {
   Primitive::Type type = Primitive::kPrimNot;
   Register out_reg = RegisterFrom(out, type);
-  if (kEmitCompilerReadBarrier) {
+  if (emit_read_barrier) {
+    CHECK(kEmitCompilerReadBarrier);
     Register temp_reg = RegisterFrom(maybe_temp, type);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
@@ -5290,7 +5319,7 @@
   Register out_reg = RegisterFrom(out, type);
   Register obj_reg = RegisterFrom(obj, type);
   if (emit_read_barrier) {
-    DCHECK(kEmitCompilerReadBarrier);
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       Register temp_reg = RegisterFrom(maybe_temp, type);
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index e8518f6..e9a47b6 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -269,7 +269,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        bool emit_read_barrier);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e556044..7280e87 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6349,7 +6349,7 @@
   return 0;
 }
 
-// InteraceCheck has 3 temps, one for holding the number of interfaces, one for the current
+// Interface case has 3 temps, one for holding the number of interfaces, one for the current
 // interface pointer, one for loading the current interface.
 // The other checks have one temp for loading the object's class.
 static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
@@ -6445,7 +6445,11 @@
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6475,7 +6479,11 @@
       }
       __ j(kEqual, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -6500,7 +6508,11 @@
       __ j(kEqual, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6672,7 +6684,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
@@ -6692,14 +6704,18 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6723,7 +6739,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -6737,7 +6753,11 @@
       __ j(kEqual, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is not null, jump
       // back at the beginning of the loop.
@@ -6754,7 +6774,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Do an exact check.
       if (cls.IsRegister()) {
@@ -6767,7 +6787,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the component type is null (i.e. the object not an array),  jump to the slow path to
       // throw the exception. Otherwise proceed with the check.
@@ -6795,7 +6819,7 @@
       // Fast path for the interface check. Since we compare with a memory location in the inner
       // loop we would need to have cls poisoned. However unpoisoning cls would reset the
       // conditional flags and cause the conditional jump to be incorrect. Therefore we just jump
-      // to the slow path if we are running under poisoning
+      // to the slow path if we are running under poisoning.
       if (!kPoisonHeapReferences) {
         // /* HeapReference<Class> */ temp = obj->klass_
         GenerateReferenceLoadTwoRegisters(instruction,
@@ -7002,9 +7026,10 @@
 void InstructionCodeGeneratorX86::GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                                                    Location out,
                                                                    uint32_t offset,
-                                                                   Location maybe_temp) {
+                                                                   Location maybe_temp,
+                                                                   bool emit_read_barrier) {
   Register out_reg = out.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (emit_read_barrier) {
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index d224902..87295a4 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -240,7 +240,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        bool emit_read_barrier);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 3612c75..1abda7d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5854,7 +5854,11 @@
       NearLabel loop, success;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -5884,7 +5888,11 @@
       }
       __ j(kEqual, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -5909,7 +5917,11 @@
       __ j(kEqual, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6056,22 +6068,23 @@
                                                            is_type_check_slow_path_fatal);
   codegen_->AddSlowPath(type_check_slow_path);
 
+
+  NearLabel done;
+  // Avoid null check if we know obj is not null.
+  if (instruction->MustDoNullCheck()) {
+    __ testl(obj, obj);
+    __ j(kEqual, &done);
+  }
+
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
     case TypeCheckKind::kArrayCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
       } else {
@@ -6081,30 +6094,26 @@
       // Jump to slow path for throwing the exception or doing a
       // more involved array check.
       __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kAbstractClassCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6118,24 +6127,16 @@
         __ cmpl(temp, Address(CpuRegister(RSP), cls.GetStackIndex()));
       }
       __ j(kNotEqual, &loop);
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kClassHierarchyCheck: {
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
       __ Bind(&loop);
@@ -6148,7 +6149,11 @@
       __ j(kEqual, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is not null, jump
       // back at the beginning of the loop.
@@ -6156,28 +6161,16 @@
       __ j(kNotZero, &loop);
       // Otherwise, jump to the slow path to throw the exception.
       __ jmp(type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kArrayObjectCheck: {
-      // We cannot use a NearLabel here, as its range might be too
-      // short in some cases when read barriers are enabled.  This has
-      // been observed for instance when the code emitted for this
-      // case uses high x86-64 registers (R8-R15).
-      Label done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // /* HeapReference<Class> */ temp = obj->klass_
       GenerateReferenceLoadTwoRegisters(instruction,
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
       // Do an exact check.
       NearLabel check_non_primitive_component_type;
       if (cls.IsRegister()) {
@@ -6190,7 +6183,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the component type is not null (i.e. the object is indeed
       // an array), jump to label `check_non_primitive_component_type`
@@ -6201,7 +6198,6 @@
       __ j(kZero, type_check_slow_path->GetEntryLabel());
       __ cmpw(Address(temp, primitive_offset), Immediate(Primitive::kPrimNot));
       __ j(kNotEqual, type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
@@ -6215,27 +6211,11 @@
       // instruction (following the runtime calling convention), which
       // might be cluttered by the potential first read barrier
       // emission at the beginning of this method.
-
-      NearLabel done;
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
       __ jmp(type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
     }
 
     case TypeCheckKind::kInterfaceCheck:
-      NearLabel done;
-
-      // Avoid null check if we know obj is not null.
-      if (instruction->MustDoNullCheck()) {
-        __ testl(obj, obj);
-        __ j(kEqual, &done);
-      }
-
       // Fast path for the interface check. We always go slow path for heap poisoning since
       // unpoisoning cls would require an extra temp.
       if (!kPoisonHeapReferences) {
@@ -6273,10 +6253,13 @@
         __ Bind(&is_null);
       }
       __ jmp(type_check_slow_path->GetEntryLabel());
-      __ Bind(&done);
       break;
   }
 
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
+
   __ Bind(type_check_slow_path->GetExitLabel());
 }
 
@@ -6416,9 +6399,11 @@
 void InstructionCodeGeneratorX86_64::GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                                                       Location out,
                                                                       uint32_t offset,
-                                                                      Location maybe_temp) {
+                                                                      Location maybe_temp,
+                                                                      bool emit_read_barrier) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
-  if (kEmitCompilerReadBarrier) {
+  if (emit_read_barrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -6451,6 +6436,7 @@
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   CpuRegister obj_reg = obj.AsRegister<CpuRegister>();
   if (emit_read_barrier) {
+    CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 5a6dc54..807a9f1 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -234,7 +234,8 @@
   void GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                         Location out,
                                         uint32_t offset,
-                                        Location maybe_temp);
+                                        Location maybe_temp,
+                                        bool emit_read_barrier);
   // Generate a heap reference load using two different registers
   // `out` and `obj`:
   //