Avoid read barriers for inlined check cast

Avoiding read barriers improves speed and reduces code size.

Doing this can never result in false positives, only false negatives.
These false negatives are handled correcly by rechecking in the
entrypoint.

Ritzperf code size for CC:
arm32: 13439400->13300136 (-1.04%)
arm64: 16405120->16253568 (-0.92%)

Perf: TODO

Bug: 29516974
Bug: 12687968

Test: test-art-host, run ritzperf both with CC

Change-Id: Ie024e0b1e8ee415781fb73e8029e87e8a5318f86
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e556044..7280e87 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6349,7 +6349,7 @@
   return 0;
 }
 
-// InteraceCheck has 3 temps, one for holding the number of interfaces, one for the current
+// Interface case has 3 temps, one for holding the number of interfaces, one for the current
 // interface pointer, one for loading the current interface.
 // The other checks have one temp for loading the object's class.
 static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) {
@@ -6445,7 +6445,11 @@
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6475,7 +6479,11 @@
       }
       __ j(kEqual, &success);
       // /* HeapReference<Class> */ out = out->super_class_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, super_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       super_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       __ j(kNotEqual, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
@@ -6500,7 +6508,11 @@
       __ j(kEqual, &exact_check);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
-      GenerateReferenceLoadOneRegister(instruction, out_loc, component_offset, maybe_temp_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       out_loc,
+                                       component_offset,
+                                       maybe_temp_loc,
+                                       kEmitCompilerReadBarrier);
       __ testl(out, out);
       // If `out` is null, we use it for the result, and jump to `done`.
       __ j(kEqual, &done);
@@ -6672,7 +6684,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<Register>());
@@ -6692,14 +6704,18 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
       NearLabel loop;
       __ Bind(&loop);
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is null, jump to the slow path to throw the
       // exception.
@@ -6723,7 +6739,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -6737,7 +6753,11 @@
       __ j(kEqual, &done);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, super_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       super_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the class reference currently in `temp` is not null, jump
       // back at the beginning of the loop.
@@ -6754,7 +6774,7 @@
                                         temp_loc,
                                         obj_loc,
                                         class_offset,
-                                        kEmitCompilerReadBarrier);
+                                        /*emit_read_barrier*/ false);
 
       // Do an exact check.
       if (cls.IsRegister()) {
@@ -6767,7 +6787,11 @@
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
-      GenerateReferenceLoadOneRegister(instruction, temp_loc, component_offset, maybe_temp2_loc);
+      GenerateReferenceLoadOneRegister(instruction,
+                                       temp_loc,
+                                       component_offset,
+                                       maybe_temp2_loc,
+                                       /*emit_read_barrier*/ false);
 
       // If the component type is null (i.e. the object not an array),  jump to the slow path to
       // throw the exception. Otherwise proceed with the check.
@@ -6795,7 +6819,7 @@
       // Fast path for the interface check. Since we compare with a memory location in the inner
       // loop we would need to have cls poisoned. However unpoisoning cls would reset the
       // conditional flags and cause the conditional jump to be incorrect. Therefore we just jump
-      // to the slow path if we are running under poisoning
+      // to the slow path if we are running under poisoning.
       if (!kPoisonHeapReferences) {
         // /* HeapReference<Class> */ temp = obj->klass_
         GenerateReferenceLoadTwoRegisters(instruction,
@@ -7002,9 +7026,10 @@
 void InstructionCodeGeneratorX86::GenerateReferenceLoadOneRegister(HInstruction* instruction,
                                                                    Location out,
                                                                    uint32_t offset,
-                                                                   Location maybe_temp) {
+                                                                   Location maybe_temp,
+                                                                   bool emit_read_barrier) {
   Register out_reg = out.AsRegister<Register>();
-  if (kEmitCompilerReadBarrier) {
+  if (emit_read_barrier) {
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)