optimizing: Build HConstructorFence for HNewArray/HNewInstance nodes

Also fixes:
* LSE, code_sinking to keep optimizing new-instance if it did so before
* Various tests to expect constructor fences after new-instance

Sidenote: new-instance String does not get a ConstructorFence; the
special StringFactory calls are assumed to be self-fencing.

Metric changes on go/lem:
* CodeSize -0.262% in ART-Compile (ARMv8)
* RunTime -0.747% for all (linux-armv8)

(No changes expected on x86, where constructor fences are no-ops.)

The RunTime regression is temporary until the art_quick_alloc_* entrypoints
have their DMBs removed in a follow-up CL.

Test: art/test.py
Bug: 36656456
Change-Id: I6a936a6e51c623e1c6b5b22eee5c3c72bebbed35
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index 8b79da8..40fafb0 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -962,7 +962,7 @@
                       false /* is_unresolved */);
 }
 
-bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
+HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
 
   HLoadClass* load_class = BuildLoadClass(type_index, dex_pc);
@@ -986,14 +986,65 @@
   // Consider classes we haven't resolved as potentially finalizable.
   bool finalizable = (klass == nullptr) || klass->IsFinalizable();
 
-  AppendInstruction(new (arena_) HNewInstance(
+  HNewInstance* new_instance = new (arena_) HNewInstance(
       cls,
       dex_pc,
       type_index,
       *dex_compilation_unit_->GetDexFile(),
       finalizable,
-      entrypoint));
-  return true;
+      entrypoint);
+  AppendInstruction(new_instance);
+
+  return new_instance;
+}
+
+// Appends an HConstructorFence for `allocation`; skipped when the class is known to be String.
+void HInstructionBuilder::BuildConstructorFenceForAllocation(HInstruction* allocation) {
+  DCHECK(allocation != nullptr);
+  DCHECK(allocation->IsNewInstance() || allocation->IsNewArray());  // "new" keyword in JLS.
+
+  if (allocation->IsNewInstance()) {
+    // STRING SPECIAL HANDLING:
+    // -------------------------------
+    // Strings have a real HNewInstance node but they end up always having 0 uses.
+    // All uses of a String HNewInstance are always transformed to replace their input
+    // of the HNewInstance with an input of the invoke to StringFactory.
+    //
+    // Do not emit an HConstructorFence here since it can inhibit some String new-instance
+    // optimizations (to pass checker tests that rely on those optimizations).
+    HNewInstance* new_inst = allocation->AsNewInstance();
+    HLoadClass* load_class = new_inst->GetLoadClass();
+
+    Thread* self = Thread::Current();
+    ScopedObjectAccess soa(self);
+    StackHandleScope<1> hs(self);
+    Handle<mirror::Class> klass = load_class->GetClass();
+    if (klass != nullptr && klass->IsStringClass()) {
+      return;
+      // Note: Do not use allocation->IsStringAlloc which requires
+      // a valid ReferenceTypeInfo, but that doesn't get made until after reference type
+      // propagation (and instruction builder is too early).
+    }
+    // (In terms of correctness, the StringFactory needs to provide its own
+    // default initialization barrier, see below.)
+  }
+
+  // JLS 17.4.5 "Happens-before Order" describes:
+  //
+  //   The default initialization of any object happens-before any other actions (other than
+  //   default-writes) of a program.
+  //
+  // In our implementation the default initialization of an object to type T means
+  // setting all of its initial data (object[0..size)) to 0, and setting the
+  // object's class header (i.e. object.getClass() == T.class).
+  //
+  // In practice this fence ensures that the writes to the object header
+  // are visible to other threads if this object escapes the current thread.
+  // (and in theory the 0-initializing, but that happens automatically
+  // when new memory pages are mapped in by the OS).
+  HConstructorFence* ctor_fence =
+      new (arena_) HConstructorFence(allocation, allocation->GetDexPc(), arena_);
+  AppendInstruction(ctor_fence);
 }
 
 static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
@@ -1522,15 +1573,15 @@
   graph_->SetHasBoundsChecks(true);
 }
 
-void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
-                                              dex::TypeIndex type_index,
-                                              uint32_t number_of_vreg_arguments,
-                                              bool is_range,
-                                              uint32_t* args,
-                                              uint32_t register_index) {
+HNewArray* HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
+                                                    dex::TypeIndex type_index,
+                                                    uint32_t number_of_vreg_arguments,
+                                                    bool is_range,
+                                                    uint32_t* args,
+                                                    uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
   HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-  HInstruction* object = new (arena_) HNewArray(cls, length, dex_pc);
+  HNewArray* const object = new (arena_) HNewArray(cls, length, dex_pc);
   AppendInstruction(object);
 
   const char* descriptor = dex_file_->StringByTypeIdx(type_index);
@@ -1550,6 +1601,8 @@
     AppendInstruction(aset);
   }
   latest_result_ = object;
+
+  return object;
 }
 
 template <typename T>
@@ -2534,10 +2587,12 @@
     }
 
     case Instruction::NEW_INSTANCE: {
-      if (!BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc)) {
-        return false;
-      }
+      HNewInstance* new_instance =
+          BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc);
+      DCHECK(new_instance != nullptr);
+
       UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_instance);
       break;
     }
 
@@ -2545,8 +2600,11 @@
       dex::TypeIndex type_index(instruction.VRegC_22c());
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt);
       HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-      AppendInstruction(new (arena_) HNewArray(cls, length, dex_pc));
+
+      HNewArray* new_array = new (arena_) HNewArray(cls, length, dex_pc);
+      AppendInstruction(new_array);
       UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2555,7 +2613,13 @@
       dex::TypeIndex type_index(instruction.VRegB_35c());
       uint32_t args[5];
       instruction.GetVarArgs(args);
-      BuildFilledNewArray(dex_pc, type_index, number_of_vreg_arguments, false, args, 0);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ false,
+                                                 args,
+                                                 /* register_index */ 0);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2563,8 +2627,13 @@
       uint32_t number_of_vreg_arguments = instruction.VRegA_3rc();
       dex::TypeIndex type_index(instruction.VRegB_3rc());
       uint32_t register_index = instruction.VRegC_3rc();
-      BuildFilledNewArray(
-          dex_pc, type_index, number_of_vreg_arguments, true, nullptr, register_index);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ true,
+                                                 /* args*/ nullptr,
+                                                 register_index);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }