optimizing: Build HConstructorFence for HNewArray/HNewInstance nodes

Also fixes:
* LSE, code_sinking to keep optimizing new-instance if it did so before
* Various tests to expect constructor fences after new-instance

Sidenote: new-instance String does not get a ConstructorFence; the
special StringFactory calls are assumed to be self-fencing.

Metric changes on go/lem:
* CodeSize -0.262% in ART-Compile (ARMv8)
* RunTime -0.747% for all (linux-armv8)

(No changes expected on x86; constructor fences are a no-op there.)

The RunTime regression is temporary until the art_quick_alloc_* entrypoints have their
DMBs removed in a follow-up CL.

Test: art/test.py
Bug: 36656456
Change-Id: I6a936a6e51c623e1c6b5b22eee5c3c72bebbed35
diff --git a/compiler/optimizing/code_sinking.cc b/compiler/optimizing/code_sinking.cc
index 0b4dcd3..e598e19 100644
--- a/compiler/optimizing/code_sinking.cc
+++ b/compiler/optimizing/code_sinking.cc
@@ -56,6 +56,17 @@
     return true;
   }
 
+  // Check it is safe to move ConstructorFence.
+  // (It is safe to move a ConstructorFence that protects only the new-instance, but not one protecting final fields.)
+  if (instruction->IsConstructorFence()) {
+    HConstructorFence* ctor_fence = instruction->AsConstructorFence();
+
+    // A fence with "0" inputs is dead and should've been removed in a prior pass.
+    DCHECK_NE(0u, ctor_fence->InputCount());
+
+    return ctor_fence->GetAssociatedAllocation() != nullptr;
+  }
+
   // All other instructions that can throw cannot be moved.
   if (instruction->CanThrow()) {
     return false;
@@ -134,11 +145,11 @@
                             HInstruction* user,
                             const ArenaBitVector& post_dominated) {
   if (instruction->IsNewInstance()) {
-    return user->IsInstanceFieldSet() &&
+    return (user->IsInstanceFieldSet() || user->IsConstructorFence()) &&
         (user->InputAt(0) == instruction) &&
         !post_dominated.IsBitSet(user->GetBlock()->GetBlockId());
   } else if (instruction->IsNewArray()) {
-    return user->IsArraySet() &&
+    return (user->IsArraySet() || user->IsConstructorFence()) &&
         (user->InputAt(0) == instruction) &&
         !post_dominated.IsBitSet(user->GetBlock()->GetBlockId());
   }
@@ -372,7 +383,9 @@
   // Step (3): Try to move sinking candidates.
   for (HInstruction* instruction : move_in_order) {
     HInstruction* position = nullptr;
-    if (instruction->IsArraySet() || instruction->IsInstanceFieldSet()) {
+    if (instruction->IsArraySet()
+            || instruction->IsInstanceFieldSet()
+            || instruction->IsConstructorFence()) {
       if (!instructions_that_can_move.IsBitSet(instruction->InputAt(0)->GetId())) {
         // A store can trivially move, but it can safely do so only if the heap
         // location it stores to can also move.
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index 8b79da8..40fafb0 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -962,7 +962,7 @@
                       false /* is_unresolved */);
 }
 
-bool HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
+HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc) {
   ScopedObjectAccess soa(Thread::Current());
 
   HLoadClass* load_class = BuildLoadClass(type_index, dex_pc);
@@ -986,14 +986,65 @@
   // Consider classes we haven't resolved as potentially finalizable.
   bool finalizable = (klass == nullptr) || klass->IsFinalizable();
 
-  AppendInstruction(new (arena_) HNewInstance(
+  HNewInstance* new_instance = new (arena_) HNewInstance(
       cls,
       dex_pc,
       type_index,
       *dex_compilation_unit_->GetDexFile(),
       finalizable,
-      entrypoint));
-  return true;
+      entrypoint);
+  AppendInstruction(new_instance);
+
+  return new_instance;
+}
+
+void HInstructionBuilder::BuildConstructorFenceForAllocation(HInstruction* allocation) {
+  DCHECK(allocation != nullptr &&
+             allocation->IsNewInstance() ||
+             allocation->IsNewArray());  // Corresponds to the "new" keyword in the JLS. NOTE(review): '&&' binds tighter than '||', so a null `allocation` would reach IsNewArray(); the intended check is likely (allocation != nullptr && (IsNewInstance() || IsNewArray())).
+
+  if (allocation->IsNewInstance()) {
+    // STRING SPECIAL HANDLING:
+    // -------------------------------
+    // Strings have a real HNewInstance node but they end up always having 0 uses.
+    // All uses of a String HNewInstance are always transformed to replace their input
+    // of the HNewInstance with an input of the invoke to StringFactory.
+    //
+    // Do not emit an HConstructorFence here since it can inhibit some String new-instance
+    // optimizations (to pass checker tests that rely on those optimizations).
+    HNewInstance* new_inst = allocation->AsNewInstance();
+    HLoadClass* load_class = new_inst->GetLoadClass();
+
+    Thread* self = Thread::Current();
+    ScopedObjectAccess soa(self);
+    StackHandleScope<1> hs(self);
+    Handle<mirror::Class> klass = load_class->GetClass();
+    if (klass != nullptr && klass->IsStringClass()) {
+      return;
+      // Note: Do not use allocation->IsStringAlloc which requires
+      // a valid ReferenceTypeInfo, but that doesn't get made until after reference type
+      // propagation (and instruction builder is too early).
+    }
+    // (In terms of correctness, the StringFactory needs to provide its own
+    // default initialization barrier, see below.)
+  }
+
+  // JLS 17.4.5 "Happens-before Order" describes:
+  //
+  //   The default initialization of any object happens-before any other actions (other than
+  //   default-writes) of a program.
+  //
+  // In our implementation the default initialization of an object to type T means
+  // setting all of its initial data (object[0..size)) to 0, and setting the
+  // object's class header (i.e. object.getClass() == T.class).
+  //
+  // In practice this fence ensures that the writes to the object header
+  // are visible to other threads if this object escapes the current thread.
+  // (and in theory the 0-initializing, but that happens automatically
+  // when new memory pages are mapped in by the OS).
+  HConstructorFence* ctor_fence =
+      new (arena_) HConstructorFence(allocation, allocation->GetDexPc(), arena_);
+  AppendInstruction(ctor_fence);
 }
 
 static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
@@ -1522,15 +1573,15 @@
   graph_->SetHasBoundsChecks(true);
 }
 
-void HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
-                                              dex::TypeIndex type_index,
-                                              uint32_t number_of_vreg_arguments,
-                                              bool is_range,
-                                              uint32_t* args,
-                                              uint32_t register_index) {
+HNewArray* HInstructionBuilder::BuildFilledNewArray(uint32_t dex_pc,
+                                                    dex::TypeIndex type_index,
+                                                    uint32_t number_of_vreg_arguments,
+                                                    bool is_range,
+                                                    uint32_t* args,
+                                                    uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
   HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-  HInstruction* object = new (arena_) HNewArray(cls, length, dex_pc);
+  HNewArray* const object = new (arena_) HNewArray(cls, length, dex_pc);
   AppendInstruction(object);
 
   const char* descriptor = dex_file_->StringByTypeIdx(type_index);
@@ -1550,6 +1601,8 @@
     AppendInstruction(aset);
   }
   latest_result_ = object;
+
+  return object;
 }
 
 template <typename T>
@@ -2534,10 +2587,12 @@
     }
 
     case Instruction::NEW_INSTANCE: {
-      if (!BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc)) {
-        return false;
-      }
+      HNewInstance* new_instance =
+          BuildNewInstance(dex::TypeIndex(instruction.VRegB_21c()), dex_pc);
+      DCHECK(new_instance != nullptr);
+
       UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_instance);
       break;
     }
 
@@ -2545,8 +2600,11 @@
       dex::TypeIndex type_index(instruction.VRegC_22c());
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt);
       HLoadClass* cls = BuildLoadClass(type_index, dex_pc);
-      AppendInstruction(new (arena_) HNewArray(cls, length, dex_pc));
+
+      HNewArray* new_array = new (arena_) HNewArray(cls, length, dex_pc);
+      AppendInstruction(new_array);
       UpdateLocal(instruction.VRegA_22c(), current_block_->GetLastInstruction());
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2555,7 +2613,13 @@
       dex::TypeIndex type_index(instruction.VRegB_35c());
       uint32_t args[5];
       instruction.GetVarArgs(args);
-      BuildFilledNewArray(dex_pc, type_index, number_of_vreg_arguments, false, args, 0);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ false,
+                                                 args,
+                                                 /* register_index */ 0);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
@@ -2563,8 +2627,13 @@
       uint32_t number_of_vreg_arguments = instruction.VRegA_3rc();
       dex::TypeIndex type_index(instruction.VRegB_3rc());
       uint32_t register_index = instruction.VRegC_3rc();
-      BuildFilledNewArray(
-          dex_pc, type_index, number_of_vreg_arguments, true, nullptr, register_index);
+      HNewArray* new_array = BuildFilledNewArray(dex_pc,
+                                                 type_index,
+                                                 number_of_vreg_arguments,
+                                                 /* is_range */ true,
+                                                 /* args */ nullptr,
+                                                 register_index);
+      BuildConstructorFenceForAllocation(new_array);
       break;
     }
 
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index 2fb5c7b..e968760 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -194,12 +194,12 @@
                               uint32_t register_index);
 
   // Builds a new array node and the instructions that fill it.
-  void BuildFilledNewArray(uint32_t dex_pc,
-                           dex::TypeIndex type_index,
-                           uint32_t number_of_vreg_arguments,
-                           bool is_range,
-                           uint32_t* args,
-                           uint32_t register_index);
+  HNewArray* BuildFilledNewArray(uint32_t dex_pc,
+                                 dex::TypeIndex type_index,
+                                 uint32_t number_of_vreg_arguments,
+                                 bool is_range,
+                                 uint32_t* args,
+                                 uint32_t register_index);
 
   void BuildFillArrayData(const Instruction& instruction, uint32_t dex_pc);
 
@@ -288,7 +288,11 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Build a HNewInstance instruction.
-  bool BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc);
+  HNewInstance* BuildNewInstance(dex::TypeIndex type_index, uint32_t dex_pc);
+
+  // Build an HConstructorFence for HNewInstance and HNewArray instructions. This ensures the
+  // happens-before ordering for the default-initialization of the object referred to by `allocation`.
+  void BuildConstructorFenceForAllocation(HInstruction* allocation);
 
   // Return whether the compiler can assume `cls` is initialized.
   bool IsInitialized(Handle<mirror::Class> cls) const
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 8d8cc93..76c9d23 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -578,9 +578,7 @@
       }
     }
     for (HInstruction* new_array : singleton_new_arrays_) {
-      // TODO: Delete constructor fences for new-array
-      // In the future HNewArray instructions will have HConstructorFence's for them.
-      // HConstructorFence::RemoveConstructorFences(new_array);
+      HConstructorFence::RemoveConstructorFences(new_array);
 
       if (!new_array->HasNonEnvironmentUses()) {
         new_array->RemoveEnvironmentUsers();
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index f250c1a..1460b26 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1234,6 +1234,20 @@
   }
 }
 
+HInstruction* HConstructorFence::GetAssociatedAllocation() {
+  HInstruction* new_instance_inst = GetPrevious();
+  // Check if the immediately preceding instruction is a new-instance/new-array.
+  // Otherwise this fence is for protecting final fields.
+  if (new_instance_inst != nullptr &&
+      (new_instance_inst->IsNewInstance() || new_instance_inst->IsNewArray())) {
+    // TODO: Need to update this code to handle multiple inputs.
+    DCHECK_EQ(InputCount(), 1u);
+    return new_instance_inst;
+  } else {
+    return nullptr;
+  }
+}
+
 #define DEFINE_ACCEPT(name, super)                                             \
 void H##name::Accept(HGraphVisitor* visitor) {                                 \
   visitor->Visit##name(this);                                                  \
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e40361e..72521fd 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -6650,6 +6650,14 @@
   // still considered live.
   static void RemoveConstructorFences(HInstruction* instruction);
 
+  // Check if this constructor fence is protecting
+  // an HNewInstance or HNewArray that is also the immediate
+  // predecessor of `this`.
+  //
+  // Returns the associated HNewArray or HNewInstance,
+  // or null otherwise.
+  HInstruction* GetAssociatedAllocation();
+
   DECLARE_INSTRUCTION(ConstructorFence);
 
  private:
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index c3c141b..aa42fd6 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -168,7 +168,39 @@
 }
 
 void PrepareForRegisterAllocation::VisitConstructorFence(HConstructorFence* constructor_fence) {
-  // Delete all the inputs to the constructor fence;
+  // Trivially remove redundant HConstructorFence when it immediately follows an HNewInstance
+  // to an uninitialized class. In this special case, the art_quick_alloc_object_resolved
+  // will already have the 'dmb' which is strictly stronger than an HConstructorFence.
+  //
+  // The instruction builder always emits "x = HNewInstance; HConstructorFence(x)" so this
+  // is effectively pattern-matching that particular case and undoing the redundancy the builder
+  // had introduced.
+  //
+  // TODO: Move this to a separate pass.
+  HInstruction* allocation_inst = constructor_fence->GetAssociatedAllocation();
+  if (allocation_inst != nullptr && allocation_inst->IsNewInstance()) {
+    HNewInstance* new_inst = allocation_inst->AsNewInstance();
+    // This relies on the entrypoint already being set to the more optimized version;
+    // as that happens in this pass, this redundancy removal also cannot happen any earlier.
+    if (new_inst != nullptr && new_inst->GetEntrypoint() == kQuickAllocObjectResolved) {
+      // If this was done in an earlier pass, we would want to match that `previous` was an input
+      // to the `constructor_fence`. However, since this pass removes the inputs to the fence,
+      // we can ignore the inputs and just remove the instruction from its block.
+      DCHECK_EQ(1u, constructor_fence->InputCount());
+      // TODO: GetAssociatedAllocation should not care about multiple inputs
+      // if we are in prepare_for_register_allocation pass only.
+      constructor_fence->GetBlock()->RemoveInstruction(constructor_fence);
+      return;
+      // TODO: actually remove the dmb from the .S entrypoints (initialized variants only).
+    }
+
+    // HNewArray does not need this check because the art_quick_alloc_array does not itself
+    // have a dmb in any normal situation (i.e. the array class is never exactly in the
+    // "resolved" state). If the array class is not yet loaded, it will always go from
+    // Unloaded->Initialized state.
+  }
+
+  // Remove all the inputs to the constructor fence;
   // they aren't used by the InstructionCodeGenerator and this lets us avoid creating a
   // LocationSummary in the LocationsBuilder.
   constructor_fence->RemoveAllInputs();