ART: Optimize StringBuilder append pattern.

Recognize appending with StringBuilder and replace the
entire expression with a runtime call that perfoms the
append in a more efficient manner.

For now, require the entire pattern to be in a single block
and be very strict about the StringBuilder environment uses.
Also, do not accept StringBuilder/char[]/Object/float/double
arguments as they throw non-OOME exceptions and/or require a
call from the entrypoint back to a helper function in Java;
these shall be implemented later.

Boot image size for aosp_taimen-userdebug:
 - before:
   arm/boot*.oat: 19653872
   arm64/boot*.oat: 23292784
   oat/arm64/services.odex: 22408664
 - after:
   arm/boot*.oat: 19432184 (-216KiB)
   arm64/boot*.oat: 22992488 (-293KiB)
   oat/arm64/services.odex: 22376776 (-31KiB)
Note that const-string in compiled boot image methods cannot
throw, but for apps it can and therefore its environment can
prevent the optimization for apps. We could implement either
a simple carve-out for const-string or generic environment
pruning to allow this pattern to be applied more often.

Results for the new StringBuilderAppendBenchmark on taimen:
  timeAppendLongStrings: ~700ns -> ~200ns
  timeAppendStringAndInt: ~220ns -> ~140ns
  timeAppendStrings: ~200ns -> 130ns

Bug: 19575890
Test: 697-checker-string-append
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: vogar --benchmark art/benchmark/stringbuilder-append/src/StringBuilderAppendBenchmark.java
Change-Id: I51789bf299f5219f68ada4c077b6a1d3fe083964
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 2bbb570..3b5699b 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -64,6 +64,7 @@
 #include "ssa_liveness_analysis.h"
 #include "stack_map.h"
 #include "stack_map_stream.h"
+#include "string_builder_append.h"
 #include "thread-current-inl.h"
 #include "utils/assembler.h"
 
@@ -599,6 +600,57 @@
   InvokeRuntime(entrypoint, invoke, invoke->GetDexPc(), nullptr);
 }
 
+void CodeGenerator::CreateStringBuilderAppendLocations(HStringBuilderAppend* instruction,
+                                                       Location out) {
+  ArenaAllocator* allocator = GetGraph()->GetAllocator();
+  LocationSummary* locations =
+      new (allocator) LocationSummary(instruction, LocationSummary::kCallOnMainOnly);
+  locations->SetOut(out);
+  instruction->GetLocations()->SetInAt(instruction->FormatIndex(),
+                                       Location::ConstantLocation(instruction->GetFormat()));
+
+  uint32_t format = static_cast<uint32_t>(instruction->GetFormat()->GetValue());
+  uint32_t f = format;
+  PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet());
+  size_t stack_offset = static_cast<size_t>(pointer_size);  // Start after the ArtMethod*.
+  for (size_t i = 0, num_args = instruction->GetNumberOfArguments(); i != num_args; ++i) {
+    StringBuilderAppend::Argument arg_type =
+        static_cast<StringBuilderAppend::Argument>(f & StringBuilderAppend::kArgMask);
+    switch (arg_type) {
+      case StringBuilderAppend::Argument::kStringBuilder:
+      case StringBuilderAppend::Argument::kString:
+      case StringBuilderAppend::Argument::kCharArray:
+        static_assert(sizeof(StackReference<mirror::Object>) == sizeof(uint32_t), "Size check.");
+        FALLTHROUGH_INTENDED;
+      case StringBuilderAppend::Argument::kBoolean:
+      case StringBuilderAppend::Argument::kChar:
+      case StringBuilderAppend::Argument::kInt:
+      case StringBuilderAppend::Argument::kFloat:
+        locations->SetInAt(i, Location::StackSlot(stack_offset));
+        break;
+      case StringBuilderAppend::Argument::kLong:
+      case StringBuilderAppend::Argument::kDouble:
+        stack_offset = RoundUp(stack_offset, sizeof(uint64_t));
+        locations->SetInAt(i, Location::DoubleStackSlot(stack_offset));
+        // Skip the low word, let the common code skip the high word.
+        stack_offset += sizeof(uint32_t);
+        break;
+      default:
+        LOG(FATAL) << "Unexpected arg format: 0x" << std::hex
+            << (f & StringBuilderAppend::kArgMask) << " full format: 0x" << format;
+        UNREACHABLE();
+    }
+    f >>= StringBuilderAppend::kBitsPerArg;
+    stack_offset += sizeof(uint32_t);
+  }
+  DCHECK_EQ(f, 0u);
+
+  size_t param_size = stack_offset - static_cast<size_t>(pointer_size);
+  DCHECK_ALIGNED(param_size, kVRegSize);
+  size_t num_vregs = param_size / kVRegSize;
+  graph_->UpdateMaximumNumberOfOutVRegs(num_vregs);
+}
+
 void CodeGenerator::CreateUnresolvedFieldLocationSummary(
     HInstruction* field_access,
     DataType::Type field_type,
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index f70ecb6..357c4bb 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -546,6 +546,8 @@
 
   void GenerateInvokeCustomCall(HInvokeCustom* invoke);
 
+  void CreateStringBuilderAppendLocations(HStringBuilderAppend* instruction, Location out);
+
   void CreateUnresolvedFieldLocationSummary(
       HInstruction* field_access,
       DataType::Type field_type,
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 3086882..28a7583 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -5411,6 +5411,15 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderARM64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, LocationFrom(x0));
+}
+
+void InstructionCodeGeneratorARM64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ Mov(w0, instruction->GetFormat()->GetValue());
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderARM64::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionARM64 calling_convention;
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 6469c69..7adfe5d 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -5722,6 +5722,15 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderARMVIXL::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, LocationFrom(r0));
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ Mov(r0, instruction->GetFormat()->GetValue());
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderARMVIXL::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionARMVIXL calling_convention;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 72334af..b3bce78 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -9529,6 +9529,15 @@
                  instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderMIPS::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, Location::RegisterLocation(V0));
+}
+
+void InstructionCodeGeneratorMIPS::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ LoadConst32(A0, instruction->GetFormat()->GetValue());
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderMIPS::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionMIPS calling_convention;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 0d3cb3b..0a3a3fa 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -7148,6 +7148,15 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderMIPS64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, Location::RegisterLocation(V0));
+}
+
+void InstructionCodeGeneratorMIPS64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ LoadConst32(A0, instruction->GetFormat()->GetValue());
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderMIPS64::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionMIPS64 calling_convention;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index ca1723b..7a73569 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -5510,6 +5510,15 @@
   HandleFieldGet(instruction, instruction->GetFieldInfo());
 }
 
+void LocationsBuilderX86::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, Location::RegisterLocation(EAX));
+}
+
+void InstructionCodeGeneratorX86::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ movl(EAX, Immediate(instruction->GetFormat()->GetValue()));
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderX86::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionX86 calling_convention;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 7c293b8..ee2ef77 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -4882,6 +4882,15 @@
   HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull());
 }
 
+void LocationsBuilderX86_64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  codegen_->CreateStringBuilderAppendLocations(instruction, Location::RegisterLocation(RAX));
+}
+
+void InstructionCodeGeneratorX86_64::VisitStringBuilderAppend(HStringBuilderAppend* instruction) {
+  __ movl(CpuRegister(RDI), Immediate(instruction->GetFormat()->GetValue()));
+  codegen_->InvokeRuntime(kQuickStringBuilderAppend, instruction, instruction->GetDexPc());
+}
+
 void LocationsBuilderX86_64::VisitUnresolvedInstanceFieldGet(
     HUnresolvedInstanceFieldGet* instruction) {
   FieldAccessCallingConventionX86_64 calling_convention;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index e829576..9cc5cad 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -25,6 +25,7 @@
 #include "mirror/class-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "sharpening.h"
+#include "string_builder_append.h"
 
 namespace art {
 
@@ -2479,6 +2480,186 @@
   return false;
 }
 
+static bool TryReplaceStringBuilderAppend(HInvoke* invoke) {
+  DCHECK_EQ(invoke->GetIntrinsic(), Intrinsics::kStringBuilderToString);
+  if (invoke->CanThrowIntoCatchBlock()) {
+    return false;
+  }
+
+  HBasicBlock* block = invoke->GetBlock();
+  HInstruction* sb = invoke->InputAt(0);
+
+  // We support only a new StringBuilder, otherwise we cannot ensure that
+  // the StringBuilder data does not need to be populated for other users.
+  if (!sb->IsNewInstance()) {
+    return false;
+  }
+
+  // For now, we support only single-block recognition.
+  // (Ternary operators feeding the append could be implemented.)
+  for (const HUseListNode<HInstruction*>& use : sb->GetUses()) {
+    if (use.GetUser()->GetBlock() != block) {
+      return false;
+    }
+  }
+
+  // Collect args and check for unexpected uses.
+  // We expect one call to a constructor with no arguments, one constructor fence (unless
+  // eliminated), some number of append calls and one call to StringBuilder.toString().
+  bool seen_constructor = false;
+  bool seen_constructor_fence = false;
+  bool seen_to_string = false;
+  uint32_t format = 0u;
+  uint32_t num_args = 0u;
+  HInstruction* args[StringBuilderAppend::kMaxArgs];  // Added in reverse order.
+  for (const HUseListNode<HInstruction*>& use : sb->GetUses()) {
+    // The append pattern uses the StringBuilder only as the first argument.
+    if (use.GetIndex() != 0u) {
+      return false;
+    }
+    // We see the uses in reverse order because they are inserted at the front
+    // of the singly-linked list, so the StringBuilder.append() must come first.
+    HInstruction* user = use.GetUser();
+    if (!seen_to_string) {
+      if (user->IsInvokeVirtual() &&
+          user->AsInvokeVirtual()->GetIntrinsic() == Intrinsics::kStringBuilderToString) {
+        seen_to_string = true;
+        continue;
+      } else {
+        return false;
+      }
+    }
+    // Then we should see the arguments.
+    if (user->IsInvokeVirtual()) {
+      HInvokeVirtual* as_invoke_virtual = user->AsInvokeVirtual();
+      DCHECK(!seen_constructor);
+      DCHECK(!seen_constructor_fence);
+      StringBuilderAppend::Argument arg;
+      switch (as_invoke_virtual->GetIntrinsic()) {
+        case Intrinsics::kStringBuilderAppendObject:
+          // TODO: Unimplemented, needs to call String.valueOf().
+          return false;
+        case Intrinsics::kStringBuilderAppendString:
+          arg = StringBuilderAppend::Argument::kString;
+          break;
+        case Intrinsics::kStringBuilderAppendCharArray:
+          // TODO: Unimplemented, StringBuilder.append(char[]) can throw NPE and we would
+          // not have the correct stack trace for it.
+          return false;
+        case Intrinsics::kStringBuilderAppendBoolean:
+          arg = StringBuilderAppend::Argument::kBoolean;
+          break;
+        case Intrinsics::kStringBuilderAppendChar:
+          arg = StringBuilderAppend::Argument::kChar;
+          break;
+        case Intrinsics::kStringBuilderAppendInt:
+          arg = StringBuilderAppend::Argument::kInt;
+          break;
+        case Intrinsics::kStringBuilderAppendLong:
+          arg = StringBuilderAppend::Argument::kLong;
+          break;
+        case Intrinsics::kStringBuilderAppendCharSequence: {
+          ReferenceTypeInfo rti = user->AsInvokeVirtual()->InputAt(1)->GetReferenceTypeInfo();
+          if (!rti.IsValid()) {
+            return false;
+          }
+          ScopedObjectAccess soa(Thread::Current());
+          Handle<mirror::Class> input_type = rti.GetTypeHandle();
+          DCHECK(input_type != nullptr);
+          if (input_type.Get() == GetClassRoot<mirror::String>()) {
+            arg = StringBuilderAppend::Argument::kString;
+          } else {
+            // TODO: Check and implement for StringBuilder. We could find the StringBuilder's
+            // internal char[] inconsistent with the length, or the string compression
+            // of the result could be compromised with a concurrent modification, and
+            // we would need to throw appropriate exceptions.
+            return false;
+          }
+          break;
+        }
+        case Intrinsics::kStringBuilderAppendFloat:
+        case Intrinsics::kStringBuilderAppendDouble:
+          // TODO: Unimplemented, needs to call FloatingDecimal.getBinaryToASCIIConverter().
+          return false;
+        default: {
+          return false;
+        }
+      }
+      // Uses of the append return value should have been replaced with the first input.
+      DCHECK(!as_invoke_virtual->HasUses());
+      DCHECK(!as_invoke_virtual->HasEnvironmentUses());
+      if (num_args == StringBuilderAppend::kMaxArgs) {
+        return false;
+      }
+      format = (format << StringBuilderAppend::kBitsPerArg) | static_cast<uint32_t>(arg);
+      args[num_args] = as_invoke_virtual->InputAt(1u);
+      ++num_args;
+    } else if (user->IsInvokeStaticOrDirect() &&
+               user->AsInvokeStaticOrDirect()->GetResolvedMethod() != nullptr &&
+               user->AsInvokeStaticOrDirect()->GetResolvedMethod()->IsConstructor() &&
+               user->AsInvokeStaticOrDirect()->GetNumberOfArguments() == 1u) {
+      // After arguments, we should see the constructor.
+      // We accept only the constructor with no extra arguments.
+      DCHECK(!seen_constructor);
+      DCHECK(!seen_constructor_fence);
+      seen_constructor = true;
+    } else if (user->IsConstructorFence()) {
+      // The last use we see is the constructor fence.
+      DCHECK(seen_constructor);
+      DCHECK(!seen_constructor_fence);
+      seen_constructor_fence = true;
+    } else {
+      return false;
+    }
+  }
+
+  if (num_args == 0u) {
+    return false;
+  }
+
+  // Check environment uses.
+  for (const HUseListNode<HEnvironment*>& use : sb->GetEnvUses()) {
+    HInstruction* holder = use.GetUser()->GetHolder();
+    if (holder->GetBlock() != block) {
+      return false;
+    }
+    // Accept only calls on the StringBuilder (which shall all be removed).
+    // TODO: Carve-out for const-string? Or rely on environment pruning (to be implemented)?
+    if (holder->InputCount() == 0 || holder->InputAt(0) != sb) {
+      return false;
+    }
+  }
+
+  // Create replacement instruction.
+  HIntConstant* fmt = block->GetGraph()->GetIntConstant(static_cast<int32_t>(format));
+  ArenaAllocator* allocator = block->GetGraph()->GetAllocator();
+  HStringBuilderAppend* append =
+      new (allocator) HStringBuilderAppend(fmt, num_args, allocator, invoke->GetDexPc());
+  append->SetReferenceTypeInfo(invoke->GetReferenceTypeInfo());
+  for (size_t i = 0; i != num_args; ++i) {
+    append->SetArgumentAt(i, args[num_args - 1u - i]);
+  }
+  block->InsertInstructionBefore(append, invoke);
+  invoke->ReplaceWith(append);
+  // Copy environment, except for the StringBuilder uses.
+  for (size_t i = 0, size = invoke->GetEnvironment()->Size(); i != size; ++i) {
+    if (invoke->GetEnvironment()->GetInstructionAt(i) == sb) {
+      invoke->GetEnvironment()->RemoveAsUserOfInput(i);
+      invoke->GetEnvironment()->SetRawEnvAt(i, nullptr);
+    }
+  }
+  append->CopyEnvironmentFrom(invoke->GetEnvironment());
+  // Remove the old instruction.
+  block->RemoveInstruction(invoke);
+  // Remove the StringBuilder's uses and StringBuilder.
+  while (sb->HasNonEnvironmentUses()) {
+    block->RemoveInstruction(sb->GetUses().front().GetUser());
+  }
+  DCHECK(!sb->HasEnvironmentUses());
+  block->RemoveInstruction(sb);
+  return true;
+}
+
 // Certain allocation intrinsics are not removed by dead code elimination
 // because of potentially throwing an OOM exception or other side effects.
 // This method removes such intrinsics when special circumstances allow.
@@ -2493,6 +2674,9 @@
       invoke->GetBlock()->RemoveInstruction(invoke);
       RecordSimplification();
     }
+  } else if (invoke->GetIntrinsic() == Intrinsics::kStringBuilderToString &&
+             TryReplaceStringBuilderAppend(invoke)) {
+    RecordSimplification();
   }
 }
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 8a8b371..cb53ae3 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1438,6 +1438,7 @@
   M(Shr, BinaryOperation)                                               \
   M(StaticFieldGet, Instruction)                                        \
   M(StaticFieldSet, Instruction)                                        \
+  M(StringBuilderAppend, Instruction)                                   \
   M(UnresolvedInstanceFieldGet, Instruction)                            \
   M(UnresolvedInstanceFieldSet, Instruction)                            \
   M(UnresolvedStaticFieldGet, Instruction)                              \
@@ -6889,6 +6890,55 @@
   const FieldInfo field_info_;
 };
 
+class HStringBuilderAppend final : public HVariableInputSizeInstruction {
+ public:
+  HStringBuilderAppend(HIntConstant* format,
+                       uint32_t number_of_arguments,
+                       ArenaAllocator* allocator,
+                       uint32_t dex_pc)
+      : HVariableInputSizeInstruction(
+            kStringBuilderAppend,
+            DataType::Type::kReference,
+            // The runtime call may read memory from inputs. It never writes outside
+            // of the newly allocated result object (or newly allocated helper objects).
+            SideEffects::AllReads().Union(SideEffects::CanTriggerGC()),
+            dex_pc,
+            allocator,
+            number_of_arguments + /* format */ 1u,
+            kArenaAllocInvokeInputs) {
+    DCHECK_GE(number_of_arguments, 1u);  // There must be something to append.
+    SetRawInputAt(FormatIndex(), format);
+  }
+
+  void SetArgumentAt(size_t index, HInstruction* argument) {
+    DCHECK_LE(index, GetNumberOfArguments());
+    SetRawInputAt(index, argument);
+  }
+
+  // Return the number of arguments, excluding the format.
+  size_t GetNumberOfArguments() const {
+    DCHECK_GE(InputCount(), 1u);
+    return InputCount() - 1u;
+  }
+
+  size_t FormatIndex() const {
+    return GetNumberOfArguments();
+  }
+
+  HIntConstant* GetFormat() {
+    return InputAt(FormatIndex())->AsIntConstant();
+  }
+
+  bool NeedsEnvironment() const override { return true; }
+
+  bool CanThrow() const override { return true; }
+
+  DECLARE_INSTRUCTION(StringBuilderAppend);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(StringBuilderAppend);
+};
+
 class HUnresolvedInstanceFieldGet final : public HExpression<1> {
  public:
   HUnresolvedInstanceFieldGet(HInstruction* obj,