author Vladimir Marko <vmarko@google.com> 2019-05-21 10:00:15 +0100
committer Vladimír Marko <vmarko@google.com> 2023-01-03 12:31:57 +0000
commit 41de45060710d64b671a0fa001ec187df221359d (patch)
tree acdd772258d4eb671f8aa00876988b3f59cc4d1f /runtime/string_builder_append.cc
parent 890b19bd625be5d0e4a876e3eb11b8b893fb0c13 (diff)
StringBuilder append pattern for float/double.
Results for added benchmarks on blueline-userdebug with cpu frequencies fixed
at 1420800 (cpus 0-3; little) and 1459200 (cpus 4-7; big):

32-bit little (--variant=X32 --invoke-with 'taskset 0f')
  timeAppendStringAndDouble:            ~1260ns -> ~970ns
  timeAppendStringAndFloat:             ~1250ns -> ~940ns
  timeAppendStringAndHugeDouble:        ~4700ns -> ~4690ns (noise)
  timeAppendStringAndHugeFloat:         ~3400ns -> ~3300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~1980ns -> ~1550ns

64-bit little (--variant=X64 --invoke-with 'taskset 0f')
  timeAppendStringAndDouble:            ~1260ns -> ~970ns
  timeAppendStringAndFloat:             ~1260ns -> ~940ns
  timeAppendStringAndHugeDouble:        ~4700ns -> ~4800ns (noise)
  timeAppendStringAndHugeFloat:         ~3300ns -> ~3400ns (noise)
  timeAppendStringDoubleStringAndFloat: ~1970ns -> ~1550ns

32-bit big (--variant=X32 --invoke-with 'taskset f0')
  timeAppendStringAndDouble:            ~580ns -> ~450ns
  timeAppendStringAndFloat:             ~590ns -> ~430ns
  timeAppendStringAndHugeDouble:        ~2500ns -> ~2100ns (noise)
  timeAppendStringAndHugeFloat:         ~1500ns -> ~1300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~880ns -> ~730ns

64-bit big (--variant=X64 --invoke-with 'taskset f0')
  timeAppendStringAndDouble:            ~590ns -> ~450ns
  timeAppendStringAndFloat:             ~590ns -> ~430ns
  timeAppendStringAndHugeDouble:        ~2300ns -> ~2300ns (noise)
  timeAppendStringAndHugeFloat:         ~1500ns -> ~1300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~870ns -> ~730ns

The `timeAppendStringAnd{Double,Float}` benchmarks show very nice improvements,
roughly 25% on both little and big cores. The `timeAppendStringDoubleStringAndFloat`
benchmark also shows decent improvements, over 20% on little and over 15% on big
cores. (These benchmarks test the best-case scenario for "before" as the
StringBuilder's internal buffer is not reallocated.)

The `timeAppendStringAndHuge{Double,Float}` results are too noisy to draw any
conclusions (especially on little cores, but there is still too much noise on
big cores as well).

There are also small regressions for the existing benchmarks
`timeAppend{LongStrings,StringAndInt,Strings}`, but these non-FP regressions
may be mitigated after updating the ThinLTO profile.

There is also an opportunity to optimize the calls back to managed code for a
known shorty (in this change we use "LD" and "LF") by using a dedicated stub
instead of going through the generic invoke stub.

Boot image size changes are insignificant (few matches).

Test: Added tests to 697-checker-string-append
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: testrunner.py --target --optimizing
Bug: 19575890
Change-Id: I9cf38c2d615a0a2b14255d18588a694d8870aae5
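For context, the pattern served by this runtime helper is a StringBuilder append
chain that ends in toString(); the compiler recognizes such chains and replaces
them with a single StringBuilderAppend runtime call, which this change teaches to
handle float/double arguments. Below is a minimal Java sketch of the float/double
case; the class, method, and variable names are illustrative only and are not
taken from the benchmarks or tests referenced above.

  // Illustrative sketch of the kind of code covered by the append pattern.
  public final class StringAppendSketch {
      // A String followed by a double, appended and immediately converted to a
      // String. Append chains of this shape, ending in toString(), are the ones
      // the compiler can route through the StringBuilderAppend runtime helper
      // modified in this change.
      static String appendStringAndDouble(String s, double d) {
          return new StringBuilder().append(s).append(d).toString();
      }

      public static void main(String[] args) {
          System.out.println(appendStringAndDouble("pi = ", 3.14159265358979));
      }
  }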
Diffstat (limited to 'runtime/string_builder_append.cc')
-rw-r--r--  runtime/string_builder_append.cc  162
1 file changed, 158 insertions(+), 4 deletions(-)
diff --git a/runtime/string_builder_append.cc b/runtime/string_builder_append.cc
index 85b70eb12b..ef6969d4d0 100644
--- a/runtime/string_builder_append.cc
+++ b/runtime/string_builder_append.cc
@@ -20,9 +20,11 @@
#include "base/logging.h"
#include "common_throws.h"
#include "gc/heap.h"
+#include "mirror/array-inl.h"
#include "mirror/string-alloc-inl.h"
#include "obj_ptr-inl.h"
#include "runtime.h"
+#include "well_known_classes.h"
namespace art {
@@ -60,6 +62,11 @@ class StringBuilderAppend::Builder {
return new_string->GetLength() - (data - new_string->GetValue());
}
+ template <typename CharType>
+ CharType* AppendFpArg(ObjPtr<mirror::String> new_string,
+ CharType* data,
+ size_t fp_arg_index) const REQUIRES_SHARED(Locks::mutator_lock_);
+
template <typename CharType, size_t size>
static CharType* AppendLiteral(ObjPtr<mirror::String> new_string,
CharType* data,
@@ -75,6 +82,8 @@ class StringBuilderAppend::Builder {
CharType* data,
int64_t value) REQUIRES_SHARED(Locks::mutator_lock_);
+ int32_t ConvertFpArgs() REQUIRES_SHARED(Locks::mutator_lock_);
+
template <typename CharType>
void StoreData(ObjPtr<mirror::String> new_string, CharType* data) const
REQUIRES_SHARED(Locks::mutator_lock_);
@@ -93,6 +102,12 @@ class StringBuilderAppend::Builder {
// References are moved to the handle scope during CalculateLengthWithFlag().
StackHandleScope<kMaxArgs> hs_;
+ // We convert float/double values using jdk.internal.math.FloatingDecimal which uses
+ // a thread-local converter under the hood. As we may have more than one
+ // float/double argument, we need to copy the data out of the converter.
+ uint8_t converted_fp_args_[kMaxArgs][26]; // 26 is the maximum number of characters.
+ int32_t converted_fp_arg_lengths_[kMaxArgs];
+
// The length and flag to store when the AppendBuilder is used as a pre-fence visitor.
int32_t length_with_flag_ = 0u;
};
@@ -142,6 +157,18 @@ inline size_t StringBuilderAppend::Builder::Uint64Length(uint64_t value) {
return log10_value_estimate + adjustment;
}
+template <typename CharType>
+inline CharType* StringBuilderAppend::Builder::AppendFpArg(ObjPtr<mirror::String> new_string,
+ CharType* data,
+ size_t fp_arg_index) const {
+ DCHECK_LE(fp_arg_index, std::size(converted_fp_args_));
+ const uint8_t* src = converted_fp_args_[fp_arg_index];
+ size_t length = converted_fp_arg_lengths_[fp_arg_index];
+ DCHECK_LE(length, std::size(converted_fp_args_[0]));
+ DCHECK_LE(length, RemainingSpace(new_string, data));
+ return std::copy_n(src, length, data);
+}
+
template <typename CharType, size_t size>
inline CharType* StringBuilderAppend::Builder::AppendLiteral(ObjPtr<mirror::String> new_string,
CharType* data,
@@ -204,10 +231,111 @@ inline CharType* StringBuilderAppend::Builder::AppendInt64(ObjPtr<mirror::String
return data + length;
}
+int32_t StringBuilderAppend::Builder::ConvertFpArgs() {
+ int32_t fp_args_length = 0u;
+ const uint32_t* current_arg = args_;
+ size_t fp_arg_index = 0u;
+ for (uint32_t f = format_; f != 0u; f >>= kBitsPerArg) {
+ DCHECK_LE(f & kArgMask, static_cast<uint32_t>(Argument::kLast));
+ bool fp_arg = false;
+ ObjPtr<mirror::Object> converter;
+ switch (static_cast<Argument>(f & kArgMask)) {
+ case Argument::kString:
+ case Argument::kBoolean:
+ case Argument::kChar:
+ case Argument::kInt:
+ break;
+ case Argument::kLong: {
+ current_arg = AlignUp(current_arg, sizeof(int64_t));
+ ++current_arg; // Skip the low word, let the common code skip the high word.
+ break;
+ }
+ case Argument::kFloat: {
+ fp_arg = true;
+ float arg = bit_cast<float>(*current_arg);
+ converter = WellKnownClasses::jdk_internal_math_FloatingDecimal_getBinaryToASCIIConverter_F
+ ->InvokeStatic<'L', 'F'>(hs_.Self(), arg);
+ break;
+ }
+ case Argument::kDouble: {
+ fp_arg = true;
+ current_arg = AlignUp(current_arg, sizeof(int64_t));
+ double arg = bit_cast<double>(
+ static_cast<uint64_t>(current_arg[0]) + (static_cast<uint64_t>(current_arg[1]) << 32));
+ converter = WellKnownClasses::jdk_internal_math_FloatingDecimal_getBinaryToASCIIConverter_D
+ ->InvokeStatic<'L', 'D'>(hs_.Self(), arg);
+ ++current_arg; // Skip the low word, let the common code skip the high word.
+ break;
+ }
+ case Argument::kStringBuilder:
+ case Argument::kCharArray:
+ case Argument::kObject:
+ LOG(FATAL) << "Unimplemented arg format: 0x" << std::hex
+ << (f & kArgMask) << " full format: 0x" << std::hex << format_;
+ UNREACHABLE();
+ default:
+ LOG(FATAL) << "Unexpected arg format: 0x" << std::hex
+ << (f & kArgMask) << " full format: 0x" << std::hex << format_;
+ UNREACHABLE();
+ }
+ if (fp_arg) {
+ // If we see an exception (presumably OOME or SOE), keep it as is, even
+ // though it may be confusing to see the stack trace for FP argument
+ // conversion continue at the StringBuilder.toString() invoke location.
+ DCHECK_EQ(converter == nullptr, hs_.Self()->IsExceptionPending());
+ if (UNLIKELY(converter == nullptr)) {
+ return -1;
+ }
+ ArtField* btab_buffer_field =
+ WellKnownClasses::jdk_internal_math_FloatingDecimal_BinaryToASCIIBuffer_buffer;
+ int32_t length;
+ if (converter->GetClass() == btab_buffer_field->GetDeclaringClass()) {
+ // Call `converter.getChars(converter.buffer)`.
+ StackHandleScope<1u> hs2(hs_.Self());
+ Handle<mirror::CharArray> buffer =
+ hs2.NewHandle(btab_buffer_field->GetObj<mirror::CharArray>(converter));
+ DCHECK(buffer != nullptr);
+ length = WellKnownClasses::jdk_internal_math_FloatingDecimal_BinaryToASCIIBuffer_getChars
+ ->InvokeInstance<'I', 'L'>(hs_.Self(), converter, buffer.Get());
+ if (UNLIKELY(hs_.Self()->IsExceptionPending())) {
+ return -1;
+ }
+ // The converted string is now at the front of the buffer.
+ DCHECK_GT(length, 0);
+ DCHECK_LE(length, buffer->GetLength());
+ DCHECK_LE(static_cast<size_t>(length), std::size(converted_fp_args_[0]));
+ DCHECK(mirror::String::AllASCII(buffer->GetData(), length));
+ std::copy_n(buffer->GetData(), length, converted_fp_args_[fp_arg_index]);
+ } else {
+ ArtField* ebtab_image_field = WellKnownClasses::
+ jdk_internal_math_FloatingDecimal_ExceptionalBinaryToASCIIBuffer_image;
+ DCHECK(converter->GetClass() == ebtab_image_field->GetDeclaringClass());
+ ObjPtr<mirror::String> converted = ebtab_image_field->GetObj<mirror::String>(converter);
+ DCHECK(converted != nullptr);
+ length = converted->GetLength();
+ if (mirror::kUseStringCompression) {
+ DCHECK(converted->IsCompressed());
+ memcpy(converted_fp_args_[fp_arg_index], converted->GetValueCompressed(), length);
+ } else {
+ DCHECK(mirror::String::AllASCII(converted->GetValue(), length));
+ std::copy_n(converted->GetValue(), length, converted_fp_args_[fp_arg_index]);
+ }
+ }
+ converted_fp_arg_lengths_[fp_arg_index] = length;
+ fp_args_length += length;
+ ++fp_arg_index;
+ }
+ ++current_arg;
+ DCHECK_LE(fp_arg_index, kMaxArgs);
+ }
+ return fp_args_length;
+}
+
inline int32_t StringBuilderAppend::Builder::CalculateLengthWithFlag() {
static_assert(static_cast<size_t>(Argument::kEnd) == 0u, "kEnd must be 0.");
bool compressible = mirror::kUseStringCompression;
uint64_t length = 0u;
+ bool has_fp_args = false;
const uint32_t* current_arg = args_;
for (uint32_t f = format_; f != 0u; f >>= kBitsPerArg) {
DCHECK_LE(f & kArgMask, static_cast<uint32_t>(Argument::kLast));
@@ -243,12 +371,19 @@ inline int32_t StringBuilderAppend::Builder::CalculateLengthWithFlag() {
++current_arg; // Skip the low word, let the common code skip the high word.
break;
}
+ case Argument::kDouble:
+ current_arg = AlignUp(current_arg, sizeof(int64_t));
+ ++current_arg; // Skip the low word, let the common code skip the high word.
+ FALLTHROUGH_INTENDED;
+ case Argument::kFloat:
+ // Conversion shall be performed in a separate pass because it calls back to
+ // managed code and we need to convert reference arguments to `Handle<>`s first.
+ has_fp_args = true;
+ break;
case Argument::kStringBuilder:
case Argument::kCharArray:
case Argument::kObject:
- case Argument::kFloat:
- case Argument::kDouble:
LOG(FATAL) << "Unimplemented arg format: 0x" << std::hex
<< (f & kArgMask) << " full format: 0x" << std::hex << format_;
UNREACHABLE();
@@ -261,6 +396,16 @@ inline int32_t StringBuilderAppend::Builder::CalculateLengthWithFlag() {
DCHECK_LE(hs_.NumberOfReferences(), kMaxArgs);
}
+ if (UNLIKELY(has_fp_args)) {
+ // Call Java helpers to convert FP args.
+ int32_t fp_args_length = ConvertFpArgs();
+ if (fp_args_length == -1) {
+ return -1;
+ }
+ DCHECK_GT(fp_args_length, 0);
+ length += fp_args_length;
+ }
+
if (length > std::numeric_limits<int32_t>::max()) {
// We cannot allocate memory for the entire result.
hs_.Self()->ThrowNewException("Ljava/lang/OutOfMemoryError;",
@@ -276,6 +421,7 @@ template <typename CharType>
inline void StringBuilderAppend::Builder::StoreData(ObjPtr<mirror::String> new_string,
CharType* data) const {
size_t handle_index = 0u;
+ size_t fp_arg_index = 0u;
const uint32_t* current_arg = args_;
for (uint32_t f = format_; f != 0u; f >>= kBitsPerArg) {
DCHECK_LE(f & kArgMask, static_cast<uint32_t>(Argument::kLast));
@@ -315,11 +461,18 @@ inline void StringBuilderAppend::Builder::StoreData(ObjPtr<mirror::String> new_s
++current_arg; // Skip the low word, let the common code skip the high word.
break;
}
+ case Argument::kDouble:
+ current_arg = AlignUp(current_arg, sizeof(int64_t));
+ ++current_arg; // Skip the low word, let the common code skip the high word.
+ FALLTHROUGH_INTENDED;
+ case Argument::kFloat: {
+ data = AppendFpArg(new_string, data, fp_arg_index);
+ ++fp_arg_index;
+ break;
+ }
case Argument::kStringBuilder:
case Argument::kCharArray:
- case Argument::kFloat:
- case Argument::kDouble:
LOG(FATAL) << "Unimplemented arg format: 0x" << std::hex
<< (f & kArgMask) << " full format: 0x" << std::hex << format_;
UNREACHABLE();
@@ -330,6 +483,7 @@ inline void StringBuilderAppend::Builder::StoreData(ObjPtr<mirror::String> new_s
}
++current_arg;
DCHECK_LE(handle_index, hs_.NumberOfReferences());
+ DCHECK_LE(fp_arg_index, std::size(converted_fp_args_));
}
DCHECK_EQ(RemainingSpace(new_string, data), 0u) << std::hex << format_;
}