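Avoid Modified UTF-8 processing for ASCII strings.

A Modified UTF-8 string with `utf16_length` characters is pure ASCII
exactly when the byte at index `utf16_length` is the terminating NUL,
because every character occupies at least one byte. Use this check to:
 - hash ASCII data directly with ComputeUtf16Hash() (which now casts
   each element to its unsigned type) via the new
   InternTable::Utf8String::Hash() helper,
 - compare against compressed strings with a single memcmp() and
   reject uncompressed strings early when string compression is on,
 - compute the hash once in InternTable::InternStrong(), reuse it for
   the lookup and store it in the newly allocated string, and derive
   the UTF-8 byte length without a full strlen() for ASCII data.
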
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 181943478
Change-Id: I9926a0d3e0160aa56ba7a02922388bb3007aaccb
diff --git a/dex2oat/linker/image_writer.cc b/dex2oat/linker/image_writer.cc
index 0a6566c..331b8b7 100644
--- a/dex2oat/linker/image_writer.cc
+++ b/dex2oat/linker/image_writer.cc
@@ -2019,9 +2019,8 @@
uint32_t utf16_length;
const char* utf8_data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i),
&utf16_length);
- int32_t hash = ComputeUtf16HashFromModifiedUtf8(utf8_data, utf16_length);
- InternTable::Utf8String utf8_string(utf16_length, utf8_data, hash);
- auto intern_it = intern_set.find(utf8_string);
+ int32_t hash = InternTable::Utf8String::Hash(utf16_length, utf8_data);
+ auto intern_it = intern_set.find(InternTable::Utf8String(utf16_length, utf8_data, hash));
if (intern_it != intern_set.end()) {
mirror::String* string = intern_it->Read<kWithoutReadBarrier>();
DCHECK(string != nullptr);
diff --git a/libdexfile/dex/utf.h b/libdexfile/dex/utf.h
index 6949319..35cbf78 100644
--- a/libdexfile/dex/utf.h
+++ b/libdexfile/dex/utf.h
@@ -17,13 +17,14 @@
#ifndef ART_LIBDEXFILE_DEX_UTF_H_
#define ART_LIBDEXFILE_DEX_UTF_H_
-#include "base/macros.h"
-
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <string_view>
+#include <type_traits>
+
+#include "base/macros.h"
/*
* All UTF-8 in art is actually modified UTF-8. Mostly, this distinction
@@ -97,9 +98,13 @@
*/
template<typename MemoryType>
int32_t ComputeUtf16Hash(const MemoryType* chars, size_t char_count) {
+ static_assert(std::is_same_v<MemoryType, char> ||
+ std::is_same_v<MemoryType, uint8_t> ||
+ std::is_same_v<MemoryType, uint16_t>);
+ using UnsignedMemoryType = std::make_unsigned_t<MemoryType>;
uint32_t hash = 0;
while (char_count--) {
- hash = hash * 31 + *chars++;
+ hash = hash * 31 + static_cast<UnsignedMemoryType>(*chars++);
}
return static_cast<int32_t>(hash);
}
diff --git a/runtime/intern_table-inl.h b/runtime/intern_table-inl.h
index 44bdb1f..a1319f1 100644
--- a/runtime/intern_table-inl.h
+++ b/runtime/intern_table-inl.h
@@ -28,6 +28,17 @@
namespace art {
+inline int32_t InternTable::Utf8String::Hash(uint32_t utf16_length, const char* utf8_data) {
+ DCHECK_EQ(utf16_length, CountModifiedUtf8Chars(utf8_data));
+ if (LIKELY(utf8_data[utf16_length] == 0)) {
+ int32_t hash = ComputeUtf16Hash(utf8_data, utf16_length);
+ DCHECK_EQ(hash, ComputeUtf16HashFromModifiedUtf8(utf8_data, utf16_length));
+ return hash;
+ } else {
+ return ComputeUtf16HashFromModifiedUtf8(utf8_data, utf16_length);
+ }
+}
+
inline std::size_t InternTable::StringHash::operator()(const GcRoot<mirror::String>& root) const {
if (kIsDebugBuild) {
Locks::mutator_lock_->AssertSharedHeld(Thread::Current());
@@ -55,19 +66,16 @@
if (a_length != b.GetUtf16Length()) {
return false;
}
+ DCHECK_GE(strlen(b.GetUtf8Data()), a_length);
if (a_string->IsCompressed()) {
- size_t b_byte_count = strlen(b.GetUtf8Data());
- size_t b_utf8_length = CountModifiedUtf8Chars(b.GetUtf8Data(), b_byte_count);
- // Modified UTF-8 single byte character range is 0x01 .. 0x7f
+ // Modified UTF-8 single byte character range is 0x01 .. 0x7f.
// The string compression occurs on regular ASCII with same exact range,
- // not on extended ASCII which up to 0xff
- const bool is_b_regular_ascii = (b_byte_count == b_utf8_length);
- if (is_b_regular_ascii) {
- return memcmp(b.GetUtf8Data(),
- a_string->GetValueCompressed(), a_length * sizeof(uint8_t)) == 0;
- } else {
- return false;
- }
+ // not on extended ASCII which is up to 0xff.
+ return b.GetUtf8Data()[a_length] == 0 &&
+ memcmp(b.GetUtf8Data(), a_string->GetValueCompressed(), a_length * sizeof(uint8_t)) == 0;
+ } else if (mirror::kUseStringCompression && b.GetUtf8Data()[a_length] == 0) {
+ // ASCII string `b` cannot equal non-ASCII `a_string`.
+ return false;
} else {
const uint16_t* a_value = a_string->GetValue();
return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 4da5453..c36bc8a 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -103,12 +103,9 @@
ObjPtr<mirror::String> InternTable::LookupStrong(Thread* self,
uint32_t utf16_length,
const char* utf8_data) {
- DCHECK_EQ(utf16_length, CountModifiedUtf8Chars(utf8_data));
- Utf8String string(utf16_length,
- utf8_data,
- ComputeUtf16HashFromModifiedUtf8(utf8_data, utf16_length));
+ int32_t hash = Utf8String::Hash(utf16_length, utf8_data);
MutexLock mu(self, *Locks::intern_table_lock_);
- return strong_interns_.Find(string);
+ return strong_interns_.Find(Utf8String(utf16_length, utf8_data, hash));
}
ObjPtr<mirror::String> InternTable::LookupWeakLocked(ObjPtr<mirror::String> s) {
@@ -254,16 +251,34 @@
return is_strong ? InsertStrong(s) : InsertWeak(s);
}
-ObjPtr<mirror::String> InternTable::InternStrong(int32_t utf16_length, const char* utf8_data) {
+ObjPtr<mirror::String> InternTable::InternStrong(uint32_t utf16_length, const char* utf8_data) {
DCHECK(utf8_data != nullptr);
+ int32_t hash = Utf8String::Hash(utf16_length, utf8_data);
Thread* self = Thread::Current();
- // Try to avoid allocation.
- ObjPtr<mirror::String> s = LookupStrong(self, utf16_length, utf8_data);
+ ObjPtr<mirror::String> s;
+ {
+ // Try to avoid allocation. If we need to allocate, release the mutex before the allocation.
+ MutexLock mu(self, *Locks::intern_table_lock_);
+ s = strong_interns_.Find(Utf8String(utf16_length, utf8_data, hash));
+ }
if (s != nullptr) {
return s;
}
- return InternStrong(mirror::String::AllocFromModifiedUtf8(
- self, utf16_length, utf8_data));
+ bool is_ascii = (utf8_data[utf16_length] == 0);
+ int32_t utf8_length = utf16_length + (LIKELY(is_ascii) ? 0 : strlen(utf8_data + utf16_length));
+ DCHECK_EQ(static_cast<size_t>(utf8_length), strlen(utf8_data));
+ s = mirror::String::AllocFromModifiedUtf8(self, utf16_length, utf8_data, utf8_length);
+ if (UNLIKELY(s == nullptr)) {
+ self->AssertPendingOOMException();
+ return nullptr;
+ }
+ if (kIsDebugBuild) {
+ int32_t string_hash = s->GetHashCode(); // Implicitly sets the hash code.
+ CHECK_EQ(hash, string_hash);
+ } else {
+ s->SetHashCode(hash);
+ }
+ return InternStrong(s);
}
ObjPtr<mirror::String> InternTable::InternStrong(const char* utf8_data) {
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index c5fe797..ba039cc 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -66,6 +66,8 @@
uint32_t GetUtf16Length() const { return utf16_length_; }
const char* GetUtf8Data() const { return utf8_data_; }
+ static int32_t Hash(uint32_t utf16_length, const char* utf8_data);
+
private:
int32_t hash_;
uint32_t utf16_length_;
@@ -112,7 +114,7 @@
InternTable();
// Interns a potentially new string in the 'strong' table. May cause thread suspension.
- ObjPtr<mirror::String> InternStrong(int32_t utf16_length, const char* utf8_data)
+ ObjPtr<mirror::String> InternStrong(uint32_t utf16_length, const char* utf8_data)
REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
// Only used by image writer. Special version that may not cause thread suspension since the GC
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index 6cb560e..66e0151 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -29,6 +29,7 @@
} // namespace gc
template<class T> class Handle;
+class InternTable;
template<class MirrorType> class ObjPtr;
class StringBuilderAppend;
struct StringOffsets;
@@ -277,6 +278,7 @@
uint8_t value_compressed_[0];
};
+ friend class art::InternTable; // Let `InternTable` call `SetHashCode()`.
friend class art::StringBuilderAppend;
friend struct art::StringOffsets; // for verifying offset information