From 92f5984d2c2cd73b6b9f68c02c147877d1e2fc46 Mon Sep 17 00:00:00 2001 From: Kenny Root Date: Fri, 4 Dec 2009 09:38:48 -0800 Subject: Optional use of UTF-8 strings in resource bundles Allows the use of UTF-8 for packing resources instead of the default of UTF-16 for Java. When strings are extracted from the ResStringPool, they are converted to UTF-16 and the result is cached for subsequent calls. When using aapt to package, add in the "-8" switch to pack the resources using UTF-8. This will result in the value, key, and type strings as well as the compiled XML string values taking significantly less space in the final application package in most scenarios. Change-Id: I129483f8b3d3b1c5869dced05cb525e494a6c83a --- libs/utils/String8.cpp | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) (limited to 'libs/utils/String8.cpp') diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp index e908ec1e87..3a34838711 100644 --- a/libs/utils/String8.cpp +++ b/libs/utils/String8.cpp @@ -208,10 +208,23 @@ static char* allocFromUTF16OrUTF32(const T* in, L len) return getEmptyString(); } -// Note: not dealing with expanding surrogate pairs. static char* allocFromUTF16(const char16_t* in, size_t len) { - return allocFromUTF16OrUTF32(in, len); + if (len == 0) return getEmptyString(); + + const size_t bytes = utf8_length_from_utf16(in, len); + + SharedBuffer* buf = SharedBuffer::alloc(bytes+1); + LOG_ASSERT(buf, "Unable to allocate shared buffer"); + if (buf) { + char* str = (char*)buf->data(); + + utf16_to_utf8(in, len, str, bytes+1); + + return str; + } + + return getEmptyString(); } static char* allocFromUTF32(const char32_t* in, size_t len) @@ -762,6 +775,26 @@ size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) return ret; } +size_t utf8_length_from_utf16(const char16_t *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return 0; + } + size_t ret = 0; + const char16_t* const end = src + src_len; + while (src < end) { + if ((*src & 0xFC00) == 0xD800 && (src + 1) < end + && (*++src & 0xFC00) == 0xDC00) { + // surrogate pairs are always 4 bytes. + ret += 4; + src++; + } else { + ret += android::utf32_to_utf8_bytes((char32_t) *src++); + } + } + return ret; +} + static int32_t utf32_at_internal(const char* cur, size_t *num_read) { const char first_char = *cur; @@ -848,3 +881,33 @@ size_t utf32_to_utf8(const char32_t* src, size_t src_len, } return cur - dst; } + +size_t utf16_to_utf8(const char16_t* src, size_t src_len, + char* dst, size_t dst_len) +{ + if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { + return 0; + } + const char16_t* cur_utf16 = src; + const char16_t* const end_utf16 = src + src_len; + char *cur = dst; + const char* const end = dst + dst_len; + while (cur_utf16 < end_utf16 && cur < end) { + char32_t utf32; + // surrogate pairs + if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) { + utf32 = (*cur_utf16++ - 0xD800) << 10; + utf32 |= *cur_utf16++ - 0xDC00; + utf32 += 0x10000; + } else { + utf32 = (char32_t) *cur_utf16++; + } + size_t len = android::utf32_to_utf8_bytes(utf32); + android::utf32_to_utf8((uint8_t*)cur, utf32, len); + cur += len; + } + if (cur < end) { + *cur = '\0'; + } + return cur - dst; +} -- cgit v1.2.3-59-g8ed1b