diff options
Diffstat (limited to 'tools')
| -rwxr-xr-x | tools/localedata/extract_icu_data.py | 142 | 
1 files changed, 112 insertions, 30 deletions
| diff --git a/tools/localedata/extract_icu_data.py b/tools/localedata/extract_icu_data.py index 8f67fa87adb5..ec531275af1c 100755 --- a/tools/localedata/extract_icu_data.py +++ b/tools/localedata/extract_icu_data.py @@ -121,7 +121,7 @@ def pack_to_uint32(locale):  def dump_script_codes(all_scripts):      """Dump the SCRIPT_CODES table.""" -    print('const char SCRIPT_CODES[][4] = {') +    print('constexpr const char SCRIPT_CODES[][4] = {')      for index, script in enumerate(all_scripts):          print("    /* %-2d */ {'%c', '%c', '%c', '%c'}," % (              index, script[0], script[1], script[2], script[3])) @@ -132,15 +132,33 @@ def dump_script_codes(all_scripts):  def dump_script_data(likely_script_dict, all_scripts):      """Dump the script data."""      print() -    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({') +    print('const char* lookupLikelyScript(uint32_t packed_lang_region) {') +    print('    switch(packed_lang_region) {') + +    # partition the mapping by the script code +    parts = {}      for locale in sorted(likely_script_dict.keys()):          script = likely_script_dict[locale] -        print('    {0x%08Xu, %2du}, // %s -> %s' % ( -            pack_to_uint32(locale), -            all_scripts.index(script), -            locale.replace('_', '-'), -            script)) -    print('});') +        if script in parts: +            l = parts[script] +        else: +            l = [] +            parts[script] = l +        l.append(locale) + +    for script in sorted(parts.keys()): +        locales = parts[script] +        for locale in locales: +            print('        case 0x%08Xu: // %s -> %s' % ( +                pack_to_uint32(locale), +                locale.replace('_', '-'), +                script)) +        print('            return SCRIPT_CODES[%2du];' % +              all_scripts.index(script)) +    print('        default:') +    print('            return nullptr;') +    print('     }') +    print('}')  def pack_to_uint64(locale): @@ -152,16 +170,32 @@ def pack_to_uint64(locale):              (ord(script[2]) << 8) |              ord(script[3])) +def pack_script_to_uint32(script): +    """Pack a 4-letter script code into a 32-bit unsigned integer.""" +    return ((ord(script[0]) << 24) | +            (ord(script[1]) << 16) | +            (ord(script[2]) << 8) | +            ord(script[3])) +  def dump_representative_locales(representative_locales):      """Dump the set of representative locales."""      print() -    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({') +    print('bool isLocaleRepresentative(uint32_t language_and_region, const char* script) {') +    print('    const uint64_t packed_locale =') +    print('            ((static_cast<uint64_t>(language_and_region)) << 32u) |') +    print('            (static_cast<uint64_t>(packScript(script)));') +    print('    switch(packed_locale) {')      for locale in sorted(representative_locales): -        print('    0x%08XLLU, // %s' % ( +        print('        case 0x%08XLLU: // %s' % (              pack_to_uint64(locale),              locale)) -    print('});') + +    print('            return true;') +    print('        default:') +    print('            return false;') +    print('    }') +    print('}')  def read_and_dump_likely_data(cldr_source_dir): @@ -182,7 +216,7 @@ def read_and_dump_likely_data(cldr_source_dir):  def escape_script_variable_name(script):      """Escape characters, e.g. '~', in a C++ variable name""" -    return script.replace("~", "_") +    return script.replace("~", "0")  def read_parent_data(icu_data_dir):      """Read locale parent data from ICU data files.""" @@ -225,29 +259,52 @@ def dump_parent_data(script_organized_dict):      """Dump information for parents of locales."""      sorted_scripts = sorted(script_organized_dict.keys())      print() +      for script in sorted_scripts:          parent_dict = script_organized_dict[script] -        print ('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({' -            % escape_script_variable_name(script.upper())) + +        # partition the mapping by the parent's value +        parts = {}          for locale in sorted(parent_dict.keys()):              parent = parent_dict[locale] -            print('    {0x%08Xu, 0x%08Xu}, // %s -> %s' % ( -                pack_to_uint32(locale), -                pack_to_uint32(parent), -                locale.replace('_', '-'), -                parent.replace('_', '-'))) -        print('});') +            if parent in parts: +                l = parts[parent] +            else: +                l = [] +                parts[parent] = l +            l.append(locale) + +        print('static uint32_t find%sParent(uint32_t packed_lang_region) {' % escape_script_variable_name(script)) +        print('    switch(packed_lang_region) {') +        for parent in sorted(parts.keys()): +            locales = parts[parent] +            for locale in locales: +                print('        case 0x%08Xu: // %s -> %s' % ( +                    pack_to_uint32(locale), +                    locale.replace('_', '-'), +                    parent.replace('_', '-'))) + +            print('            return 0x%08Xu;' % pack_to_uint32(parent)) + +        print('        default:') +        print('            return 0;') +        print('    }') +        print('}')          print() -    print('const struct {') -    print('    const char script[4];') -    print('    const std::unordered_map<uint32_t, uint32_t>* map;') -    print('} SCRIPT_PARENTS[] = {') +    print('uint32_t findParentLocalePackedKey(const char* script, uint32_t packed_lang_region) {') +    print('    uint32_t packedScript = packScript(script);') +    print('    switch (packedScript) {') +      for script in sorted_scripts: -        print("    {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % ( -            script[0], script[1], script[2], script[3], -            escape_script_variable_name(script.upper()))) -    print('};') +        print('        case 0x%08Xu: // %s' % (pack_script_to_uint32(script), script)) +        print('            return find%sParent(packed_lang_region);' % +              escape_script_variable_name(script)) + +    print('        default:') +    print('            return 0;') +    print('    }') +    print('}')  def dump_parent_tree_depth(parent_dict): @@ -261,7 +318,9 @@ def dump_parent_tree_depth(parent_dict):          max_depth = max(max_depth, depth)      assert max_depth < 5 # Our algorithms assume small max_depth      print() -    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth) +    print('uint32_t getMaxAncestorTreeDepth() {') +    print('    return %d;' % max_depth) +    print('}')  def read_and_dump_parent_data(icu_data_dir, likely_script_dict): @@ -286,10 +345,33 @@ def main():          'external', 'icu', 'icu4c', 'source', 'data')      cldr_source_dir = os.path.join(source_root, 'external', 'cldr') +    print('''/* + * Copyright (C) 2025 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +''')      print('// Auto-generated by %s' % sys.argv[0]) -    print() +    print(''' +#include <androidfw/LocaleDataLookup.h> + +namespace android { +''')      likely_script_dict = read_and_dump_likely_data(cldr_source_dir)      read_and_dump_parent_data(icu_data_dir, likely_script_dict) +    print() +    print('} // namespace android')  if __name__ == '__main__': |