diff options
| author | 2024-06-11 10:29:27 +0000 | |
|---|---|---|
| committer | 2024-06-11 10:29:27 +0000 | |
| commit | 6997f294fc738eded08a07a40d3109056391a236 (patch) | |
| tree | 63fb37539e03a253977dc1a86ea6aedfa3cdd7b9 | |
| parent | 2db2c27d7eac3ef9454994e9ee8332ef6000199a (diff) | |
| parent | e1d909d3e96a884e78535dd13bd39dc80e7ec406 (diff) | |
Merge "Fix frameworks/base/tools/localedata/extract_icu_data.py" into main am: e1dee26b09 am: e1d909d3e9
Original change: https://android-review.googlesource.com/c/platform/frameworks/base/+/3103181
Change-Id: I297aca15b22e9e91f65a9055fe7f348f897b20f5
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
| -rwxr-xr-x | tools/localedata/extract_icu_data.py | 85 |
1 file changed, 45 insertions, 40 deletions
diff --git a/tools/localedata/extract_icu_data.py b/tools/localedata/extract_icu_data.py index 81ac897deae0..8f67fa87adb5 100755 --- a/tools/localedata/extract_icu_data.py +++ b/tools/localedata/extract_icu_data.py @@ -22,6 +22,8 @@ import glob import os.path import sys +import xml.etree.ElementTree as ElementTree + def get_locale_parts(locale): """Split a locale into three parts, for langauge, script, and region.""" @@ -40,42 +42,43 @@ def get_locale_parts(locale): def read_likely_subtags(input_file_name): """Read and parse ICU's likelySubtags.txt.""" - with open(input_file_name) as input_file: - likely_script_dict = { - # Android's additions for pseudo-locales. These internal codes make - # sure that the pseudo-locales would not match other English or - # Arabic locales. (We can't use private-use ISO 15924 codes, since - # they may be used by apps for other purposes.) - "en_XA": "~~~A", - "ar_XB": "~~~B", - # Removed data from later versions of ICU - "ji": "Hebr", # Old code for Yiddish, still used in Java and Android - } - representative_locales = { - # Android's additions - "en_Latn_GB", # representative for en_Latn_001 - "es_Latn_MX", # representative for es_Latn_419 - "es_Latn_US", # representative for es_Latn_419 (not the best idea, - # but Android has been shipping with it for quite a - # while. Fortunately, MX < US, so if both exist, MX - # would be chosen.) 
- } - for line in input_file: - line = line.strip(u' \n\uFEFF') - if line.startswith('//'): - continue - if '{' in line and '}' in line: - from_locale = line[:line.index('{')] - to_locale = line[line.index('"')+1:line.rindex('"')] - from_lang, from_scr, from_region = get_locale_parts(from_locale) - _, to_scr, to_region = get_locale_parts(to_locale) - if from_lang == 'und': - continue # not very useful for our purposes - if from_region is None and to_region not in ['001', 'ZZ']: - representative_locales.add(to_locale) - if from_scr is None: - likely_script_dict[from_locale] = to_scr - return likely_script_dict, frozenset(representative_locales) + likely_script_dict = { + # Android's additions for pseudo-locales. These internal codes make + # sure that the pseudo-locales would not match other English or + # Arabic locales. (We can't use private-use ISO 15924 codes, since + # they may be used by apps for other purposes.) + "en_XA": "~~~A", + "ar_XB": "~~~B", + # Removed data from later versions of ICU + "ji": "Hebr", # Old code for Yiddish, still used in Java and Android + } + representative_locales = { + # Android's additions + "en_Latn_GB", # representative for en_Latn_001 + "es_Latn_MX", # representative for es_Latn_419 + "es_Latn_US", # representative for es_Latn_419 (not the best idea, + # but Android has been shipping with it for quite a + # while. Fortunately, MX < US, so if both exist, MX + # would be chosen.) + } + xml_tree = ElementTree.parse(input_file_name) + likely_subtags = xml_tree.find('likelySubtags') + for child in likely_subtags: + from_locale = child.get('from') + to_locale = child.get('to') + # print(f'from: {from_locale} to: {to_locale}') + from_lang, from_scr, from_region = get_locale_parts(from_locale) + _, to_scr, to_region = get_locale_parts(to_locale) + if to_locale == "FAIL": + continue # "FAIL" cases are not useful here. 
+ if from_lang == 'und': + continue # not very useful for our purposes + if from_region is None and to_region not in ['001', 'ZZ']: + representative_locales.add(to_locale) + if from_scr is None: + likely_script_dict[from_locale] = to_scr + + return likely_script_dict, frozenset(representative_locales) # From packLanguageOrRegion() in ResourceTypes.cpp @@ -86,7 +89,7 @@ def pack_language_or_region(inp, base): elif len(inp) == 2: return ord(inp[0]), ord(inp[1]) else: - assert len(inp) == 3 + assert len(inp) == 3, f'Expects a 3-character string, but "{inp}" ' base = ord(base) first = ord(inp[0]) - base second = ord(inp[1]) - base @@ -161,9 +164,10 @@ def dump_representative_locales(representative_locales): print('});') -def read_and_dump_likely_data(icu_data_dir): +def read_and_dump_likely_data(cldr_source_dir): """Read and dump the likely-script data.""" - likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt') + likely_subtags_txt = os.path.join(cldr_source_dir, + 'common', 'supplemental', 'likelySubtags.xml') likely_script_dict, representative_locales = read_likely_subtags( likely_subtags_txt) @@ -280,10 +284,11 @@ def main(): icu_data_dir = os.path.join( source_root, 'external', 'icu', 'icu4c', 'source', 'data') + cldr_source_dir = os.path.join(source_root, 'external', 'cldr') print('// Auto-generated by %s' % sys.argv[0]) print() - likely_script_dict = read_and_dump_likely_data(icu_data_dir) + likely_script_dict = read_and_dump_likely_data(cldr_source_dir) read_and_dump_parent_data(icu_data_dir, likely_script_dict) |