summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Treehugger Robot <android-test-infra-autosubmit@system.gserviceaccount.com> 2024-06-11 10:29:27 +0000
committer Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2024-06-11 10:29:27 +0000
commit6997f294fc738eded08a07a40d3109056391a236 (patch)
tree63fb37539e03a253977dc1a86ea6aedfa3cdd7b9
parent2db2c27d7eac3ef9454994e9ee8332ef6000199a (diff)
parente1d909d3e96a884e78535dd13bd39dc80e7ec406 (diff)
Merge "Fix frameworks/base/tools/localedata/extract_icu_data.py" into main am: e1dee26b09 am: e1d909d3e9
Original change: https://android-review.googlesource.com/c/platform/frameworks/base/+/3103181 Change-Id: I297aca15b22e9e91f65a9055fe7f348f897b20f5 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rwxr-xr-xtools/localedata/extract_icu_data.py85
1 files changed, 45 insertions, 40 deletions
diff --git a/tools/localedata/extract_icu_data.py b/tools/localedata/extract_icu_data.py
index 81ac897deae0..8f67fa87adb5 100755
--- a/tools/localedata/extract_icu_data.py
+++ b/tools/localedata/extract_icu_data.py
@@ -22,6 +22,8 @@ import glob
import os.path
import sys
+import xml.etree.ElementTree as ElementTree
+
def get_locale_parts(locale):
"""Split a locale into three parts, for langauge, script, and region."""
@@ -40,42 +42,43 @@ def get_locale_parts(locale):
def read_likely_subtags(input_file_name):
"""Read and parse ICU's likelySubtags.txt."""
- with open(input_file_name) as input_file:
- likely_script_dict = {
- # Android's additions for pseudo-locales. These internal codes make
- # sure that the pseudo-locales would not match other English or
- # Arabic locales. (We can't use private-use ISO 15924 codes, since
- # they may be used by apps for other purposes.)
- "en_XA": "~~~A",
- "ar_XB": "~~~B",
- # Removed data from later versions of ICU
- "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
- }
- representative_locales = {
- # Android's additions
- "en_Latn_GB", # representative for en_Latn_001
- "es_Latn_MX", # representative for es_Latn_419
- "es_Latn_US", # representative for es_Latn_419 (not the best idea,
- # but Android has been shipping with it for quite a
- # while. Fortunately, MX < US, so if both exist, MX
- # would be chosen.)
- }
- for line in input_file:
- line = line.strip(u' \n\uFEFF')
- if line.startswith('//'):
- continue
- if '{' in line and '}' in line:
- from_locale = line[:line.index('{')]
- to_locale = line[line.index('"')+1:line.rindex('"')]
- from_lang, from_scr, from_region = get_locale_parts(from_locale)
- _, to_scr, to_region = get_locale_parts(to_locale)
- if from_lang == 'und':
- continue # not very useful for our purposes
- if from_region is None and to_region not in ['001', 'ZZ']:
- representative_locales.add(to_locale)
- if from_scr is None:
- likely_script_dict[from_locale] = to_scr
- return likely_script_dict, frozenset(representative_locales)
+ likely_script_dict = {
+ # Android's additions for pseudo-locales. These internal codes make
+ # sure that the pseudo-locales would not match other English or
+ # Arabic locales. (We can't use private-use ISO 15924 codes, since
+ # they may be used by apps for other purposes.)
+ "en_XA": "~~~A",
+ "ar_XB": "~~~B",
+ # Removed data from later versions of ICU
+ "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
+ }
+ representative_locales = {
+ # Android's additions
+ "en_Latn_GB", # representative for en_Latn_001
+ "es_Latn_MX", # representative for es_Latn_419
+ "es_Latn_US", # representative for es_Latn_419 (not the best idea,
+ # but Android has been shipping with it for quite a
+ # while. Fortunately, MX < US, so if both exist, MX
+ # would be chosen.)
+ }
+ xml_tree = ElementTree.parse(input_file_name)
+ likely_subtags = xml_tree.find('likelySubtags')
+ for child in likely_subtags:
+ from_locale = child.get('from')
+ to_locale = child.get('to')
+ # print(f'from: {from_locale} to: {to_locale}')
+ from_lang, from_scr, from_region = get_locale_parts(from_locale)
+ _, to_scr, to_region = get_locale_parts(to_locale)
+ if to_locale == "FAIL":
+ continue # "FAIL" cases are not useful here.
+ if from_lang == 'und':
+ continue # not very useful for our purposes
+ if from_region is None and to_region not in ['001', 'ZZ']:
+ representative_locales.add(to_locale)
+ if from_scr is None:
+ likely_script_dict[from_locale] = to_scr
+
+ return likely_script_dict, frozenset(representative_locales)
# From packLanguageOrRegion() in ResourceTypes.cpp
@@ -86,7 +89,7 @@ def pack_language_or_region(inp, base):
elif len(inp) == 2:
return ord(inp[0]), ord(inp[1])
else:
- assert len(inp) == 3
+ assert len(inp) == 3, f'Expects a 3-character string, but "{inp}" '
base = ord(base)
first = ord(inp[0]) - base
second = ord(inp[1]) - base
@@ -161,9 +164,10 @@ def dump_representative_locales(representative_locales):
print('});')
-def read_and_dump_likely_data(icu_data_dir):
+def read_and_dump_likely_data(cldr_source_dir):
"""Read and dump the likely-script data."""
- likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
+ likely_subtags_txt = os.path.join(cldr_source_dir,
+ 'common', 'supplemental', 'likelySubtags.xml')
likely_script_dict, representative_locales = read_likely_subtags(
likely_subtags_txt)
@@ -280,10 +284,11 @@ def main():
icu_data_dir = os.path.join(
source_root,
'external', 'icu', 'icu4c', 'source', 'data')
+ cldr_source_dir = os.path.join(source_root, 'external', 'cldr')
print('// Auto-generated by %s' % sys.argv[0])
print()
- likely_script_dict = read_and_dump_likely_data(icu_data_dir)
+ likely_script_dict = read_and_dump_likely_data(cldr_source_dir)
read_and_dump_parent_data(icu_data_dir, likely_script_dict)