summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author TreeHugger Robot <treehugger-gerrit@google.com> 2017-10-25 02:47:06 +0000
committer Android (Google) Code Review <android-gerrit@google.com> 2017-10-25 02:47:06 +0000
commit 3a00b6ff91f0a582e09e9840c1c48c4c16057b51 (patch)
tree 4595768b65a329f6ec5f138bdd1559d09fff5490
parent 0e364f64da7c5dcefb67b4897ddb9fd41d766cdc (diff)
parent 28c5b4d50889a52fa769652704c899b5e4e570b9 (diff)
Merge "Fallback hyphenation for minority Indic languages"
-rw-r--r-- core/jni/android_text_Hyphenator.cpp 113
1 files changed, 62 insertions, 51 deletions
diff --git a/core/jni/android_text_Hyphenator.cpp b/core/jni/android_text_Hyphenator.cpp
index 05bec28a5d39..6f9cc22fb3ab 100644
--- a/core/jni/android_text_Hyphenator.cpp
+++ b/core/jni/android_text_Hyphenator.cpp
@@ -82,45 +82,45 @@ static void init() {
constexpr int INDIC_MIN_PREFIX = 2;
constexpr int INDIC_MIN_SUFFIX = 2;
- addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
- addHyphenator("be", 2, 2); // Belarusian
- addHyphenator("bg", 2, 2); // Bulgarian
- addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
- addHyphenator("cu", 1, 2); // Church Slavonic
- addHyphenator("cy", 2, 3); // Welsh
- addHyphenator("da", 2, 2); // Danish
- addHyphenator("de-1901", 2, 2); // German 1901 orthography
- addHyphenator("de-1996", 2, 2); // German 1996 orthography
- addHyphenator("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
- addHyphenator("en-GB", 2, 3); // British English
- addHyphenator("en-US", 2, 3); // American English
- addHyphenator("es", 2, 2); // Spanish
- addHyphenator("et", 2, 3); // Estonian
- addHyphenator("eu", 2, 2); // Basque
- addHyphenator("fr", 2, 3); // French
- addHyphenator("ga", 2, 3); // Irish
- addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
- addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
- addHyphenator("hr", 2, 2); // Croatian
- addHyphenator("hu", 2, 2); // Hungarian
+ addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
+ addHyphenator("be", 2, 2); // Belarusian
+ addHyphenator("bg", 2, 2); // Bulgarian
+ addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
+ addHyphenator("cu", 1, 2); // Church Slavonic
+ addHyphenator("cy", 2, 3); // Welsh
+ addHyphenator("da", 2, 2); // Danish
+ addHyphenator("de-1901", 2, 2); // German 1901 orthography
+ addHyphenator("de-1996", 2, 2); // German 1996 orthography
+ addHyphenator("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
+ addHyphenator("en-GB", 2, 3); // British English
+ addHyphenator("en-US", 2, 3); // American English
+ addHyphenator("es", 2, 2); // Spanish
+ addHyphenator("et", 2, 3); // Estonian
+ addHyphenator("eu", 2, 2); // Basque
+ addHyphenator("fr", 2, 3); // French
+ addHyphenator("ga", 2, 3); // Irish
+ addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
+ addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
+ addHyphenator("hr", 2, 2); // Croatian
+ addHyphenator("hu", 2, 2); // Hungarian
// texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
// Going with a more conservative value of (2, 2) for now.
- addHyphenator("hy", 2, 2); // Armenian
- addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
- addHyphenator("la", 2, 2); // Latin
- addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
- addHyphenator("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
- addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
- addHyphenator("nb", 2, 2); // Norwegian Bokmål
- addHyphenator("nn", 2, 2); // Norwegian Nynorsk
- addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
- addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
- addHyphenator("pt", 2, 3); // Portuguese
- addHyphenator("sl", 2, 2); // Slovenian
- addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
- addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
- addHyphenator("tk", 2, 2); // Turkmen
- addHyphenator("und-Ethi", 1, 1); // Any language in Ethiopic script
+ addHyphenator("hy", 2, 2); // Armenian
+ addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
+ addHyphenator("la", 2, 2); // Latin
+ addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
+ addHyphenator("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
+ addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
+ addHyphenator("nb", 2, 2); // Norwegian Bokmål
+ addHyphenator("nn", 2, 2); // Norwegian Nynorsk
+ addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
+ addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
+ addHyphenator("pt", 2, 3); // Portuguese
+ addHyphenator("sl", 2, 2); // Slovenian
+ addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
+ addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
+ addHyphenator("tk", 2, 2); // Turkmen
+ addHyphenator("und-Ethi", 1, 1); // Any language in Ethiopic script
// Following two hyphenators do not have pattern files but there is some special logic based on
// language.
@@ -130,13 +130,13 @@ static void init() {
// English locales that fall back to en-US. The data is from CLDR. It's all English locales,
// minus the locales whose parent is en-001 (from supplementalData.xml, under <parentLocales>).
// TODO: Figure out how to get this from ICU.
- addHyphenatorAlias("en-AS", "en-US"); // English (American Samoa)
- addHyphenatorAlias("en-GU", "en-US"); // English (Guam)
- addHyphenatorAlias("en-MH", "en-US"); // English (Marshall Islands)
- addHyphenatorAlias("en-MP", "en-US"); // English (Northern Mariana Islands)
- addHyphenatorAlias("en-PR", "en-US"); // English (Puerto Rico)
- addHyphenatorAlias("en-UM", "en-US"); // English (United States Minor Outlying Islands)
- addHyphenatorAlias("en-VI", "en-US"); // English (Virgin Islands)
+ addHyphenatorAlias("en-AS", "en-US"); // English (American Samoa)
+ addHyphenatorAlias("en-GU", "en-US"); // English (Guam)
+ addHyphenatorAlias("en-MH", "en-US"); // English (Marshall Islands)
+ addHyphenatorAlias("en-MP", "en-US"); // English (Northern Mariana Islands)
+ addHyphenatorAlias("en-PR", "en-US"); // English (Puerto Rico)
+ addHyphenatorAlias("en-UM", "en-US"); // English (United States Minor Outlying Islands)
+ addHyphenatorAlias("en-VI", "en-US"); // English (Virgin Islands)
// All English locales other than those falling back to en-US are mapped to en-GB.
addHyphenatorAlias("en", "en-GB");
@@ -150,17 +150,28 @@ static void init() {
addHyphenatorAlias("no", "nb");
// Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
- addHyphenatorAlias("mn", "mn-Cyrl"); // Mongolian
+ addHyphenatorAlias("mn", "mn-Cyrl"); // Mongolian
// Fall back to Ethiopic script for languages likely to be written in Ethiopic.
// Data is from CLDR's likelySubtags.xml.
// TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
- addHyphenatorAlias("am", "und-Ethi"); // Amharic
- addHyphenatorAlias("byn", "und-Ethi"); // Blin
- addHyphenatorAlias("gez", "und-Ethi"); // Geʻez
- addHyphenatorAlias("ti", "und-Ethi"); // Tigrinya
- addHyphenatorAlias("wal", "und-Ethi"); // Wolaytta
-
+ addHyphenatorAlias("am", "und-Ethi"); // Amharic
+ addHyphenatorAlias("byn", "und-Ethi"); // Blin
+ addHyphenatorAlias("gez", "und-Ethi"); // Geʻez
+ addHyphenatorAlias("ti", "und-Ethi"); // Tigrinya
+ addHyphenatorAlias("wal", "und-Ethi"); // Wolaytta
+
+ // Use Hindi as a fallback hyphenator for all languages written in Devanagari, etc. This makes
+ // sense because our Indic patterns are not really linguistic, but script-based.
+ addHyphenatorAlias("und-Beng", "bn"); // Bengali
+ addHyphenatorAlias("und-Deva", "hi"); // Devanagari -> Hindi
+ addHyphenatorAlias("und-Gujr", "gu"); // Gujarati
+ addHyphenatorAlias("und-Guru", "pa"); // Gurmukhi -> Punjabi
+ addHyphenatorAlias("und-Knda", "kn"); // Kannada
+ addHyphenatorAlias("und-Mlym", "ml"); // Malayalam
+ addHyphenatorAlias("und-Orya", "or"); // Oriya
+ addHyphenatorAlias("und-Taml", "ta"); // Tamil
+ addHyphenatorAlias("und-Telu", "te"); // Telugu
}
static const JNINativeMethod gMethods[] = {