| #include "pseudolocalize.h" |
| |
| using namespace std; |
| |
| // String basis to generate expansion |
| static const String16 k_expansion_string = String16("one two three " |
| "four five six seven eight nine ten eleven twelve thirteen " |
| "fourteen fiveteen sixteen seventeen nineteen twenty"); |
| |
| // Special unicode characters to override directionality of the words |
| static const String16 k_rlm = String16("\xe2\x80\x8f"); |
| static const String16 k_rlo = String16("\xE2\x80\xae"); |
| static const String16 k_pdf = String16("\xE2\x80\xac"); |
| |
| // Placeholder marks |
| static const String16 k_placeholder_open = String16("\xc2\xbb"); |
| static const String16 k_placeholder_close = String16("\xc2\xab"); |
| |
| static const char16_t k_arg_start = '{'; |
| static const char16_t k_arg_end = '}'; |
| |
| Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m) |
| : mImpl(nullptr), mLastDepth(0) { |
| setMethod(m); |
| } |
| |
| void Pseudolocalizer::setMethod(PseudolocalizationMethod m) { |
| if (mImpl) { |
| delete mImpl; |
| } |
| if (m == PSEUDO_ACCENTED) { |
| mImpl = new PseudoMethodAccent(); |
| } else if (m == PSEUDO_BIDI) { |
| mImpl = new PseudoMethodBidi(); |
| } else { |
| mImpl = new PseudoMethodNone(); |
| } |
| } |
| |
| String16 Pseudolocalizer::text(const String16& text) { |
| String16 out; |
| size_t depth = mLastDepth; |
| size_t lastpos, pos; |
| const size_t length= text.size(); |
| const char16_t* str = text.c_str(); |
| bool escaped = false; |
| for (lastpos = pos = 0; pos < length; pos++) { |
| char16_t c = str[pos]; |
| if (escaped) { |
| escaped = false; |
| continue; |
| } |
| if (c == '\'') { |
| escaped = true; |
| continue; |
| } |
| |
| if (c == k_arg_start) { |
| depth++; |
| } else if (c == k_arg_end && depth) { |
| depth--; |
| } |
| |
| if (mLastDepth != depth || pos == length - 1) { |
| bool pseudo = ((mLastDepth % 2) == 0); |
| size_t nextpos = pos; |
| if (!pseudo || depth == mLastDepth) { |
| nextpos++; |
| } |
| size_t size = nextpos - lastpos; |
| if (size) { |
| String16 chunk = String16(text, size, lastpos); |
| if (pseudo) { |
| chunk = mImpl->text(chunk); |
| } else if (str[lastpos] == k_arg_start && |
| str[nextpos - 1] == k_arg_end) { |
| chunk = mImpl->placeholder(chunk); |
| } |
| out.append(chunk); |
| } |
| if (pseudo && depth < mLastDepth) { // End of message |
| out.append(mImpl->end()); |
| } else if (!pseudo && depth > mLastDepth) { // Start of message |
| out.append(mImpl->start()); |
| } |
| lastpos = nextpos; |
| mLastDepth = depth; |
| } |
| } |
| return out; |
| } |
| |
// Maps an ASCII character to the UTF-8 encoded look-alike used by the accent
// method. Returns a pointer to a static NUL-terminated string, or a null
// pointer when the character has no replacement (which includes 'F', digits,
// whitespace and everything outside the tables below).
static const char*
pseudolocalize_char(const char16_t c)
{
    // Replacements for 'a'..'z', in alphabetical order.
    static const char* const kLower[26] = {
        "\xc3\xa5",     // a
        "\xc9\x93",     // b
        "\xc3\xa7",     // c
        "\xc3\xb0",     // d
        "\xc3\xa9",     // e
        "\xc6\x92",     // f
        "\xc4\x9d",     // g
        "\xc4\xa5",     // h
        "\xc3\xae",     // i
        "\xc4\xb5",     // j
        "\xc4\xb7",     // k
        "\xc4\xbc",     // l
        "\xe1\xb8\xbf", // m
        "\xc3\xb1",     // n
        "\xc3\xb6",     // o
        "\xc3\xbe",     // p
        "\x51",         // q
        "\xc5\x95",     // r
        "\xc5\xa1",     // s
        "\xc5\xa3",     // t
        "\xc3\xbb",     // u
        "\x56",         // v
        "\xc5\xb5",     // w
        "\xd1\x85",     // x
        "\xc3\xbd",     // y
        "\xc5\xbe",     // z
    };
    // Replacements for 'A'..'Z', in alphabetical order.
    static const char* const kUpper[26] = {
        "\xc3\x85",     // A
        "\xce\xb2",     // B
        "\xc3\x87",     // C
        "\xc3\x90",     // D
        "\xc3\x89",     // E
        nullptr,        // F has no replacement
        "\xc4\x9c",     // G
        "\xc4\xa4",     // H
        "\xc3\x8e",     // I
        "\xc4\xb4",     // J
        "\xc4\xb6",     // K
        "\xc4\xbb",     // L
        "\xe1\xb8\xbe", // M
        "\xc3\x91",     // N
        "\xc3\x96",     // O
        "\xc3\x9e",     // P
        "\x71",         // Q
        "\xc5\x94",     // R
        "\xc5\xa0",     // S
        "\xc5\xa2",     // T
        "\xc3\x9b",     // U
        "\xce\xbd",     // V
        "\xc5\xb4",     // W
        "\xc3\x97",     // X
        "\xc3\x9d",     // Y
        "\xc5\xbd",     // Z
    };

    if (c >= 'a' && c <= 'z') {
        return kLower[c - 'a'];
    }
    if (c >= 'A' && c <= 'Z') {
        return kUpper[c - 'A'];
    }
    if (c == '!') {
        return "\xc2\xa1";
    }
    if (c == '?') {
        return "\xc2\xbf";
    }
    if (c == '$') {
        return "\xe2\x82\xac";
    }
    return nullptr;
}
| |
// Returns true when `c` is one of the characters that terminates a '%'
// placeholder for the scanner in PseudoMethodAccent::text(). The 't' prefix
// is not in this set; the caller handles it separately.
static bool is_possible_normal_placeholder_end(const char16_t c) {
    static const char16_t kTerminators[] = {
        's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
        'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
    };
    for (const char16_t candidate : kTerminators) {
        if (candidate == c) {
            return true;
        }
    }
    return false;
}
| |
| static String16 pseudo_generate_expansion(const unsigned int length) { |
| String16 result = k_expansion_string; |
| const char16_t* s = result.c_str(); |
| if (result.size() < length) { |
| result += String16(" "); |
| result += pseudo_generate_expansion(length - result.size()); |
| } else { |
| int ext = 0; |
| // Should contain only whole words, so looking for a space |
| for (unsigned int i = length + 1; i < result.size(); ++i) { |
| ++ext; |
| if (s[i] == ' ') { |
| break; |
| } |
| } |
| // Just keep the first length + ext characters |
| result = String16(result, length + ext); |
| } |
| return result; |
| } |
| |
// Returns true for the three characters treated as word separators here:
// space, horizontal tab and line feed.
static bool is_space(const char16_t c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
            return true;
        default:
            return false;
    }
}
| |
| String16 PseudoMethodAccent::start() { |
| String16 result; |
| if (mDepth == 0) { |
| result = String16(String8("[")); |
| } |
| mWordCount = mLength = 0; |
| mDepth++; |
| return result; |
| } |
| |
| String16 PseudoMethodAccent::end() { |
| String16 result; |
| if (mLength) { |
| result.append(String16(String8(" "))); |
| result.append(pseudo_generate_expansion( |
| mWordCount > 3 ? mLength : mLength / 2)); |
| } |
| mWordCount = mLength = 0; |
| mDepth--; |
| if (mDepth == 0) { |
| result.append(String16(String8("]"))); |
| } |
| return result; |
| } |
| |
| /** |
| * Converts characters so they look like they've been localized. |
| * |
| * Note: This leaves escape sequences untouched so they can later be |
| * processed by ResTable::collectString in the normal way. |
| */ |
| String16 PseudoMethodAccent::text(const String16& source) |
| { |
| const char16_t* s = source.c_str(); |
| String16 result; |
| const size_t I = source.size(); |
| bool lastspace = true; |
| for (size_t i=0; i<I; i++) { |
| char16_t c = s[i]; |
| if (c == '\\') { |
| // Escape syntax, no need to pseudolocalize |
| if (i<I-1) { |
| result += String16("\\"); |
| i++; |
| c = s[i]; |
| switch (c) { |
| case 'u': |
| // this one takes up 5 chars |
| result += String16(s+i, 5); |
| i += 4; |
| break; |
| case 't': |
| case 'n': |
| case '#': |
| case '@': |
| case '?': |
| case '"': |
| case '\'': |
| case '\\': |
| default: |
| result.append(&c, 1); |
| break; |
| } |
| } else { |
| result.append(&c, 1); |
| } |
| } else if (c == '%') { |
| // Placeholder syntax, no need to pseudolocalize |
| String16 chunk; |
| bool end = false; |
| chunk.append(&c, 1); |
| while (!end && i < I) { |
| ++i; |
| c = s[i]; |
| chunk.append(&c, 1); |
| if (is_possible_normal_placeholder_end(c)) { |
| end = true; |
| } else if (c == 't') { |
| ++i; |
| c = s[i]; |
| chunk.append(&c, 1); |
| end = true; |
| } |
| } |
| // Treat chunk as a placeholder unless it ends with %. |
| result += ((c == '%') ? chunk : placeholder(chunk)); |
| } else if (c == '<' || c == '&') { |
| // html syntax, no need to pseudolocalize |
| bool tag_closed = false; |
| while (!tag_closed && i < I) { |
| if (c == '&') { |
| String16 escape_text; |
| escape_text.append(&c, 1); |
| bool end = false; |
| size_t htmlCodePos = i; |
| while (!end && htmlCodePos < I) { |
| ++htmlCodePos; |
| c = s[htmlCodePos]; |
| escape_text.append(&c, 1); |
| // Valid html code |
| if (c == ';') { |
| end = true; |
| i = htmlCodePos; |
| } |
| // Wrong html code |
| else if (!((c == '#' || |
| (c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9')))) { |
| end = true; |
| } |
| } |
| result += escape_text; |
| if (escape_text != String16("<")) { |
| tag_closed = true; |
| } |
| continue; |
| } |
| if (c == '>') { |
| tag_closed = true; |
| result.append(&c, 1); |
| continue; |
| } |
| result.append(&c, 1); |
| i++; |
| c = s[i]; |
| } |
| } else { |
| // This is a pure text that should be pseudolocalized |
| const char* p = pseudolocalize_char(c); |
| if (p != NULL) { |
| result += String16(p); |
| } else { |
| bool space = is_space(c); |
| if (lastspace && !space) { |
| mWordCount++; |
| } |
| lastspace = space; |
| result.append(&c, 1); |
| } |
| // Count only pseudolocalizable chars and delimiters |
| mLength++; |
| } |
| } |
| return result; |
| } |
| String16 PseudoMethodAccent::placeholder(const String16& source) { |
| // Surround a placeholder with brackets |
| return k_placeholder_open + source + k_placeholder_close; |
| } |
| |
| String16 PseudoMethodBidi::text(const String16& source) |
| { |
| const char16_t* s = source.c_str(); |
| String16 result; |
| bool lastspace = true; |
| bool space = true; |
| bool escape = false; |
| const char16_t ESCAPE_CHAR = '\\'; |
| for (size_t i=0; i<source.size(); i++) { |
| char16_t c = s[i]; |
| if (!escape && c == ESCAPE_CHAR) { |
| escape = true; |
| continue; |
| } |
| space = (!escape && is_space(c)) || (escape && (c == 'n' || c == 't')); |
| if (lastspace && !space) { |
| // Word start |
| result += k_rlm + k_rlo; |
| } else if (!lastspace && space) { |
| // Word end |
| result += k_pdf + k_rlm; |
| } |
| lastspace = space; |
| if (escape) { |
| result.append(&ESCAPE_CHAR, 1); |
| escape=false; |
| } |
| result.append(&c, 1); |
| } |
| if (!lastspace) { |
| // End of last word |
| result += k_pdf + k_rlm; |
| } |
| return result; |
| } |
| |
| String16 PseudoMethodBidi::placeholder(const String16& source) { |
| // Surround a placeholder with directionality change sequence |
| return k_rlm + k_rlo + source + k_pdf + k_rlm; |
| } |
| |