| /* |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "compile/Pseudolocalizer.h" |
| #include "util/Util.h" |
| |
| namespace aapt { |
| |
| // String basis to generate expansion |
| static const std::u16string k_expansion_string = u"one two three " |
| "four five six seven eight nine ten eleven twelve thirteen " |
| "fourteen fiveteen sixteen seventeen nineteen twenty"; |
| |
| // Special unicode characters to override directionality of the words |
| static const std::u16string k_rlm = u"\u200f"; |
| static const std::u16string k_rlo = u"\u202e"; |
| static const std::u16string k_pdf = u"\u202c"; |
| |
| // Placeholder marks |
| static const std::u16string k_placeholder_open = u"\u00bb"; |
| static const std::u16string k_placeholder_close = u"\u00ab"; |
| |
| static const char16_t k_arg_start = u'{'; |
| static const char16_t k_arg_end = u'}'; |
| |
| class PseudoMethodNone : public PseudoMethodImpl { |
| public: |
| std::u16string text(const StringPiece16& text) override { return text.toString(); } |
| std::u16string placeholder(const StringPiece16& text) override { return text.toString(); } |
| }; |
| |
| class PseudoMethodBidi : public PseudoMethodImpl { |
| public: |
| std::u16string text(const StringPiece16& text) override; |
| std::u16string placeholder(const StringPiece16& text) override; |
| }; |
| |
| class PseudoMethodAccent : public PseudoMethodImpl { |
| public: |
| PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {} |
| std::u16string start() override; |
| std::u16string end() override; |
| std::u16string text(const StringPiece16& text) override; |
| std::u16string placeholder(const StringPiece16& text) override; |
| private: |
| size_t mDepth; |
| size_t mWordCount; |
| size_t mLength; |
| }; |
| |
| Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) { |
| setMethod(method); |
| } |
| |
| void Pseudolocalizer::setMethod(Method method) { |
| switch (method) { |
| case Method::kNone: |
| mImpl = util::make_unique<PseudoMethodNone>(); |
| break; |
| case Method::kAccent: |
| mImpl = util::make_unique<PseudoMethodAccent>(); |
| break; |
| case Method::kBidi: |
| mImpl = util::make_unique<PseudoMethodBidi>(); |
| break; |
| } |
| } |
| |
| std::u16string Pseudolocalizer::text(const StringPiece16& text) { |
| std::u16string out; |
| size_t depth = mLastDepth; |
| size_t lastpos, pos; |
| const size_t length = text.size(); |
| const char16_t* str = text.data(); |
| bool escaped = false; |
| for (lastpos = pos = 0; pos < length; pos++) { |
| char16_t c = str[pos]; |
| if (escaped) { |
| escaped = false; |
| continue; |
| } |
| if (c == '\'') { |
| escaped = true; |
| continue; |
| } |
| |
| if (c == k_arg_start) { |
| depth++; |
| } else if (c == k_arg_end && depth) { |
| depth--; |
| } |
| |
| if (mLastDepth != depth || pos == length - 1) { |
| bool pseudo = ((mLastDepth % 2) == 0); |
| size_t nextpos = pos; |
| if (!pseudo || depth == mLastDepth) { |
| nextpos++; |
| } |
| size_t size = nextpos - lastpos; |
| if (size) { |
| std::u16string chunk = text.substr(lastpos, size).toString(); |
| if (pseudo) { |
| chunk = mImpl->text(chunk); |
| } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) { |
| chunk = mImpl->placeholder(chunk); |
| } |
| out.append(chunk); |
| } |
| if (pseudo && depth < mLastDepth) { // End of message |
| out.append(mImpl->end()); |
| } else if (!pseudo && depth > mLastDepth) { // Start of message |
| out.append(mImpl->start()); |
| } |
| lastpos = nextpos; |
| mLastDepth = depth; |
| } |
| } |
| return out; |
| } |
| |
| static const char16_t* pseudolocalizeChar(const char16_t c) { |
| switch (c) { |
| case 'a': return u"\u00e5"; |
| case 'b': return u"\u0253"; |
| case 'c': return u"\u00e7"; |
| case 'd': return u"\u00f0"; |
| case 'e': return u"\u00e9"; |
| case 'f': return u"\u0192"; |
| case 'g': return u"\u011d"; |
| case 'h': return u"\u0125"; |
| case 'i': return u"\u00ee"; |
| case 'j': return u"\u0135"; |
| case 'k': return u"\u0137"; |
| case 'l': return u"\u013c"; |
| case 'm': return u"\u1e3f"; |
| case 'n': return u"\u00f1"; |
| case 'o': return u"\u00f6"; |
| case 'p': return u"\u00fe"; |
| case 'q': return u"\u0051"; |
| case 'r': return u"\u0155"; |
| case 's': return u"\u0161"; |
| case 't': return u"\u0163"; |
| case 'u': return u"\u00fb"; |
| case 'v': return u"\u0056"; |
| case 'w': return u"\u0175"; |
| case 'x': return u"\u0445"; |
| case 'y': return u"\u00fd"; |
| case 'z': return u"\u017e"; |
| case 'A': return u"\u00c5"; |
| case 'B': return u"\u03b2"; |
| case 'C': return u"\u00c7"; |
| case 'D': return u"\u00d0"; |
| case 'E': return u"\u00c9"; |
| case 'G': return u"\u011c"; |
| case 'H': return u"\u0124"; |
| case 'I': return u"\u00ce"; |
| case 'J': return u"\u0134"; |
| case 'K': return u"\u0136"; |
| case 'L': return u"\u013b"; |
| case 'M': return u"\u1e3e"; |
| case 'N': return u"\u00d1"; |
| case 'O': return u"\u00d6"; |
| case 'P': return u"\u00de"; |
| case 'Q': return u"\u0071"; |
| case 'R': return u"\u0154"; |
| case 'S': return u"\u0160"; |
| case 'T': return u"\u0162"; |
| case 'U': return u"\u00db"; |
| case 'V': return u"\u03bd"; |
| case 'W': return u"\u0174"; |
| case 'X': return u"\u00d7"; |
| case 'Y': return u"\u00dd"; |
| case 'Z': return u"\u017d"; |
| case '!': return u"\u00a1"; |
| case '?': return u"\u00bf"; |
| case '$': return u"\u20ac"; |
| default: return NULL; |
| } |
| } |
| |
| static bool isPossibleNormalPlaceholderEnd(const char16_t c) { |
| switch (c) { |
| case 's': return true; |
| case 'S': return true; |
| case 'c': return true; |
| case 'C': return true; |
| case 'd': return true; |
| case 'o': return true; |
| case 'x': return true; |
| case 'X': return true; |
| case 'f': return true; |
| case 'e': return true; |
| case 'E': return true; |
| case 'g': return true; |
| case 'G': return true; |
| case 'a': return true; |
| case 'A': return true; |
| case 'b': return true; |
| case 'B': return true; |
| case 'h': return true; |
| case 'H': return true; |
| case '%': return true; |
| case 'n': return true; |
| default: return false; |
| } |
| } |
| |
| static std::u16string pseudoGenerateExpansion(const unsigned int length) { |
| std::u16string result = k_expansion_string; |
| const char16_t* s = result.data(); |
| if (result.size() < length) { |
| result += u" "; |
| result += pseudoGenerateExpansion(length - result.size()); |
| } else { |
| int ext = 0; |
| // Should contain only whole words, so looking for a space |
| for (unsigned int i = length + 1; i < result.size(); ++i) { |
| ++ext; |
| if (s[i] == ' ') { |
| break; |
| } |
| } |
| result = result.substr(0, length + ext); |
| } |
| return result; |
| } |
| |
| std::u16string PseudoMethodAccent::start() { |
| std::u16string result; |
| if (mDepth == 0) { |
| result = u"["; |
| } |
| mWordCount = mLength = 0; |
| mDepth++; |
| return result; |
| } |
| |
| std::u16string PseudoMethodAccent::end() { |
| std::u16string result; |
| if (mLength) { |
| result += u" "; |
| result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2); |
| } |
| mWordCount = mLength = 0; |
| mDepth--; |
| if (mDepth == 0) { |
| result += u"]"; |
| } |
| return result; |
| } |
| |
| /** |
| * Converts characters so they look like they've been localized. |
| * |
| * Note: This leaves placeholder syntax untouched. |
| */ |
| std::u16string PseudoMethodAccent::text(const StringPiece16& source) |
| { |
| const char16_t* s = source.data(); |
| std::u16string result; |
| const size_t I = source.size(); |
| bool lastspace = true; |
| for (size_t i = 0; i < I; i++) { |
| char16_t c = s[i]; |
| if (c == '%') { |
| // Placeholder syntax, no need to pseudolocalize |
| std::u16string chunk; |
| bool end = false; |
| chunk.append(&c, 1); |
| while (!end && i + 1 < I) { |
| ++i; |
| c = s[i]; |
| chunk.append(&c, 1); |
| if (isPossibleNormalPlaceholderEnd(c)) { |
| end = true; |
| } else if (i + 1 < I && c == 't') { |
| ++i; |
| c = s[i]; |
| chunk.append(&c, 1); |
| end = true; |
| } |
| } |
| // Treat chunk as a placeholder unless it ends with %. |
| result += ((c == '%') ? chunk : placeholder(chunk)); |
| } else if (c == '<' || c == '&') { |
| // html syntax, no need to pseudolocalize |
| bool tag_closed = false; |
| while (!tag_closed && i < I) { |
| if (c == '&') { |
| std::u16string escapeText; |
| escapeText.append(&c, 1); |
| bool end = false; |
| size_t htmlCodePos = i; |
| while (!end && htmlCodePos < I) { |
| ++htmlCodePos; |
| c = s[htmlCodePos]; |
| escapeText.append(&c, 1); |
| // Valid html code |
| if (c == ';') { |
| end = true; |
| i = htmlCodePos; |
| } |
| // Wrong html code |
| else if (!((c == '#' || |
| (c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9')))) { |
| end = true; |
| } |
| } |
| result += escapeText; |
| if (escapeText != u"<") { |
| tag_closed = true; |
| } |
| continue; |
| } |
| if (c == '>') { |
| tag_closed = true; |
| result.append(&c, 1); |
| continue; |
| } |
| result.append(&c, 1); |
| i++; |
| c = s[i]; |
| } |
| } else { |
| // This is a pure text that should be pseudolocalized |
| const char16_t* p = pseudolocalizeChar(c); |
| if (p != nullptr) { |
| result += p; |
| } else { |
| bool space = util::isspace16(c); |
| if (lastspace && !space) { |
| mWordCount++; |
| } |
| lastspace = space; |
| result.append(&c, 1); |
| } |
| // Count only pseudolocalizable chars and delimiters |
| mLength++; |
| } |
| } |
| return result; |
| } |
| |
| std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) { |
| // Surround a placeholder with brackets |
| return k_placeholder_open + source.toString() + k_placeholder_close; |
| } |
| |
| std::u16string PseudoMethodBidi::text(const StringPiece16& source) { |
| const char16_t* s = source.data(); |
| std::u16string result; |
| bool lastspace = true; |
| bool space = true; |
| for (size_t i = 0; i < source.size(); i++) { |
| char16_t c = s[i]; |
| space = util::isspace16(c); |
| if (lastspace && !space) { |
| // Word start |
| result += k_rlm + k_rlo; |
| } else if (!lastspace && space) { |
| // Word end |
| result += k_pdf + k_rlm; |
| } |
| lastspace = space; |
| result.append(&c, 1); |
| } |
| if (!lastspace) { |
| // End of last word |
| result += k_pdf + k_rlm; |
| } |
| return result; |
| } |
| |
| std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) { |
| // Surround a placeholder with directionality change sequence |
| return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm; |
| } |
| |
| } // namespace aapt |