diff options
| -rw-r--r-- | api/current.txt | 57 | ||||
| -rw-r--r-- | core/java/android/text/TextDirectionHeuristic.java | 28 | ||||
| -rw-r--r-- | core/java/android/text/TextDirectionHeuristics.java | 147 | ||||
| -rw-r--r-- | core/java/android/text/bidi/BidiFormatter.java | 1147 |
4 files changed, 1304 insertions, 75 deletions
diff --git a/api/current.txt b/api/current.txt index c9a0029c07d1..b54314c26666 100644 --- a/api/current.txt +++ b/api/current.txt @@ -22273,6 +22273,21 @@ package android.text { method public int getTopPadding(); } + public abstract interface TextDirectionHeuristic { + method public abstract boolean isRtl(char[], int, int); + method public abstract boolean isRtl(java.lang.CharSequence, int, int); + } + + public class TextDirectionHeuristics { + ctor public TextDirectionHeuristics(); + field public static final android.text.TextDirectionHeuristic ANYRTL_LTR; + field public static final android.text.TextDirectionHeuristic FIRSTSTRONG_LTR; + field public static final android.text.TextDirectionHeuristic FIRSTSTRONG_RTL; + field public static final android.text.TextDirectionHeuristic LOCALE; + field public static final android.text.TextDirectionHeuristic LTR; + field public static final android.text.TextDirectionHeuristic RTL; + } + public class TextPaint extends android.graphics.Paint { ctor public TextPaint(); ctor public TextPaint(int); @@ -22364,6 +22379,48 @@ package android.text { } +package android.text.bidi { + + public final class BidiFormatter { + method public java.lang.String dirAttr(java.lang.String); + method public java.lang.String dirAttr(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String dirAttr(boolean); + method public java.lang.String dirAttrValue(java.lang.String); + method public java.lang.String dirAttrValue(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String dirAttrValue(boolean); + method public java.lang.String endEdge(); + method public static android.text.bidi.BidiFormatter getInstance(boolean); + method public static android.text.bidi.BidiFormatter getInstance(java.util.Locale); + method public boolean getStereoReset(); + method public boolean isRtl(java.lang.String); + method public boolean isRtlContext(); + method public java.lang.String mark(); + method public 
java.lang.String markAfter(java.lang.String); + method public java.lang.String markAfter(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String markBefore(java.lang.String); + method public java.lang.String markBefore(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String spanWrap(java.lang.String, android.text.TextDirectionHeuristic, boolean); + method public java.lang.String spanWrap(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String spanWrap(java.lang.String, boolean); + method public java.lang.String spanWrap(java.lang.String); + method public java.lang.String startEdge(); + method public java.lang.String unicodeWrap(java.lang.String, android.text.TextDirectionHeuristic, boolean); + method public java.lang.String unicodeWrap(java.lang.String, android.text.TextDirectionHeuristic); + method public java.lang.String unicodeWrap(java.lang.String, boolean); + method public java.lang.String unicodeWrap(java.lang.String); + } + + public static final class BidiFormatter.Builder { + ctor public BidiFormatter.Builder(); + ctor public BidiFormatter.Builder(boolean); + ctor public BidiFormatter.Builder(java.util.Locale); + method public android.text.bidi.BidiFormatter build(); + method public android.text.bidi.BidiFormatter.Builder setTextDirectionHeuristic(android.text.TextDirectionHeuristic); + method public android.text.bidi.BidiFormatter.Builder stereoReset(boolean); + } + +} + package android.text.format { public class DateFormat { diff --git a/core/java/android/text/TextDirectionHeuristic.java b/core/java/android/text/TextDirectionHeuristic.java index 513e11ce34d5..8a4ba42bcc91 100644 --- a/core/java/android/text/TextDirectionHeuristic.java +++ b/core/java/android/text/TextDirectionHeuristic.java @@ -17,10 +17,30 @@ package android.text; /** - * Interface for objects that guess at the paragraph direction by examining text. 
- * - * @hide + * Interface for objects that use a heuristic for guessing at the paragraph direction by examining text. */ public interface TextDirectionHeuristic { - boolean isRtl(char[] text, int start, int count); + /** + * Guess if a chars array is in the RTL direction or not. + * + * @param array the char array. + * @param start start index, inclusive. + * @param count the length to check, must not be negative and not greater than + * {@code array.length - start}. + * @return true if all chars in the range are to be considered in a RTL direction, + * false otherwise. + */ + boolean isRtl(char[] array, int start, int count); + + /** + * Guess if a {@code CharSequence} is in the RTL direction or not. + * + * @param cs the CharSequence. + * @param start start index, inclusive. + * @param count the length to check, must not be negative and not greater than + * {@code CharSequence.length() - start}. + * @return true if all chars in the range are to be considered in a RTL direction, + * false otherwise. + */ + boolean isRtl(CharSequence cs, int start, int count); } diff --git a/core/java/android/text/TextDirectionHeuristics.java b/core/java/android/text/TextDirectionHeuristics.java index df8c4c665f1e..7d7e3a935f94 100644 --- a/core/java/android/text/TextDirectionHeuristics.java +++ b/core/java/android/text/TextDirectionHeuristics.java @@ -19,43 +19,45 @@ package android.text; import android.view.View; +import java.nio.CharBuffer; + /** * Some objects that implement TextDirectionHeuristic. * - * @hide */ public class TextDirectionHeuristics { - /** Always decides that the direction is left to right. */ + /** + * Always decides that the direction is left to right. + */ public static final TextDirectionHeuristic LTR = new TextDirectionHeuristicInternal(null /* no algorithm */, false); - /** Always decides that the direction is right to left. */ + /** + * Always decides that the direction is right to left. 
+ */ public static final TextDirectionHeuristic RTL = new TextDirectionHeuristicInternal(null /* no algorithm */, true); /** - * Determines the direction based on the first strong directional character, - * including bidi format chars, falling back to left to right if it - * finds none. This is the default behavior of the Unicode Bidirectional - * Algorithm. + * Determines the direction based on the first strong directional character, including bidi + * format chars, falling back to left to right if it finds none. This is the default behavior + * of the Unicode Bidirectional Algorithm. */ public static final TextDirectionHeuristic FIRSTSTRONG_LTR = new TextDirectionHeuristicInternal(FirstStrong.INSTANCE, false); /** - * Determines the direction based on the first strong directional character, - * including bidi format chars, falling back to right to left if it - * finds none. This is similar to the default behavior of the Unicode - * Bidirectional Algorithm, just with different fallback behavior. + * Determines the direction based on the first strong directional character, including bidi + * format chars, falling back to right to left if it finds none. This is similar to the default + * behavior of the Unicode Bidirectional Algorithm, just with different fallback behavior. */ public static final TextDirectionHeuristic FIRSTSTRONG_RTL = new TextDirectionHeuristicInternal(FirstStrong.INSTANCE, true); /** - * If the text contains any strong right to left non-format character, determines - * that the direction is right to left, falling back to left to right if it - * finds none. + * If the text contains any strong right to left non-format character, determines that the + * direction is right to left, falling back to left to right if it finds none. 
*/ public static final TextDirectionHeuristic ANYRTL_LTR = new TextDirectionHeuristicInternal(AnyStrong.INSTANCE_RTL, false); @@ -65,8 +67,39 @@ public class TextDirectionHeuristics { */ public static final TextDirectionHeuristic LOCALE = TextDirectionHeuristicLocale.INSTANCE; - private static enum TriState { - TRUE, FALSE, UNKNOWN; + /** + * State constants for taking care about true / false / unknown + */ + private static final int STATE_TRUE = 0; + private static final int STATE_FALSE = 1; + private static final int STATE_UNKNOWN = 2; + + private static int isRtlText(int directionality) { + switch (directionality) { + case Character.DIRECTIONALITY_LEFT_TO_RIGHT: + return STATE_FALSE; + case Character.DIRECTIONALITY_RIGHT_TO_LEFT: + case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: + return STATE_TRUE; + default: + return STATE_UNKNOWN; + } + } + + private static int isRtlTextOrFormat(int directionality) { + switch (directionality) { + case Character.DIRECTIONALITY_LEFT_TO_RIGHT: + case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: + case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: + return STATE_FALSE; + case Character.DIRECTIONALITY_RIGHT_TO_LEFT: + case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: + case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: + case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: + return STATE_TRUE; + default: + return STATE_UNKNOWN; + } } /** @@ -87,21 +120,26 @@ public class TextDirectionHeuristics { abstract protected boolean defaultIsRtl(); @Override - public boolean isRtl(char[] chars, int start, int count) { - if (chars == null || start < 0 || count < 0 || chars.length - count < start) { + public boolean isRtl(char[] array, int start, int count) { + return isRtl(CharBuffer.wrap(array), start, count); + } + + @Override + public boolean isRtl(CharSequence cs, int start, int count) { + if (cs == null || start < 0 || count < 0 || cs.length() - count < start) { throw new IllegalArgumentException(); } if (mAlgorithm 
== null) { return defaultIsRtl(); } - return doCheck(chars, start, count); + return doCheck(cs, start, count); } - private boolean doCheck(char[] chars, int start, int count) { - switch(mAlgorithm.checkRtl(chars, start, count)) { - case TRUE: + private boolean doCheck(CharSequence cs, int start, int count) { + switch(mAlgorithm.checkRtl(cs, start, count)) { + case STATE_TRUE: return true; - case FALSE: + case STATE_FALSE: return false; default: return defaultIsRtl(); @@ -124,58 +162,26 @@ public class TextDirectionHeuristics { } } - private static TriState isRtlText(int directionality) { - switch (directionality) { - case Character.DIRECTIONALITY_LEFT_TO_RIGHT: - return TriState.FALSE; - case Character.DIRECTIONALITY_RIGHT_TO_LEFT: - case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: - return TriState.TRUE; - default: - return TriState.UNKNOWN; - } - } - - private static TriState isRtlTextOrFormat(int directionality) { - switch (directionality) { - case Character.DIRECTIONALITY_LEFT_TO_RIGHT: - case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: - case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: - return TriState.FALSE; - case Character.DIRECTIONALITY_RIGHT_TO_LEFT: - case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: - case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: - case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: - return TriState.TRUE; - default: - return TriState.UNKNOWN; - } - } - /** * Interface for an algorithm to guess the direction of a paragraph of text. - * */ private static interface TextDirectionAlgorithm { /** * Returns whether the range of text is RTL according to the algorithm. - * */ - TriState checkRtl(char[] text, int start, int count); + int checkRtl(CharSequence cs, int start, int count); } /** - * Algorithm that uses the first strong directional character to determine - * the paragraph direction. This is the standard Unicode Bidirectional - * algorithm. 
- * + * Algorithm that uses the first strong directional character to determine the paragraph + * direction. This is the standard Unicode Bidirectional algorithm. */ private static class FirstStrong implements TextDirectionAlgorithm { @Override - public TriState checkRtl(char[] text, int start, int count) { - TriState result = TriState.UNKNOWN; - for (int i = start, e = start + count; i < e && result == TriState.UNKNOWN; ++i) { - result = isRtlTextOrFormat(Character.getDirectionality(text[i])); + public int checkRtl(CharSequence cs, int start, int count) { + int result = STATE_UNKNOWN; + for (int i = start, e = start + count; i < e && result == STATE_UNKNOWN; ++i) { + result = isRtlTextOrFormat(Character.getDirectionality(cs.charAt(i))); } return result; } @@ -190,25 +196,24 @@ public class TextDirectionHeuristics { * Algorithm that uses the presence of any strong directional non-format * character (e.g. excludes LRE, LRO, RLE, RLO) to determine the * direction of text. - * */ private static class AnyStrong implements TextDirectionAlgorithm { private final boolean mLookForRtl; @Override - public TriState checkRtl(char[] text, int start, int count) { + public int checkRtl(CharSequence cs, int start, int count) { boolean haveUnlookedFor = false; for (int i = start, e = start + count; i < e; ++i) { - switch (isRtlText(Character.getDirectionality(text[i]))) { - case TRUE: + switch (isRtlText(Character.getDirectionality(cs.charAt(i)))) { + case STATE_TRUE: if (mLookForRtl) { - return TriState.TRUE; + return STATE_TRUE; } haveUnlookedFor = true; break; - case FALSE: + case STATE_FALSE: if (!mLookForRtl) { - return TriState.FALSE; + return STATE_FALSE; } haveUnlookedFor = true; break; @@ -217,9 +222,9 @@ public class TextDirectionHeuristics { } } if (haveUnlookedFor) { - return mLookForRtl ? TriState.FALSE : TriState.TRUE; + return mLookForRtl ? 
STATE_FALSE : STATE_TRUE; } - return TriState.UNKNOWN; + return STATE_UNKNOWN; } private AnyStrong(boolean lookForRtl) { diff --git a/core/java/android/text/bidi/BidiFormatter.java b/core/java/android/text/bidi/BidiFormatter.java new file mode 100644 index 000000000000..355475154b54 --- /dev/null +++ b/core/java/android/text/bidi/BidiFormatter.java @@ -0,0 +1,1147 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package android.text.bidi; + +import android.text.TextDirectionHeuristic; +import android.text.TextDirectionHeuristics; +import android.text.TextUtils; +import android.view.View; + +import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR; + +import java.util.Locale; + + +/** + * Utility class for formatting text for display in a potentially opposite-directionality context + * without garbling. The directionality of the context is set at formatter creation and the + * directionality of the text can be either estimated or passed in when known. Provides the + * following functionality: + * <p> + * 1. Bidi Wrapping + * When text in one language is mixed into a document in another, opposite-directionality language, + * e.g. 
when an English business name is embedded in a Hebrew web page, both the inserted string + * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly + * separated from the surrounding text in a "wrapper" that: + * <p> + * - Declares its directionality so that the string is displayed correctly. This can be done in HTML + * markup (e.g. a 'span dir="rtl"' element) by {@link #spanWrap} and similar methods, or - only in + * contexts where markup can't be used - in Unicode bidi formatting codes by {@link #unicodeWrap} + * and similar methods. + * <p> + * - Isolates the string's directionality, so it does not unduly affect the surrounding content. + * Currently, this can only be done using invisible Unicode characters of the same direction as + * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" + * the directionality to that of the context. The "reset" may need to be done at both ends of the + * string. Without "reset" after the string, the string will "stick" to a number or logically + * separate opposite-direction text that happens to follow it in-line (even if separated by + * neutral content like spaces and punctuation). Without "reset" before the string, the same can + * happen there, but only with more opposite-direction text, not a number. One approach is to + * "reset" the direction only after each string, on the theory that if the preceding opposite- + * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing + * the "reset" only before each string definitely does not work because we do not want to require + * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a + * number.) 
Still, the safest policy is to do the "reset" on both ends of each string, since RTL + * message translations often contain untranslated Latin-script brand names and technical terms, + * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one + * has such a message, it is best to do the "reset" manually in the message translation itself, + * since the message's opposite-direction text could be followed by an inserted number, which we + * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an + * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the + * isolation to be part of the directionality declaration. This form of isolation is better than + * "reset" because it takes less space, does not require knowing the context directionality, has a + * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow + * using it because required platforms do not yet support it. + * <p> + * Providing these wrapping services is the basic purpose of the bidi formatter. + * <p> + * 2. Directionality estimation + * How does one know whether a string about to be inserted into surrounding text has the same + * directionality? Well, in many cases, one knows that this must be the case when writing the code + * doing the insertion, e.g. when a localized message is inserted into a localized page. In such + * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be + * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. + * In the remaining cases, e.g. when the string is user-entered or comes from a database, the + * language of the string (and thus its directionality) is not known a priori, and must be + * estimated at run-time. The bidi formatter can do this automatically using the default + * first-strong estimation algorithm. 
It can also be configured to use a custom directionality + * estimation object. + * <p> + * 3. Escaping + * When wrapping plain text - i.e. text that is not already HTML or HTML-escaped - in HTML markup, + * the text must first be HTML-escaped to prevent XSS attacks and other nasty business. This of + * course is always true, but the escaping can not be done after the string has already been wrapped + * in markup, so the bidi formatter also serves as a last chance and includes escaping services. + * <p> + * Thus, in a single call, the formatter will escape the input string as specified, determine its + * directionality, and wrap it as necessary. It is then up to the caller to insert the return value + * in the output. + */ +public final class BidiFormatter { + + /** + * The default text direction heuristic. + */ + private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; + + /** + * Unicode "Left-To-Right Embedding" (LRE) character. + */ + private static final char LRE = '\u202A'; + + /** + * Unicode "Right-To-Left Embedding" (RLE) character. + */ + private static final char RLE = '\u202B'; + + /** + * Unicode "Pop Directional Formatting" (PDF) character. + */ + private static final char PDF = '\u202C'; + + /** + * Unicode "Left-To-Right Mark" (LRM) character. + */ + private static final char LRM = '\u200E'; + + /* + * Unicode "Right-To-Left Mark" (RLM) character. + */ + private static final char RLM = '\u200F'; + + /* + * String representation of LRM + */ + private static final String LRM_STRING = Character.toString(LRM); + + /* + * String representation of RLM + */ + private static final String RLM_STRING = Character.toString(RLM); + + /** + * "ltr" string constant. + */ + private static final String LTR_STRING = "ltr"; + + /** + * "rtl" string constant. + */ + private static final String RTL_STRING = "rtl"; + + /** + * "dir=\"ltr\"" string constant. 
+ */ + private static final String DIR_LTR_STRING = "dir=\"ltr\""; + + /** + * "dir=\"rtl\"" string constant. + */ + private static final String DIR_RTL_STRING = "dir=\"rtl\""; + + /** + * "right" string constant. + */ + private static final String RIGHT = "right"; + + /** + * "left" string constant. + */ + private static final String LEFT = "left"; + + /** + * Empty string constant. + */ + private static final String EMPTY_STRING = ""; + + /** + * A class for building a BidiFormatter with non-default options. + */ + public static final class Builder { + private boolean isRtlContext; + private int flags; + private TextDirectionHeuristic textDirectionHeuristic; + + /** + * Constructor. + * + */ + public Builder() { + initialize(isRtlLocale(Locale.getDefault())); + } + + /** + * Constructor. + * + * @param rtlContext Whether the context directionality is RTL. + */ + public Builder(boolean rtlContext) { + initialize(rtlContext); + } + + /** + * Constructor. + * + * @param locale The context locale. + */ + public Builder(Locale locale) { + initialize(isRtlLocale(locale)); + } + + /** + * Initializes the builder with the given context directionality and default options. + * + * @param isRtlContext Whether the context is RTL or not. + */ + private void initialize(boolean isRtlContext) { + this.isRtlContext = isRtlContext; + textDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; + this.flags = DEFAULT_FLAGS; + } + + /** + * Specifies whether the BidiFormatter to be built should also "reset" directionality before + * a string being bidi-wrapped, not just after it. The default is true. + */ + public Builder stereoReset(boolean stereoReset) { + if (stereoReset) { + flags |= FLAG_STEREO_RESET; + } else { + flags &= ~FLAG_STEREO_RESET; + } + return this; + } + + /** + * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. + * By default, uses the first-strong heuristic.
+ * + * @param heuristic the {@code TextDirectionHeuristic} to use. + * @return the builder itself. + */ + public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { + this.textDirectionHeuristic = heuristic; + return this; + } + + private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { + return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; + } + + /** + * @return A BidiFormatter with the specified options. + */ + public BidiFormatter build() { + if (flags == DEFAULT_FLAGS && + textDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) { + return getDefaultInstanceFromContext(isRtlContext); + } + return new BidiFormatter(isRtlContext, flags, textDirectionHeuristic); + } + } + + private static final int FLAG_STEREO_RESET = 2; + private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; + + private static final BidiFormatter DEFAULT_LTR_INSTANCE = + new BidiFormatter(false /* LTR context */, DEFAULT_FLAGS, DEFAULT_TEXT_DIRECTION_HEURISTIC); + private static final BidiFormatter DEFAULT_RTL_INSTANCE = + new BidiFormatter(true /* RTL context */, DEFAULT_FLAGS, DEFAULT_TEXT_DIRECTION_HEURISTIC); + + private final boolean isRtlContext; + private final int flags; + private final TextDirectionHeuristic defaultTextDirectionHeuristic; + + /** + * Factory for creating an instance of BidiFormatter given the context directionality. + * + * @param rtlContext Whether the context directionality is RTL. + */ + public static BidiFormatter getInstance(boolean rtlContext) { + return new Builder(rtlContext).build(); + } + + /** + * Factory for creating an instance of BidiFormatter given the context locale. + * + * @param locale The context locale. + */ + public static BidiFormatter getInstance(Locale locale) { + return new Builder(locale).build(); + } + + /** + * @param isRtlContext Whether the context directionality is RTL or not. + * @param flags The option flags. + * @param heuristic The default text direction heuristic. 
+ */ + private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { + this.isRtlContext = isRtlContext; + this.flags = flags; + this.defaultTextDirectionHeuristic = heuristic; + } + + /** + * @return Whether the context directionality is RTL + */ + public boolean isRtlContext() { + return isRtlContext; + } + + /** + * @return Whether directionality "reset" should also be done before a string being + * bidi-wrapped, not just after it. + */ + public boolean getStereoReset() { + return (flags & FLAG_STEREO_RESET) != 0; + } + + /** + * Returns "rtl" if {@code str}'s estimated directionality is RTL, and "ltr" if it is LTR. + * + * @param str String whose directionality is to be estimated. + * @return "rtl" if {@code str}'s estimated directionality is RTL, and "ltr" otherwise. + */ + public String dirAttrValue(String str) { + return dirAttrValue(isRtl(str)); + } + + /** + * Operates like {@link #dirAttrValue(String)}, but uses a given heuristic to estimate the + * {@code str}'s directionality. + * + * @param str String whose directionality is to be estimated. + * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s + * directionality. + * @return "rtl" if {@code str}'s estimated directionality is RTL, and "ltr" otherwise. + */ + public String dirAttrValue(String str, TextDirectionHeuristic heuristic) { + return dirAttrValue(heuristic.isRtl(str, 0, str.length())); + } + + /** + * Returns "rtl" if the given directionality is RTL, and "ltr" if it is LTR. + * + * @param isRtl Whether the directionality is RTL or not. + * @return "rtl" if the given directionality is RTL, and "ltr" otherwise. + */ + public String dirAttrValue(boolean isRtl) { + return isRtl ? RTL_STRING : LTR_STRING; + } + + /** + * Returns "dir=\"ltr\"" or "dir=\"rtl\"", depending on {@code str}'s estimated directionality, + * if it is not the same as the context directionality. Otherwise, returns the empty string. 
+ * + * @param str String whose directionality is to be estimated. + * @return "dir=\"rtl\"" for RTL text in non-RTL context; "dir=\"ltr\"" for LTR text in non-LTR + * context; else, the empty string. + */ + public String dirAttr(String str) { + return dirAttr(isRtl(str)); + } + + /** + * Operates like {@link #dirAttr(String)}, but uses a given heuristic to estimate the + * {@code str}'s directionality. + * + * @param str String whose directionality is to be estimated. + * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s + * directionality. + * @return "dir=\"rtl\"" for RTL text in non-RTL context; "dir=\"ltr\"" for LTR text in non-LTR + * context; else, the empty string. + */ + public String dirAttr(String str, TextDirectionHeuristic heuristic) { + return dirAttr(heuristic.isRtl(str, 0, str.length())); + } + + /** + * Returns "dir=\"ltr\"" or "dir=\"rtl\"", depending on the given directionality, if it is not + * the same as the context directionality. Otherwise, returns the empty string. + * + * @param isRtl Whether the directionality is RTL or not + * @return "dir=\"rtl\"" for RTL text in non-RTL context; "dir=\"ltr\"" for LTR text in non-LTR + * context; else, the empty string. + */ + public String dirAttr(boolean isRtl) { + return (isRtl != isRtlContext) ? (isRtl ? DIR_RTL_STRING : DIR_LTR_STRING) : EMPTY_STRING; + } + + /** + * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the + * overall or the exit directionality of a given string is opposite to the context directionality. + * Putting this after the string (including its directionality declaration wrapping) prevents it + * from "sticking" to other opposite-directionality text or a number appearing after it inline + * with only neutral content in between. Otherwise returns the empty string. 
While the exit + * directionality is determined by scanning the end of the string, the overall directionality is + * estimated using the formatter's default text direction heuristic. + * + * @param str String after which the mark may need to appear. + * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; + * else, the empty string. + */ + public String markAfter(String str) { + return markAfter(str, defaultTextDirectionHeuristic); + } + + /** + * Operates like {@link #markAfter(String)}, but uses a given heuristic to estimate the + * {@code str}'s directionality. + * + * @param str String after which the mark may need to appear. + * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s + * directionality. + * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; + * else, the empty string. + */ + public String markAfter(String str, TextDirectionHeuristic heuristic) { + final boolean isRtl = heuristic.isRtl(str, 0, str.length()); + // getExitDir() is called only if needed (short-circuit). + if (!isRtlContext && (isRtl || getExitDir(str) == Dir.RTL)) { + return LRM_STRING; + } + if (isRtlContext && (!isRtl || getExitDir(str) == Dir.LTR)) { + return RLM_STRING; + } + return EMPTY_STRING; + } + + /** + * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the + * overall or the entry directionality of a given string is opposite to the context + * directionality. Putting this before the string (including its directionality declaration + * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before it + * inline with only neutral content in between. Otherwise returns the empty string. While the + * entry directionality is determined by scanning the beginning of the string, the overall + * directionality is estimated using the formatter's default text direction heuristic. + * + * @param str String before which the mark may need to appear.
+ * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; + * else, the empty string. + */ + public String markBefore(String str) { + return markBefore(str, defaultTextDirectionHeuristic); + } + + /** + * Operates like {@link #markBefore(String)}, but uses a given heuristic to estimate the + * {@code str}'s directionality. + * + * @param str String before which the mark may need to appear. + * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s + * directionality. + * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; + * else, the empty string. + */ + public String markBefore(String str, TextDirectionHeuristic heuristic) { + final boolean isRtl = heuristic.isRtl(str, 0, str.length()); + // getEntryDir() is called only if needed (short-circuit). + if (!isRtlContext && (isRtl || getEntryDir(str) == Dir.RTL)) { + return LRM_STRING; + } + if (isRtlContext && (!isRtl || getEntryDir(str) == Dir.LTR)) { + return RLM_STRING; + } + return EMPTY_STRING; + } + + /** + * Returns the Unicode bidi mark matching the context directionality (LRM for LTR context + * directionality, RLM for RTL context directionality). + */ + public String mark() { + return isRtlContext ? RLM_STRING : LRM_STRING; + } + + /** + * Returns "right" for RTL context directionality. Otherwise for LTR context directionality + * returns "left". + */ + public String startEdge() { + return isRtlContext ? RIGHT : LEFT; + } + + /** + * Returns "left" for RTL context directionality. Otherwise for LTR context directionality + * returns "right". + */ + public String endEdge() { + return isRtlContext ? LEFT : RIGHT; + } + + /** + * Estimates the directionality of a string using the default text direction heuristic. + * + * @param str String whose directionality is to be estimated. + * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns + * false. 
+ */ + public boolean isRtl(String str) { + return defaultTextDirectionHeuristic.isRtl(str, 0, str.length()); + } + + /** + * Formats a given string of unknown directionality for use in HTML output of the context + * directionality, so an opposite-directionality string is neither garbled nor garbles its + * surroundings. + * <p> + * The algorithm: estimates the directionality of the given string using the given heuristic. + * If the directionality is known, pass TextDirectionHeuristics.LTR or RTL for heuristic. + * In case its directionality doesn't match the context directionality, wraps it with a 'span' + * element and adds a "dir" attribute (either 'dir=\"rtl\"' or 'dir=\"ltr\"'). + * <p> + * If {@code isolate}, directionally isolates the string so that it does not garble its + * surroundings. Currently, this is done by "resetting" the directionality after the string by + * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when + * either the overall directionality or the exit directionality of the string is opposite to that + * of the context. If the formatter was built using {@link Builder#stereoReset(boolean)} and + * passing "true" as an argument, also prepends a Unicode bidi mark matching the context + * directionality when either the overall directionality or the entry directionality of the + * string is opposite to that of the context. + * <p> + * + * @param str The input string. + * @param heuristic The algorithm to be used to estimate the string's overall direction. + * @param isolate Whether to directionally isolate the string to prevent it from garbling the + * content around it. + * @return Input string after applying the above processing. 
+ */ + public String spanWrap(String str, TextDirectionHeuristic heuristic, boolean isolate) { + final boolean isRtl = heuristic.isRtl(str, 0, str.length()); + String origStr = str; + str = TextUtils.htmlEncode(str); + + StringBuilder result = new StringBuilder(); + if (getStereoReset() && isolate) { + result.append(markBefore(origStr, + isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); + } + if (isRtl != isRtlContext) { + result.append("<span ").append(dirAttr(isRtl)).append('>').append(str).append("</span>"); + } else { + result.append(str); + } + if (isolate) { + result.append(markAfter(origStr, + isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); + } + return result.toString(); + } + + /** + * Operates like {@link #spanWrap(String, TextDirectionHeuristic, boolean)}, but assumes + * {@code isolate} is true. + * + * @param str The input string. + * @param heuristic The algorithm to be used to estimate the string's overall direction. + * @return Input string after applying the above processing. + */ + public String spanWrap(String str, TextDirectionHeuristic heuristic) { + return spanWrap(str, heuristic, true /* isolate */); + } + + /** + * Operates like {@link #spanWrap(String, TextDirectionHeuristic, boolean)}, but uses the + * formatter's default direction estimation algorithm. + * + * @param str The input string. + * @param isolate Whether to directionally isolate the string to prevent it from garbling the + * content around it + * @return Input string after applying the above processing. + */ + public String spanWrap(String str, boolean isolate) { + return spanWrap(str, defaultTextDirectionHeuristic, isolate); + } + + /** + * Operates like {@link #spanWrap(String, TextDirectionHeuristic, boolean)}, but uses the + * formatter's default direction estimation algorithm and assumes {@code isolate} is true. + * + * @param str The input string. + * @return Input string after applying the above processing. 
+ */ + public String spanWrap(String str) { + return spanWrap(str, defaultTextDirectionHeuristic, true /* isolate */); + } + + /** + * Formats a string of given directionality for use in plain-text output of the context + * directionality, so an opposite-directionality string is neither garbled nor garbles its + * surroundings. As opposed to {@link #spanWrap}, this makes use of Unicode bidi + * formatting characters. In HTML, its *only* valid use is inside of elements that do not allow + * markup, e.g. the 'option' and 'title' elements. + * <p> + * The algorithm: In case the given directionality doesn't match the context directionality, wraps + * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or + * LRE+{@code str}+PDF for LTR text. + * <p> + * If {@code isolate}, directionally isolates the string so that it does not garble its + * surroundings. Currently, this is done by "resetting" the directionality after the string by + * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when + * either the overall directionality or the exit directionality of the string is opposite to that + * of the context. If the formatter was built using {@link Builder#stereoReset(boolean)} and + * passing "true" as an argument, also prepends a Unicode bidi mark matching the context + * directionality when either the overall directionality or the entry directionality of the + * string is opposite to that of the context. Note that as opposed to the overall + * directionality, the entry and exit directionalities are determined from the string itself. + * <p> + * Does *not* do HTML-escaping. + * + * @param str The input string. + * @param heuristic The algorithm to be used to estimate the string's overall direction. + * @param isolate Whether to directionally isolate the string to prevent it from garbling the + * content around it + * @return Input string after applying the above processing. 
+ */ + public String unicodeWrap(String str, TextDirectionHeuristic heuristic, boolean isolate) { + final boolean isRtl = heuristic.isRtl(str, 0, str.length()); + StringBuilder result = new StringBuilder(); + if (getStereoReset() && isolate) { + result.append(markBefore(str, + isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); + } + if (isRtl != isRtlContext) { + result.append(isRtl ? RLE : LRE); + result.append(str); + result.append(PDF); + } else { + result.append(str); + } + if (isolate) { + result.append(markAfter(str, + isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); + } + return result.toString(); + } + + /** + * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes + * {@code isolate} is true. + * + * @param str The input string. + * @param heuristic The algorithm to be used to estimate the string's overall direction. + * @return Input string after applying the above processing. + */ + public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { + return unicodeWrap(str, heuristic, true /* isolate */); + } + + /** + * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the + * formatter's default direction estimation algorithm. + * + * @param str The input string. + * @param isolate Whether to directionally isolate the string to prevent it from garbling the + * content around it + * @return Input string after applying the above processing. + */ + public String unicodeWrap(String str, boolean isolate) { + return unicodeWrap(str, defaultTextDirectionHeuristic, isolate); + } + + /** + * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the + * formatter's default direction estimation algorithm and assumes {@code isolate} is true. + * + * @param str The input string. + * @return Input string after applying the above processing. 
+ */ + public String unicodeWrap(String str) { + return unicodeWrap(str, defaultTextDirectionHeuristic, true /* isolate */); + } + + /** + * Helper method to return true if the Locale directionality is RTL. + * + * @param locale The Locale whose directionality will be checked to be RTL or LTR + * @return true if the {@code locale} directionality is RTL. False otherwise. + */ + private static boolean isRtlLocale(Locale locale) { + return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); + } + + /** + * Enum for directionality type. + */ + private enum Dir { + LTR (1), + UNKNOWN (0), + RTL (-1); + + public final int ord; + + Dir(int ord) {this.ord = ord; } + + /** + * Interprets numeric representation of directionality: positive values are + * interpreted as RTL, negative values as LTR, and zero as UNKNOWN. + */ + public static Dir valueOf(int dir) { + return dir > 0 ? LTR : dir < 0 ? RTL : UNKNOWN; + } + + /** + * Interprets boolean representation of directionality: false is interpreted + * as LTR and true as RTL. + */ + public static Dir valueOf(boolean dir) { + return dir ? RTL : LTR; + } + + /** + * Returns whether this directionality is opposite to the given + * directionality. + */ + public boolean isOppositeTo(Dir dir) { + return this.ord * dir.ord < 0; + } + } + + /** + * Returns the directionality of the last character with strong directionality in the string, or + * Dir.UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of + * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a + * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a + * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 
The intended use is to check + * whether a logically separate item that starts with a number or a character of the string's + * exit directionality and follows this string inline (not counting any neutral characters in + * between) would "stick" to it in an opposite-directionality context, thus being displayed in + * an incorrect position. An LRM or RLM character (the one of the context's directionality) + * between the two will prevent such sticking. + * + * @param str the string to check. + */ + private static Dir getExitDir(String str) { + return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); + } + + /** + * Returns the directionality of the first character with strong directionality in the string, + * or Dir.UNKNOWN if none was encountered. Treats a non-BN character between an + * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after + * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF + * characters. The intended use is to check whether a logically separate item that ends with a + * character of the string's entry directionality and precedes the string inline (not counting + * any neutral characters in between) would "stick" to it in an opposite-directionality context, + * thus being displayed in an incorrect position. An LRM or RLM character (the one of the + * context's directionality) between the two will prevent such sticking. + * + * @param str the string to check. + */ + private static Dir getEntryDir(String str) { + return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); + } + + /** + * An object that estimates the directionality of a given string by various methods. + * + */ + private static class DirectionalityEstimator { + + // Internal static variables and constants. + + /** + * Size of the bidi character class cache. 
The results of the Character.getDirectionality() + * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. + * The 0x700 value is designed to leave all the European and Near Eastern languages in the + * cache. It can be reduced to 0x180, restricting the cache to the Western European + * languages. + */ + private static final int DIR_TYPE_CACHE_SIZE = 0x700; + + /** + * The bidi character class cache. + */ + private static final byte DIR_TYPE_CACHE[]; + + static { + DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; + for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { + DIR_TYPE_CACHE[i] = Character.getDirectionality(i); + } + } + + // Internal instance variables. + + /** + * The text to be scanned. + */ + private final String text; + + /** + * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and + * entities when looking for the next / preceding dir type. + */ + private final boolean isHtml; + + /** + * The length of the text in chars. + */ + private final int length; + + /** + * The current position in the text. + */ + private int charIndex; + + /** + * The char encountered by the last dirTypeForward or dirTypeBackward call. If it + * encountered a supplementary codepoint, this contains a char that is not a valid + * codepoint. This is ok, because this member is only used to detect some well-known ASCII + * syntax, e.g. "http://" and the beginning of an HTML tag or entity. + */ + private char lastChar; + + /** + * Constructor. + * + * @param text The string to scan. + * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over + * tags and entities. + */ + DirectionalityEstimator(String text, boolean isHtml) { + this.text = text; + this.isHtml = isHtml; + length = text.length(); + } + + /** + * Returns the directionality of the first character with strong directionality in the + * string, or Dir.UNKNOWN if none was encountered. 
Treats a non-BN character between an
         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
         * after RLE/RLO. The results are undefined for a string containing unbalanced
         * LRE/RLE/LRO/RLO/PDF characters.
         * <p>
         * Resets and moves {@code charIndex}, so it must not be interleaved with another scan on
         * the same instance.
         */
        Dir getEntryDir() {
            // The reason for this method name, as opposed to getFirstStrongDir(), is that
            // "first strong" is a commonly used description of Unicode's estimation algorithm,
            // but the two must treat formatting characters quite differently. Thus, we are staying
            // away from both "first" and "last" in these method names to avoid confusion.
            charIndex = 0;
            // Number of LRE/RLE/LRO/RLO seen so far minus the number of PDF seen so far.
            int embeddingLevel = 0;
            // Direction of the most recently opened embedding, or UNKNOWN once a PDF was seen.
            Dir embeddingLevelDir = Dir.UNKNOWN;
            // Level at which the first non-BN, non-formatting character inside an embedding was
            // found; 0 while none has been found.
            int firstNonEmptyEmbeddingLevel = 0;
            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
                switch (dirTypeForward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = Dir.LTR;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = Dir.RTL;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        --embeddingLevel;
                        // To restore embeddingLevelDir to its previous value, we would need a
                        // stack, which we want to avoid. Thus, at this point we do not know the
                        // current embedding's directionality.
                        embeddingLevelDir = Dir.UNKNOWN;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        // BN characters are ignored entirely.
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return Dir.LTR;
                        }
                        // Inside an embedding: the embedding, not the character, decides.
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return Dir.RTL;
                        }
                        // Inside an embedding: the embedding, not the character, decides.
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    default:
                        // Any other character makes the enclosing embedding (if any) non-empty.
                        // Outside of an embedding embeddingLevel is 0, so the scan continues.
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                }
            }

            // We have either found a non-empty embedding or scanned the entire string finding
            // neither a non-empty embedding nor a strong character outside of an embedding.
            if (firstNonEmptyEmbeddingLevel == 0) {
                // We have not found a non-empty embedding. Thus, the string contains neither a
                // non-empty embedding nor a strong character outside of an embedding.
                return Dir.UNKNOWN;
            }

            // We have found a non-empty embedding.
            if (embeddingLevelDir != Dir.UNKNOWN) {
                // We know the directionality of the non-empty embedding.
                return embeddingLevelDir;
            }

            // We do not remember the directionality of the non-empty embedding we found. So, we go
            // backwards to find the start of the non-empty embedding and get its directionality.
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return Dir.LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return Dir.RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        // Walking backwards, a PDF re-enters an embedding that was closed.
                        ++embeddingLevel;
                        break;
                }
            }
            // We should never get here.
            return Dir.UNKNOWN;
        }

        /**
         * Returns the directionality of the last character with strong directionality in the
         * string, or Dir.UNKNOWN if none was encountered. For efficiency, actually scans backwards
         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
         * <p>
         * Resets and moves {@code charIndex}, so it must not be interleaved with another scan on
         * the same instance.
         */
        Dir getExitDir() {
            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
            // strong" sounds like the exact opposite of "first strong", which is a commonly used
            // description of Unicode's estimation algorithm, but the two must treat formatting
            // characters quite differently. Thus, we are staying away from both "first" and "last"
            // in these method names to avoid confusion.
            charIndex = length;
            // Scanning backwards: number of PDF seen minus number of LRE/RLE/LRO/RLO seen.
            int embeddingLevel = 0;
            // Level of the first (i.e. rightmost) embedding found to contain a non-BN,
            // non-formatting character; 0 while none has been found.
            int lastNonEmptyEmbeddingLevel = 0;
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return Dir.LTR;
                        }
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        // This is the opener of the embedding recorded as non-empty.
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return Dir.LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return Dir.RTL;
                        }
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        // This is the opener of the embedding recorded as non-empty.
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return Dir.RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        // Walking backwards, a PDF enters an embedding from its end.
                        ++embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        // BN characters are ignored entirely.
                        break;
                    default:
                        // Any other character makes the enclosing embedding (if any) non-empty.
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                }
            }
            return Dir.UNKNOWN;
        }

        // Internal methods

        /**
         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
         * cache.
         *
         * @param c the char to classify; values below DIR_TYPE_CACHE_SIZE are served from
         *     DIR_TYPE_CACHE, all others are looked up directly.
         * @return the Character.DIRECTIONALITY_... value of {@code c}.
         */
        private static byte getCachedDirectionality(char c) {
            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
         * charIndex. If isHtml, and the char is '&lt;' or '&amp;', the result is instead whatever
         * skipTagForward() / skipEntityForward() returns after trying to advance through the
         * tag/entity. For an entity, it would be best to figure out the actual character, and
         * return its dirtype, but treating it as whitespace is good enough for our purposes.
         *
         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
         */
        byte dirTypeForward() {
            lastChar = text.charAt(charIndex);
            if (Character.isHighSurrogate(lastChar)) {
                // Possibly a supplementary codepoint: classify the whole codepoint, uncached, and
                // step over as many chars as it occupies.
                int codePoint = Character.codePointAt(text, charIndex);
                charIndex += Character.charCount(codePoint);
                return Character.getDirectionality(codePoint);
            }
            charIndex++;
            byte dirType = getCachedDirectionality(lastChar);
            if (isHtml) {
                // Process tags and entities.
                if (lastChar == '<') {
                    dirType = skipTagForward();
                } else if (lastChar == '&') {
                    dirType = skipEntityForward();
                }
            }
            return dirType;
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
         * entity, advances over the whole tag/entity and returns
         * Character.DIRECTIONALITY_WHITESPACE.
For an entity, it would be best to figure out the + * actual character, and return its dirtype, but treating it as whitespace is good enough + * for our purposes. + * + * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. + */ + byte dirTypeBackward() { + lastChar = text.charAt(charIndex - 1); + if (Character.isLowSurrogate(lastChar)) { + int codePoint = Character.codePointBefore(text, charIndex); + charIndex -= Character.charCount(codePoint); + return Character.getDirectionality(codePoint); + } + charIndex--; + byte dirType = getCachedDirectionality(lastChar); + if (isHtml) { + // Process tags and entities. + if (lastChar == '>') { + dirType = skipTagBackward(); + } else if (lastChar == ';') { + dirType = skipEntityBackward(); + } + } + return dirType; + } + + /** + * Advances charIndex forward through an HTML tag (after the opening < has already been + * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, + * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the + * < that hadn't been part of a tag after all). + */ + private byte skipTagForward() { + int initialCharIndex = charIndex; + while (charIndex < length) { + lastChar = text.charAt(charIndex++); + if (lastChar == '>') { + // The end of the tag. + return Character.DIRECTIONALITY_WHITESPACE; + } + if (lastChar == '"' || lastChar == '\'') { + // Skip over a quoted attribute value inside the tag. + char quote = lastChar; + while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} + } + } + // The original '<' wasn't the start of a tag after all. + charIndex = initialCharIndex; + lastChar = '<'; + return Character.DIRECTIONALITY_OTHER_NEUTRALS; + } + + /** + * Advances charIndex backward through an HTML tag (after the closing > has already been + * read) and returns Character.DIRECTIONALITY_WHITESPACE. 
If there is no matching <, does + * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > + * that hadn't been part of a tag after all). Nevertheless, the running time for calling + * skipTagBackward() in a loop remains linear in the size of the text, even for a text like + * ">>>>", because skipTagBackward() also stops looking for a matching < + * when it encounters another >. + */ + private byte skipTagBackward() { + int initialCharIndex = charIndex; + while (charIndex > 0) { + lastChar = text.charAt(--charIndex); + if (lastChar == '<') { + // The start of the tag. + return Character.DIRECTIONALITY_WHITESPACE; + } + if (lastChar == '>') { + break; + } + if (lastChar == '"' || lastChar == '\'') { + // Skip over a quoted attribute value inside the tag. + char quote = lastChar; + while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} + } + } + // The original '>' wasn't the end of a tag after all. + charIndex = initialCharIndex; + lastChar = '>'; + return Character.DIRECTIONALITY_OTHER_NEUTRALS; + } + + /** + * Advances charIndex forward through an HTML character entity tag (after the opening + * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be + * best to figure out the actual character and return its dirtype, but this is good enough. + */ + private byte skipEntityForward() { + while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} + return Character.DIRECTIONALITY_WHITESPACE; + } + + /** + * Advances charIndex backward through an HTML character entity tag (after the closing ; + * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best + * to figure out the actual character and return its dirtype, but this is good enough. + * If there is no matching &, does not change charIndex and returns + * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after + * all). 
Nevertheless, the running time for calling skipEntityBackward() in a loop remains + * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() + * also stops looking for a matching & when it encounters another ;. + */ + private byte skipEntityBackward() { + int initialCharIndex = charIndex; + while (charIndex > 0) { + lastChar = text.charAt(--charIndex); + if (lastChar == '&') { + return Character.DIRECTIONALITY_WHITESPACE; + } + if (lastChar == ';') { + break; + } + } + charIndex = initialCharIndex; + lastChar = ';'; + return Character.DIRECTIONALITY_OTHER_NEUTRALS; + } + } +}
\ No newline at end of file |