From 473601013923d6b5a30fecad5dbc4ff719f38fe4 Mon Sep 17 00:00:00 2001
From: Roozbeh Pournader <roozbeh@google.com>
Date: Thu, 9 Apr 2015 21:49:29 -0700
Subject: Fix various conformance issues with TextDirectionHeuristics.

1. The algorithms no longer consider directional embeddings and
overrides as strong characters.

2. The algorithms are updated to Unicode 7.0, and consider the
directional isolates in determining the direction of input.

3. Codepoints are used instead of code units for looking up bidi
properties.

4. The bidi direction of unassigned characters is considered.

Bug: 20142480
Change-Id: I9be161c112b8f23565ed8961bb7d44ced234f67a
---
 .../java/android/text/TextDirectionHeuristics.java | 120 ++++++++++++++-------
 1 file changed, 80 insertions(+), 40 deletions(-)

diff --git a/core/java/android/text/TextDirectionHeuristics.java b/core/java/android/text/TextDirectionHeuristics.java
index 866137cc3a63..354c15fae66f 100644
--- a/core/java/android/text/TextDirectionHeuristics.java
+++ b/core/java/android/text/TextDirectionHeuristics.java
@@ -81,29 +81,47 @@ public class TextDirectionHeuristics {
     private static final int STATE_FALSE = 1;
     private static final int STATE_UNKNOWN = 2;
 
-    private static int isRtlText(int directionality) {
-        switch (directionality) {
-            case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
-                return STATE_FALSE;
-            case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
-            case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
-                return STATE_TRUE;
-            default:
-                return STATE_UNKNOWN;
-        }
-    }
-
-    private static int isRtlTextOrFormat(int directionality) {
-        switch (directionality) {
+    /* Returns STATE_TRUE for strong RTL characters, STATE_FALSE for strong LTR characters, and
+     * STATE_UNKNOWN for everything else.
+     */
+    private static int isRtlCodePoint(int codePoint) {
+        switch (Character.getDirectionality(codePoint)) {
             case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
-            case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
-            case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                 return STATE_FALSE;
             case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
             case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
-            case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
-            case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                 return STATE_TRUE;
+            case Character.DIRECTIONALITY_UNDEFINED:
+                // Unassigned characters still have bidi direction, defined at:
+                // http://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedBidiClass.txt
+
+                if ((0x0590 <= codePoint && codePoint <= 0x08FF) ||
+                        (0xFB1D <= codePoint && codePoint <= 0xFDCF) ||
+                        (0xFDF0 <= codePoint && codePoint <= 0xFDFF) ||
+                        (0xFE70 <= codePoint && codePoint <= 0xFEFF) ||
+                        (0x10800 <= codePoint && codePoint <= 0x10FFF) ||
+                        (0x1E800 <= codePoint && codePoint <= 0x1EFFF)) {
+                    // Unassigned RTL character
+                    return STATE_TRUE;
+                } else if (
+                        // Potentially-unassigned Default_Ignorable. Ranges are from unassigned
+                        // characters that have Unicode property Other_Default_Ignorable_Code_Point
+                        // plus some enlargening to cover bidi isolates and simplify checks.
+                        (0x2065 <= codePoint && codePoint <= 0x2069) ||
+                        (0xFFF0 <= codePoint && codePoint <= 0xFFF8) ||
+                        (0xE0000 <= codePoint && codePoint <= 0xE0FFF) ||
+                        // Non-character
+                        (0xFDD0 <= codePoint && codePoint <= 0xFDEF) ||
+                        ((codePoint & 0xFFFE) == 0xFFFE) ||
+                        // Currency symbol
+                        (0x20A0 <= codePoint && codePoint <= 0x20CF) ||
+                        // Unpaired surrogate
+                        (0xD800 <= codePoint && codePoint <= 0xDFFF)) {
+                    return STATE_UNKNOWN;
+                } else {
+                    // Unassigned LTR character
+                    return STATE_FALSE;
+                }
             default:
                 return STATE_UNKNOWN;
         }
@@ -181,14 +199,26 @@ public class TextDirectionHeuristics {
 
     /**
      * Algorithm that uses the first strong directional character to determine the paragraph
-     * direction. This is the standard Unicode Bidirectional algorithm.
+     * direction. This is the standard Unicode Bidirectional Algorithm (steps P2 and P3), with the
+     * exception that if no strong character is found, UNKNOWN is returned.
      */
     private static class FirstStrong implements TextDirectionAlgorithm {
         @Override
         public int checkRtl(CharSequence cs, int start, int count) {
             int result = STATE_UNKNOWN;
-            for (int i = start, e = start + count; i < e && result == STATE_UNKNOWN; ++i) {
-                result = isRtlTextOrFormat(Character.getDirectionality(cs.charAt(i)));
+            int openIsolateCount = 0;
+            for (int cp, i = start, end = start + count;
+                    i < end && result == STATE_UNKNOWN;
+                    i += Character.charCount(cp)) {
+                cp = Character.codePointAt(cs, i);
+                if (0x2066 <= cp && cp <= 0x2068) { // Opening isolates
+                    openIsolateCount += 1;
+                } else if (cp == 0x2069) { // POP DIRECTIONAL ISOLATE (PDI)
+                    if (openIsolateCount > 0) openIsolateCount -= 1;
+                } else if (openIsolateCount == 0) {
+                    // Only consider the characters outside isolate pairs
+                    result = isRtlCodePoint(cp);
+                }
             }
             return result;
         }
@@ -200,9 +230,10 @@ public class TextDirectionHeuristics {
     }
 
     /**
-     * Algorithm that uses the presence of any strong directional non-format
-     * character (e.g. excludes LRE, LRO, RLE, RLO) to determine the
-     * direction of text.
+     * Algorithm that uses the presence of any strong directional character of the type indicated
+     * in the constructor parameter to determine the direction of text.
+     *
+     * Characters inside isolate pairs are skipped.
      */
     private static class AnyStrong implements TextDirectionAlgorithm {
         private final boolean mLookForRtl;
@@ -210,22 +241,31 @@ public class TextDirectionHeuristics {
         @Override
         public int checkRtl(CharSequence cs, int start, int count) {
             boolean haveUnlookedFor = false;
-            for (int i = start, e = start + count; i < e; ++i) {
-                switch (isRtlText(Character.getDirectionality(cs.charAt(i)))) {
-                    case STATE_TRUE:
-                        if (mLookForRtl) {
-                            return STATE_TRUE;
-                        }
-                        haveUnlookedFor = true;
-                        break;
-                    case STATE_FALSE:
-                        if (!mLookForRtl) {
-                            return STATE_FALSE;
-                        }
-                        haveUnlookedFor = true;
-                        break;
-                    default:
-                        break;
+            int openIsolateCount = 0;
+            for (int cp, i = start, end = start + count; i < end; i += Character.charCount(cp)) {
+                cp = Character.codePointAt(cs, i);
+                if (0x2066 <= cp && cp <= 0x2068) { // Opening isolates
+                    openIsolateCount += 1;
+                } else if (cp == 0x2069) { // POP DIRECTIONAL ISOLATE (PDI)
+                    if (openIsolateCount > 0) openIsolateCount -= 1;
+                } else if (openIsolateCount == 0) {
+                    // Only consider the characters outside isolate pairs
+                    switch (isRtlCodePoint(cp)) {
+                        case STATE_TRUE:
+                            if (mLookForRtl) {
+                                return STATE_TRUE;
+                            }
+                            haveUnlookedFor = true;
+                            break;
+                        case STATE_FALSE:
+                            if (!mLookForRtl) {
+                                return STATE_FALSE;
+                            }
+                            haveUnlookedFor = true;
+                            break;
+                        default:
+                            break;
+                    }
                 }
             }
             if (haveUnlookedFor) {
-- 
cgit v1.2.3-59-g8ed1b