AAPT2: Fix JavaDoc first sentence extraction.

The old algorithm for detecting the first sentence of a JavaDoc comment looked for the first occurence of '.'. This does not work when code or a {@link android.R.styleable} link is encountered in the first sentence. Switch to checking for whitespace characters after the '.' character. Bug: 62900335 Test: make aapt2_tests Change-Id: I8238f6a6304c9c2f92e2e576ca8962a59c2b20ea
author: Adam Lesinski <adamlesinski@google.com> 2017-07-24 18:19:36 -0700
committer: Adam Lesinski <adamlesinski@google.com> 2017-07-25 15:53:15 -0700
commit: e967d3f6ac2e1e1f612f99b9c76abcb9e13bb7a2 (patch)
tree: 57a3a90478daeaf536af29463e3ac3866777511c
parent: 3370d3230d4e3084ca95eb6d6bb63f27c3cb6f63 (diff)
10 files changed, 119 insertions, 39 deletions
diff --git a/tools/aapt2/java/AnnotationProcessor.cpp b/tools/aapt2/java/AnnotationProcessor.cpp
index a0ef00b1ea1f..1f83fa098d74 100644
--- a/tools/aapt2/java/AnnotationProcessor.cpp
+++ b/tools/aapt2/java/AnnotationProcessor.cpp
@@ -18,12 +18,29 @@
 
 #include <algorithm>
 
+#include "text/Unicode.h"
+#include "text/Utf8Iterator.h"
 #include "util/Util.h"
 
-using android::StringPiece;
+using ::aapt::text::Utf8Iterator;
+using ::android::StringPiece;
 
 namespace aapt {
 
+StringPiece AnnotationProcessor::ExtractFirstSentence(const StringPiece& comment) {
+  Utf8Iterator iter(comment);
+  while (iter.HasNext()) {
+    const char32_t codepoint = iter.Next();
+    if (codepoint == U'.') {
+      const size_t current_position = iter.Position();
+      if (!iter.HasNext() || text::IsWhitespace(iter.Next())) {
+        return comment.substr(0, current_position);
+      }
+    }
+  }
+  return comment;
+}
+
 void AnnotationProcessor::AppendCommentLine(std::string& comment) {
   static const std::string sDeprecated = "@deprecated";
   static const std::string sSystemApi = "@SystemApi";
diff --git a/tools/aapt2/java/AnnotationProcessor.h b/tools/aapt2/java/AnnotationProcessor.h
index 99cd44fd2cc1..a06eda0f9c5c 100644
--- a/tools/aapt2/java/AnnotationProcessor.h
+++ b/tools/aapt2/java/AnnotationProcessor.h
@@ -53,6 +53,8 @@ namespace aapt {
  */
 class AnnotationProcessor {
  public:
+  static android::StringPiece ExtractFirstSentence(const android::StringPiece& comment);
+
   /**
    * Adds more comments. Since resources can have various values with different
    * configurations,
diff --git a/tools/aapt2/java/AnnotationProcessor_test.cpp b/tools/aapt2/java/AnnotationProcessor_test.cpp
index 3e43c4295c07..9ccac8888426 100644
--- a/tools/aapt2/java/AnnotationProcessor_test.cpp
+++ b/tools/aapt2/java/AnnotationProcessor_test.cpp
@@ -18,6 +18,10 @@
 
 #include "test/Test.h"
 
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::Not;
+
 namespace aapt {
 
 TEST(AnnotationProcessorTest, EmitsDeprecated) {
@@ -33,7 +37,7 @@ TEST(AnnotationProcessorTest, EmitsDeprecated) {
   processor.WriteToStream(&result, "");
   std::string annotations = result.str();
 
-  EXPECT_NE(std::string::npos, annotations.find("@Deprecated"));
+  EXPECT_THAT(annotations, HasSubstr("@Deprecated"));
 }
 
 TEST(AnnotationProcessorTest, EmitsSystemApiAnnotationAndRemovesFromComment) {
@@ -44,10 +48,20 @@ TEST(AnnotationProcessorTest, EmitsSystemApiAnnotationAndRemovesFromComment) {
   processor.WriteToStream(&result, "");
   std::string annotations = result.str();
 
-  EXPECT_NE(std::string::npos,
-            annotations.find("@android.annotation.SystemApi"));
-  EXPECT_EQ(std::string::npos, annotations.find("@SystemApi"));
-  EXPECT_NE(std::string::npos, annotations.find("This is a system API"));
+  EXPECT_THAT(annotations, HasSubstr("@android.annotation.SystemApi"));
+  EXPECT_THAT(annotations, Not(HasSubstr("@SystemApi")));
+  EXPECT_THAT(annotations, HasSubstr("This is a system API"));
+}
+
+TEST(AnnotationProcessor, ExtractsFirstSentence) {
+  EXPECT_THAT(AnnotationProcessor::ExtractFirstSentence("This is the only sentence"),
+              Eq("This is the only sentence"));
+  EXPECT_THAT(AnnotationProcessor::ExtractFirstSentence(
+                  "This is the\n  first sentence.  This is the rest of the paragraph."),
+              Eq("This is the\n  first sentence."));
+  EXPECT_THAT(AnnotationProcessor::ExtractFirstSentence(
+                  "This is the first sentence with a {@link android.R.styleable.Theme}."),
+              Eq("This is the first sentence with a {@link android.R.styleable.Theme}."));
 }
 
 }  // namespace aapt
diff --git a/tools/aapt2/java/JavaClassGenerator.cpp b/tools/aapt2/java/JavaClassGenerator.cpp
index 2a23aa9e5372..44fa0f19a0e5 100644
--- a/tools/aapt2/java/JavaClassGenerator.cpp
+++ b/tools/aapt2/java/JavaClassGenerator.cpp
@@ -299,24 +299,16 @@ void JavaClassGenerator::ProcessStyleable(const ResourceNameRef& name, const Res
       }
 
       const ResourceName& attr_name = entry.attr_ref->name.value();
-      styleable_comment << "<tr><td>";
-      styleable_comment << "<code>{@link #" << entry.field_name << " "
-                        << (!attr_name.package.empty()
-                                ? attr_name.package
-                                : context_->GetCompilationPackage())
-                        << ":" << attr_name.entry << "}</code>";
-      styleable_comment << "</td>";
-
-      styleable_comment << "<td>";
+      styleable_comment << "<tr><td><code>{@link #" << entry.field_name << " "
+                        << (!attr_name.package.empty() ? attr_name.package
+                                                       : context_->GetCompilationPackage())
+                        << ":" << attr_name.entry << "}</code></td>";
 
       // Only use the comment up until the first '.'. This is to stay compatible with
       // the way old AAPT did it (presumably to keep it short and to avoid including
       // annotations like @hide which would affect this Styleable).
-      auto iter = std::find(attr_comment_line.begin(), attr_comment_line.end(), '.');
-      if (iter != attr_comment_line.end()) {
-        attr_comment_line = attr_comment_line.substr(0, (iter - attr_comment_line.begin()) + 1);
-      }
-      styleable_comment << attr_comment_line << "</td></tr>\n";
+      styleable_comment << "<td>" << AnnotationProcessor::ExtractFirstSentence(attr_comment_line)
+                        << "</td></tr>\n";
     }
     styleable_comment << "</table>\n";
 
diff --git a/tools/aapt2/text/Unicode.cpp b/tools/aapt2/text/Unicode.cpp
index 38ec9c465ec2..75eeb46c7f5e 100644
--- a/tools/aapt2/text/Unicode.cpp
+++ b/tools/aapt2/text/Unicode.cpp
@@ -66,6 +66,17 @@ bool IsXidContinue(char32_t codepoint) {
   return FindCharacterProperties(codepoint) & CharacterProperties::kXidContinue;
 }
 
+// Hardcode the White_Space characters since they are few and the external/icu project doesn't
+// list them as data files to parse.
+// Sourced from http://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+bool IsWhitespace(char32_t codepoint) {
+  return (codepoint >= 0x0009 && codepoint <= 0x000d) || (codepoint == 0x0020) ||
+         (codepoint == 0x0085) || (codepoint == 0x00a0) || (codepoint == 0x1680) ||
+         (codepoint >= 0x2000 && codepoint <= 0x200a) || (codepoint == 0x2028) ||
+         (codepoint == 0x2029) || (codepoint == 0x202f) || (codepoint == 0x205f) ||
+         (codepoint == 0x3000);
+}
+
 bool IsJavaIdentifier(const StringPiece& str) {
   Utf8Iterator iter(str);
 
diff --git a/tools/aapt2/text/Unicode.h b/tools/aapt2/text/Unicode.h
index 270718734cff..546714e9a290 100644
--- a/tools/aapt2/text/Unicode.h
+++ b/tools/aapt2/text/Unicode.h
@@ -40,6 +40,10 @@ bool IsXidStart(char32_t codepoint);
 // characters in the ID_Continue set.
 bool IsXidContinue(char32_t codepoint);
 
+// Returns true if the Unicode codepoint has the White_Space property.
+// http://unicode.org/reports/tr44/#White_Space
+bool IsWhitespace(char32_t codepoint);
+
 // Returns true if the UTF8 string can be used as a Java identifier.
 // NOTE: This does not check against the set of reserved Java keywords.
 bool IsJavaIdentifier(const android::StringPiece& str);
diff --git a/tools/aapt2/text/Utf8Iterator.cpp b/tools/aapt2/text/Utf8Iterator.cpp
index 0d43353ef39a..20b9073b9a26 100644
--- a/tools/aapt2/text/Utf8Iterator.cpp
+++ b/tools/aapt2/text/Utf8Iterator.cpp
@@ -25,18 +25,17 @@ namespace aapt {
 namespace text {
 
 Utf8Iterator::Utf8Iterator(const StringPiece& str)
-    : str_(str), next_pos_(0), current_codepoint_(0) {
+    : str_(str), current_pos_(0), next_pos_(0), current_codepoint_(0) {
   DoNext();
 }
 
 void Utf8Iterator::DoNext() {
-  size_t next_pos = 0u;
-  int32_t result = utf32_from_utf8_at(str_.data(), str_.size(), next_pos_, &next_pos);
+  current_pos_ = next_pos_;
+  int32_t result = utf32_from_utf8_at(str_.data(), str_.size(), current_pos_, &next_pos_);
   if (result == -1) {
     current_codepoint_ = 0u;
   } else {
     current_codepoint_ = static_cast<char32_t>(result);
-    next_pos_ = next_pos;
   }
 }
 
@@ -44,6 +43,10 @@ bool Utf8Iterator::HasNext() const {
   return current_codepoint_ != 0;
 }
 
+size_t Utf8Iterator::Position() const {
+  return current_pos_;
+}
+
 void Utf8Iterator::Skip(int amount) {
   while (amount > 0 && HasNext()) {
     Next();
diff --git a/tools/aapt2/text/Utf8Iterator.h b/tools/aapt2/text/Utf8Iterator.h
index 6923957a5a22..9318401876d1 100644
--- a/tools/aapt2/text/Utf8Iterator.h
+++ b/tools/aapt2/text/Utf8Iterator.h
@@ -29,6 +29,10 @@ class Utf8Iterator {
 
   bool HasNext() const;
 
+  // Returns the current position of the iterator in bytes of the source UTF8 string.
+  // This position is the start of the codepoint returned by the next call to Next().
+  size_t Position() const;
+
   void Skip(int amount);
 
   char32_t Next();
@@ -39,6 +43,7 @@ class Utf8Iterator {
   void DoNext();
 
   android::StringPiece str_;
+  size_t current_pos_;
   size_t next_pos_;
   char32_t current_codepoint_;
 };
diff --git a/tools/aapt2/text/Utf8Iterator_test.cpp b/tools/aapt2/text/Utf8Iterator_test.cpp
index f3111c081276..8c3e77446595 100644
--- a/tools/aapt2/text/Utf8Iterator_test.cpp
+++ b/tools/aapt2/text/Utf8Iterator_test.cpp
@@ -18,6 +18,7 @@
 
 #include "test/Test.h"
 
+using ::android::StringPiece;
 using ::testing::Eq;
 
 namespace aapt {
@@ -63,5 +64,32 @@ TEST(Utf8IteratorTest, IteratesOverUnicode) {
   EXPECT_FALSE(iter.HasNext());
 }
 
+TEST(Utf8IteratorTest, PositionPointsToTheCorrectPlace) {
+  const StringPiece expected("Mm🍩");
+  Utf8Iterator iter(expected);
+
+  // Before any character, the position should be 0.
+  EXPECT_THAT(iter.Position(), Eq(0u));
+
+  // The 'M' character, one byte.
+  ASSERT_TRUE(iter.HasNext());
+  iter.Next();
+  EXPECT_THAT(iter.Position(), Eq(1u));
+
+  // The 'm' character, one byte.
+  ASSERT_TRUE(iter.HasNext());
+  iter.Next();
+  EXPECT_THAT(iter.Position(), Eq(2u));
+
+  // The doughnut character, 4 bytes.
+  ASSERT_TRUE(iter.HasNext());
+  iter.Next();
+  EXPECT_THAT(iter.Position(), Eq(6u));
+
+  // There should be nothing left.
+  EXPECT_FALSE(iter.HasNext());
+  EXPECT_THAT(iter.Position(), Eq(expected.size()));
+}
+
 }  // namespace text
 }  // namespace aapt
diff --git a/tools/aapt2/tools/extract_unicode_properties.py b/tools/aapt2/tools/extract_unicode_properties.py
index d7e0479bb788..7577ec82aa86 100644
--- a/tools/aapt2/tools/extract_unicode_properties.py
+++ b/tools/aapt2/tools/extract_unicode_properties.py
@@ -35,9 +35,8 @@ class CharacterProperty:
         return "{}0x{:04x}, 0x{:04x}, {}{}".format(
                 "{", self.first_char, self.last_char, ' | '.join(types), "}")
 
-def extract_unicode_properties(f, props):
-    prog = re.compile(r"^(?P<first>\w{4})(..(?P<last>\w{4}))?\W+;\W+(?P<prop>\w+)\n$")
-    chars = {}
+def extract_unicode_properties(f, props, chars_out):
+    prog = re.compile(r"^(?P<first>\w{4})(..(?P<last>\w{4}))?\W+;\W+(?P<prop>\w+)")
     for line in f:
         result = prog.match(line)
         if result:
@@ -49,10 +48,12 @@ def extract_unicode_properties(f, props):
                 last_char = (int(last_char_str, 16) if last_char_str else start_char) + 1
                 prop_type = props[prop_type_str]
                 for char in range(start_char, last_char):
-                    if char not in chars:
-                        chars[char] = CharacterProperty(char, char, 0)
-                    chars[char].prop_type |= prop_type
+                    if char not in chars_out:
+                        chars_out[char] = CharacterProperty(char, char, 0)
+                    chars_out[char].prop_type |= prop_type
+    return chars_out
 
+def flatten_unicode_properties(chars):
     result = []
     for char_prop in sorted(chars.values(), key=CharacterProperty.key):
         if len(result) == 0:
@@ -82,17 +83,20 @@ license = """/*
 """
 
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
+    if len(sys.argv) < 2:
         print("must specify path to icu DerivedCoreProperties file (e.g:" \
                 "external/icu/icu4c/source/data/unidata/DerivedCoreProperties.txt)")
         sys.exit(1)
 
-    with open(sys.argv[1]) as f:
-        props = {"XID_Start": 1, "XID_Continue": 2}
-        char_props = extract_unicode_properties(f, props)
-        print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
-                .format(license, len(char_props), "{{"))
-        for prop in char_props:
-            print("    {},".format(prop))
-        print("}};")
+    props = {"XID_Start": 1, "XID_Continue": 2}
+    char_props = {}
+    for file_path in sys.argv[1:]:
+        with open(file_path) as f:
+            extract_unicode_properties(f, props, char_props)
+    result = flatten_unicode_properties(char_props)
+    print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
+            .format(license, len(result), "{{"))
+    for prop in result:
+        print("    {},".format(prop))
+    print("}};")
author	Adam Lesinski <adamlesinski@google.com>	2017-07-24 18:19:36 -0700
committer	Adam Lesinski <adamlesinski@google.com>	2017-07-25 15:53:15 -0700
commit	e967d3f6ac2e1e1f612f99b9c76abcb9e13bb7a2 (patch)
tree	57a3a90478daeaf536af29463e3ac3866777511c
parent	3370d3230d4e3084ca95eb6d6bb63f27c3cb6f63 (diff)