Add ASCII vs UTF-16 string data analysis

Measure how many string data bytes belong to strings that contain only ASCII characters versus strings that contain at least one wide (UTF-16) character.

Bug: 77721545
Test: test-art-host-gtest-dexanalyze_test
Change-Id: Ic59613596c1542c693bcf49cee379310414c44cd
diff --git a/tools/dexanalyze/dexanalyze.cc b/tools/dexanalyze/dexanalyze.cc
index 58b1fc7..46c4852 100644
--- a/tools/dexanalyze/dexanalyze.cc
+++ b/tools/dexanalyze/dexanalyze.cc
@@ -30,7 +30,10 @@
 namespace art {
 
 class DexAnalyze {
-  static const int kExitCodeUsageError = 1;
+  static constexpr int kExitCodeUsageError = 1;
+  static constexpr int kExitCodeFailedToOpenFile = 2;
+  static constexpr int kExitCodeFailedToOpenDex = 3;
+  static constexpr int kExitCodeFailedToProcessDex = 4;
 
   static void StdoutLogger(android::base::LogId,
                            android::base::LogSeverity,
@@ -135,10 +138,10 @@
     Analysis cumulative(&options);
     for (const std::string& filename : options.filenames_) {
       std::string content;
-      // TODO: once added, use an api to android::base to read a std::vector<uint8_t>.
+      // TODO: once added, use an API to android::base to read a std::vector<uint8_t>.
       if (!android::base::ReadFileToString(filename.c_str(), &content)) {
         LOG(ERROR) << "ReadFileToString failed for " + filename << std::endl;
-        return 2;
+        return kExitCodeFailedToOpenFile;
       }
       std::vector<std::unique_ptr<const DexFile>> dex_files;
       const DexFileLoader dex_file_loader;
@@ -150,14 +153,14 @@
                                    &error_msg,
                                    &dex_files)) {
         LOG(ERROR) << "OpenAll failed for " + filename << " with " << error_msg << std::endl;
-        return 3;
+        return kExitCodeFailedToOpenDex;
       }
       for (std::unique_ptr<const DexFile>& dex_file : dex_files) {
         if (options.dump_per_input_dex_) {
           Analysis current(&options);
           if (!current.ProcessDexFile(*dex_file)) {
             LOG(ERROR) << "Failed to process " << filename << " with error " << error_msg;
-            return 4;
+            return kExitCodeFailedToProcessDex;
           }
           LOG(INFO) << "Analysis for " << dex_file->GetLocation() << std::endl;
           current.Dump(LOG_STREAM(INFO));
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index bfeb4b9..adc5154 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -26,6 +26,7 @@
 #include "dex/code_item_accessors-inl.h"
 #include "dex/dex_instruction-inl.h"
 #include "dex/standard_dex_file.h"
+#include "dex/utf-inl.h"
 
 namespace art {
 
@@ -48,8 +49,20 @@
   std::vector<std::string> strings;
   for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
     uint32_t length = 0;
-    const char* data =
-        dex_file.GetStringDataAndUtf16Length(dex_file.GetStringId(dex::StringIndex(i)), &length);
+    const char* data = dex_file.StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+    // Check whether the string contains any wide characters (UTF-16 code units >= 0x100).
+    bool have_wide_char = false;
+    const char* ptr = data;
+    for (size_t j = 0; j < length; ++j) {
+      have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+    }
+    if (have_wide_char) {
+      wide_string_bytes_ += 2 * length;
+    } else {
+      ascii_string_bytes_ += length;
+    }
+    string_data_bytes_ += ptr - data;
+
     strings.push_back(data);
   }
   // Note that the strings are probably already sorted.
@@ -88,6 +101,11 @@
 }
 
 void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
+  os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
+  os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
+  os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
+
+  // Prefix-based string statistics.
   os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
   os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
   os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 6f70f5d..0fb4d32 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -41,6 +41,9 @@
   void Dump(std::ostream& os, uint64_t total_size) const;
 
  private:
+  int64_t wide_string_bytes_ = 0u;
+  int64_t ascii_string_bytes_ = 0u;
+  int64_t string_data_bytes_ = 0u;
   int64_t total_prefix_savings_ = 0u;
   int64_t total_prefix_dict_ = 0u;
   int64_t total_prefix_table_ = 0u;