Add some experiments to analyze string usage

Analyze shared prefixes for strings across dex files and estimate
possible savings from supporting a per string prefix. The estimation
is not yet optimal.

Bug: 77709234
Test: dexanalyze

Change-Id: I2e9f8a09595b54ea4a3e331efde32f9c1689fc82
diff --git a/tools/dexanalyze/dexanalyze.cc b/tools/dexanalyze/dexanalyze.cc
index c4aebc6..0b08bee 100644
--- a/tools/dexanalyze/dexanalyze.cc
+++ b/tools/dexanalyze/dexanalyze.cc
@@ -63,6 +63,8 @@
           run_all_experiments_ = true;
         } else if (arg == "-count-indices") {
           exp_count_indices_ = true;
+        } else if (arg == "-analyze-strings") {
+          exp_analyze_strings_ = true;
         } else if (arg == "-d") {
           dump_per_input_dex_ = true;
         } else if (!arg.empty() && arg[0] == '-') {
@@ -82,6 +84,7 @@
     bool run_dex_file_verifier_ = true;
     bool dump_per_input_dex_ = false;
     bool exp_count_indices_ = false;
+    bool exp_analyze_strings_ = false;
     bool run_all_experiments_ = false;
     std::vector<std::string> filenames_;
   };
@@ -92,25 +95,30 @@
       if (options->run_all_experiments_ || options->exp_count_indices_) {
         experiments_.emplace_back(new CountDexIndices);
       }
+      if (options->run_all_experiments_ || options->exp_analyze_strings_) {
+        experiments_.emplace_back(new AnalyzeStrings);
+      }
     }
 
     bool ProcessDexFile(const DexFile& dex_file) {
       for (std::unique_ptr<Experiment>& experiment : experiments_) {
         experiment->ProcessDexFile(dex_file);
       }
+      total_size_ += dex_file.Size();
       ++dex_count_;
       return true;
     }
 
     void Dump(std::ostream& os) {
       for (std::unique_ptr<Experiment>& experiment : experiments_) {
-        experiment->Dump(os);
+        experiment->Dump(os, total_size_);
       }
     }
 
     const Options* const options_;
     std::vector<std::unique_ptr<Experiment>> experiments_;
     size_t dex_count_ = 0;
+    uint64_t total_size_ = 0u;
   };
 
  public:
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index e1f119d..bfeb4b9 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -15,12 +15,91 @@
  */
 
 #include "dexanalyze_experiments.h"
+
+#include <stdint.h>
+#include <inttypes.h>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#include "android-base/stringprintf.h"
 #include "dex/code_item_accessors-inl.h"
 #include "dex/dex_instruction-inl.h"
 #include "dex/standard_dex_file.h"
 
 namespace art {
 
+std::string Percent(uint64_t value, uint64_t max) {
+  if (max == 0) {
+    ++max;
+  }
+  return android::base::StringPrintf("%" PRId64 "(%.2f%%)",
+                                     value,
+                                     static_cast<double>(value * 100) / static_cast<double>(max));
+}
+
+static size_t PrefixLen(const std::string& a, const std::string& b) {
+  size_t len = 0;
+  for (; len < a.length() && len < b.length() && a[len] == b[len]; ++len) {}
+  return len;
+}
+
+void AnalyzeStrings::ProcessDexFile(const DexFile& dex_file) {
+  std::vector<std::string> strings;
+  for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
+    uint32_t length = 0;
+    const char* data =
+        dex_file.GetStringDataAndUtf16Length(dex_file.GetStringId(dex::StringIndex(i)), &length);
+    strings.push_back(data);
+  }
+  // Note that the strings are probably already sorted.
+  std::sort(strings.begin(), strings.end());
+
+  // Tunable parameters.
+  static const size_t kMinPrefixLen = 3;
+  static const size_t kPrefixConstantCost = 5;
+  static const size_t kPrefixIndexCost = 2;
+
+  // Calculate total shared prefix.
+  std::vector<size_t> shared_len;
+  std::set<std::string> prefixes;
+  for (size_t i = 0; i < strings.size(); ++i) {
+    size_t best_len = 0;
+    if (i > 0) {
+      best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
+    }
+    if (i < strings.size() - 1) {
+      best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
+    }
+    std::string prefix;
+    if (best_len >= kMinPrefixLen) {
+      prefix = strings[i].substr(0, best_len);
+      prefixes.insert(prefix);
+      total_prefix_savings_ += prefix.length();
+    }
+    total_prefix_index_cost_ += kPrefixIndexCost;
+  }
+  total_num_prefixes_ += prefixes.size();
+  for (const std::string& s : prefixes) {
+    // 4 bytes for an offset, one for length.
+    total_prefix_dict_ += s.length();
+    total_prefix_table_ += kPrefixConstantCost;
+  }
+}
+
+void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
+  os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
+  os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
+  os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
+  os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
+  int64_t net_savings = total_prefix_savings_;
+  net_savings -= total_prefix_dict_;
+  net_savings -= total_prefix_table_;
+  net_savings -= total_prefix_index_cost_;
+  os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
+  os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
+}
+
 void CountDexIndices::ProcessDexFile(const DexFile& dex_file) {
   num_string_ids_ += dex_file.NumStringIds();
   num_method_ids_ += dex_file.NumMethodIds();
@@ -107,7 +186,7 @@
   }
 }
 
-void CountDexIndices::Dump(std::ostream& os) const {
+void CountDexIndices::Dump(std::ostream& os, uint64_t total_size) const {
   os << "Num string ids: " << num_string_ids_ << "\n";
   os << "Num method ids: " << num_method_ids_ << "\n";
   os << "Num field ids: " << num_field_ids_ << "\n";
@@ -127,6 +206,7 @@
   os << "Same class invoke: " << same_class_total << "\n";
   os << "Other class invoke: " << other_class_total << "\n";
   os << "Invokes from code: " << (same_class_total + other_class_total) << "\n";
+  os << "Total dex size: " << total_size << "\n";
 }
 
 }  // namespace art
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 5d0f51b..6f70f5d 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -24,12 +24,28 @@
 
 class DexFile;
 
+std::string Percent(uint64_t value, uint64_t max);
+
 // An experiment a stateful visitor that runs on dex files. Results are cumulative.
 class Experiment {
  public:
   virtual ~Experiment() {}
   virtual void ProcessDexFile(const DexFile& dex_file) = 0;
-  virtual void Dump(std::ostream& os) const = 0;
+  virtual void Dump(std::ostream& os, uint64_t total_size) const = 0;
+};
+
+// Analyze string data and strings accessed from code.
+class AnalyzeStrings : public Experiment {
+ public:
+  void ProcessDexFile(const DexFile& dex_file);
+  void Dump(std::ostream& os, uint64_t total_size) const;
+
+ private:
+  int64_t total_prefix_savings_ = 0u;
+  int64_t total_prefix_dict_ = 0u;
+  int64_t total_prefix_table_ = 0u;
+  int64_t total_prefix_index_cost_ = 0u;
+  int64_t total_num_prefixes_ = 0u;
 };
 
 // Count numbers of dex indices.
@@ -37,7 +53,7 @@
  public:
   void ProcessDexFile(const DexFile& dex_file);
 
-  void Dump(std::ostream& os) const;
+  void Dump(std::ostream& os, uint64_t total_size) const;
 
  private:
   // Total string ids loaded from dex code.
@@ -65,4 +81,3 @@
 }  // namespace art
 
 #endif  // ART_TOOLS_DEXANALYZE_DEXANALYZE_EXPERIMENTS_H_
-