Add string prefix optimization

Optimize until fixed point by reducing to shorter prefixes if there
are savings.

This dictionary optimization saves up to 2.6% more. Average prefix
savings are 4.6% on top 99 APKs.

Test: test-art-host
Bug: 77709234
Bug: 77721545
Change-Id: I8e9e3aaf06ded9fde0153e8236f8c6b56450d881
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index b9a2ede..1f6fe46 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -208,37 +208,40 @@
      << Percent(total_unique_non_header_bytes_, total_size) << "\n";
 }
 
-void AnalyzeStrings::ProcessDexFile(const DexFile& dex_file) {
-  std::vector<std::string> strings;
-  for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
-    uint32_t length = 0;
-    const char* data = dex_file.StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
-    // Analyze if the string has any UTF16 chars.
-    bool have_wide_char = false;
-    const char* ptr = data;
-    for (size_t j = 0; j < length; ++j) {
-      have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
+  std::set<std::string> unique_strings;
+  for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
+    for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
+      uint32_t length = 0;
+      const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+      // Analyze if the string has any UTF16 chars.
+      bool have_wide_char = false;
+      const char* ptr = data;
+      for (size_t j = 0; j < length; ++j) {
+        have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+      }
+      if (have_wide_char) {
+        wide_string_bytes_ += 2 * length;
+      } else {
+        ascii_string_bytes_ += length;
+      }
+      string_data_bytes_ += ptr - data;
+      unique_strings.insert(data);
     }
-    if (have_wide_char) {
-      wide_string_bytes_ += 2 * length;
-    } else {
-      ascii_string_bytes_ += length;
-    }
-    string_data_bytes_ += ptr - data;
-
-    strings.push_back(data);
   }
-  // Note that the strings are probably already sorted.
-  std::sort(strings.begin(), strings.end());
+  // Unique strings only since we want to exclude savings from multidex duplication.
+  std::vector<std::string> strings(unique_strings.begin(), unique_strings.end());
+  unique_strings.clear();
 
   // Tunable parameters.
-  static const size_t kMinPrefixLen = 3;
-  static const size_t kPrefixConstantCost = 5;
+  static const size_t kMinPrefixLen = 1;
+  static const size_t kMaxPrefixLen = 255;
+  static const size_t kPrefixConstantCost = 4;
   static const size_t kPrefixIndexCost = 2;
 
   // Calculate total shared prefix.
   std::vector<size_t> shared_len;
-  std::set<std::string> prefixes;
+  prefixes_.clear();
   for (size_t i = 0; i < strings.size(); ++i) {
     size_t best_len = 0;
     if (i > 0) {
@@ -247,19 +250,65 @@
     if (i < strings.size() - 1) {
       best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
     }
+    best_len = std::min(best_len, kMaxPrefixLen);
     std::string prefix;
     if (best_len >= kMinPrefixLen) {
       prefix = strings[i].substr(0, best_len);
-      prefixes.insert(prefix);
-      total_prefix_savings_ += prefix.length();
+      ++prefixes_[prefix];
     }
     total_prefix_index_cost_ += kPrefixIndexCost;
   }
-  total_num_prefixes_ += prefixes.size();
-  for (const std::string& s : prefixes) {
+  // Optimize the result by moving long prefixes to shorter ones if it causes savings.
+  while (true) {
+    bool have_savings = false;
+    auto it = prefixes_.begin();
+    std::vector<std::string> longest;
+    for (const auto& pair : prefixes_) {
+      longest.push_back(pair.first);
+    }
+    std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) {
+      return a.length() > b.length();
+    });
+    // Do longest first since this provides the best results.
+    for (const std::string& s : longest) {
+      it = prefixes_.find(s);
+      CHECK(it != prefixes_.end());
+      const std::string& prefix = it->first;
+      int64_t best_savings = 0u;
+      int64_t best_len = -1;
+      for (int64_t len = prefix.length() - 1; len >= 0; --len) {
+        auto found = prefixes_.find(prefix.substr(0, len));
+        if (len != 0 && found == prefixes_.end()) {
+          continue;
+        }
+        // Calculate savings from downgrading the prefix.
+        int64_t savings = kPrefixConstantCost + prefix.length() -
+            (prefix.length() - len) * it->second;
+        if (savings > best_savings) {
+          best_savings = savings;
+          best_len = len;
+          break;
+        }
+      }
+      if (best_len != -1) {
+        prefixes_[prefix.substr(0, best_len)] += it->second;
+        it = prefixes_.erase(it);
+        optimization_savings_ += best_savings;
+        have_savings = true;
+      } else {
+        ++it;
+      }
+    }
+    if (!have_savings) {
+      break;
+    }
+  }
+  total_num_prefixes_ += prefixes_.size();
+  for (const auto& pair : prefixes_) {
     // 4 bytes for an offset, one for length.
-    total_prefix_dict_ += s.length();
+    total_prefix_dict_ += pair.first.length();
     total_prefix_table_ += kPrefixConstantCost;
+    total_prefix_savings_ += pair.first.length() * pair.second;
   }
 }
 
@@ -277,8 +326,17 @@
   net_savings -= total_prefix_dict_;
   net_savings -= total_prefix_table_;
   net_savings -= total_prefix_index_cost_;
-  os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
   os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
+  os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n";
+  os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
+  if (verbose_level_ >= VerboseLevel::kEverything) {
+    std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end());
+    // Sort lexicographically.
+    std::sort(pairs.begin(), pairs.end());
+    for (const auto& pair : pairs) {
+      os << pair.first << " : " << pair.second << "\n";
+    }
+  }
 }
 
 void CountDexIndices::ProcessDexFiles(
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 312330b..4e66b3c 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -21,6 +21,7 @@
 #include <iosfwd>
 #include <memory>
 #include <set>
+#include <unordered_map>
 #include <vector>
 
 #include "base/macros.h"
@@ -64,7 +65,7 @@
 // Analyze string data and strings accessed from code.
 class AnalyzeStrings : public Experiment {
  public:
-  void ProcessDexFile(const DexFile& dex_file) OVERRIDE;
+  void ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) OVERRIDE;
   void Dump(std::ostream& os, uint64_t total_size) const OVERRIDE;
 
  private:
@@ -76,6 +77,8 @@
   int64_t total_prefix_table_ = 0u;
   int64_t total_prefix_index_cost_ = 0u;
   int64_t total_num_prefixes_ = 0u;
+  int64_t optimization_savings_ = 0u;
+  std::unordered_map<std::string, size_t> prefixes_;
 };
 
 // Analyze debug info sizes.