Add string prefix optimization
Optimize until fixed point by reducing to shorter prefixes if there
are savings.
This dictionary optimization saves up to 2.6% more. Average prefix
savings are 4.6% on top 99 APKs.
Test: test-art-host
Bug: 77709234
Bug: 77721545
Change-Id: I8e9e3aaf06ded9fde0153e8236f8c6b56450d881
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index b9a2ede..1f6fe46 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -208,37 +208,40 @@
<< Percent(total_unique_non_header_bytes_, total_size) << "\n";
}
-void AnalyzeStrings::ProcessDexFile(const DexFile& dex_file) {
- std::vector<std::string> strings;
- for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
- uint32_t length = 0;
- const char* data = dex_file.StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
- // Analyze if the string has any UTF16 chars.
- bool have_wide_char = false;
- const char* ptr = data;
- for (size_t j = 0; j < length; ++j) {
- have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
+ std::set<std::string> unique_strings;
+ for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
+ for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
+ uint32_t length = 0;
+ const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+ // Analyze if the string has any UTF16 chars.
+ bool have_wide_char = false;
+ const char* ptr = data;
+ for (size_t j = 0; j < length; ++j) {
+ have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+ }
+ if (have_wide_char) {
+ wide_string_bytes_ += 2 * length;
+ } else {
+ ascii_string_bytes_ += length;
+ }
+ string_data_bytes_ += ptr - data;
+ unique_strings.insert(data);
}
- if (have_wide_char) {
- wide_string_bytes_ += 2 * length;
- } else {
- ascii_string_bytes_ += length;
- }
- string_data_bytes_ += ptr - data;
-
- strings.push_back(data);
}
- // Note that the strings are probably already sorted.
- std::sort(strings.begin(), strings.end());
+ // Unique strings only since we want to exclude savings from multidex duplication.
+ std::vector<std::string> strings(unique_strings.begin(), unique_strings.end());
+ unique_strings.clear();
// Tunable parameters.
- static const size_t kMinPrefixLen = 3;
- static const size_t kPrefixConstantCost = 5;
+ static const size_t kMinPrefixLen = 1;
+ static const size_t kMaxPrefixLen = 255;
+ static const size_t kPrefixConstantCost = 4;
static const size_t kPrefixIndexCost = 2;
// Calculate total shared prefix.
std::vector<size_t> shared_len;
- std::set<std::string> prefixes;
+ prefixes_.clear();
for (size_t i = 0; i < strings.size(); ++i) {
size_t best_len = 0;
if (i > 0) {
@@ -247,19 +250,65 @@
if (i < strings.size() - 1) {
best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
}
+ best_len = std::min(best_len, kMaxPrefixLen);
std::string prefix;
if (best_len >= kMinPrefixLen) {
prefix = strings[i].substr(0, best_len);
- prefixes.insert(prefix);
- total_prefix_savings_ += prefix.length();
+ ++prefixes_[prefix];
}
total_prefix_index_cost_ += kPrefixIndexCost;
}
- total_num_prefixes_ += prefixes.size();
- for (const std::string& s : prefixes) {
+ // Optimize the result by moving long prefixes to shorter ones if it causes savings.
+ while (true) {
+ bool have_savings = false;
+ auto it = prefixes_.begin();
+ std::vector<std::string> longest;
+ for (const auto& pair : prefixes_) {
+ longest.push_back(pair.first);
+ }
+ std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) {
+ return a.length() > b.length();
+ });
+ // Do longest first since this provides the best results.
+ for (const std::string& s : longest) {
+ it = prefixes_.find(s);
+ CHECK(it != prefixes_.end());
+ const std::string& prefix = it->first;
+ int64_t best_savings = 0u;
+ int64_t best_len = -1;
+ for (int64_t len = prefix.length() - 1; len >= 0; --len) {
+ auto found = prefixes_.find(prefix.substr(0, len));
+ if (len != 0 && found == prefixes_.end()) {
+ continue;
+ }
+ // Calculate savings from downgrading the prefix.
+ int64_t savings = kPrefixConstantCost + prefix.length() -
+ (prefix.length() - len) * it->second;
+ if (savings > best_savings) {
+ best_savings = savings;
+ best_len = len;
+ break;
+ }
+ }
+ if (best_len != -1) {
+ prefixes_[prefix.substr(0, best_len)] += it->second;
+ it = prefixes_.erase(it);
+ optimization_savings_ += best_savings;
+ have_savings = true;
+ } else {
+ ++it;
+ }
+ }
+ if (!have_savings) {
+ break;
+ }
+ }
+ total_num_prefixes_ += prefixes_.size();
+ for (const auto& pair : prefixes_) {
// 4 bytes for an offset, one for length.
- total_prefix_dict_ += s.length();
+ total_prefix_dict_ += pair.first.length();
total_prefix_table_ += kPrefixConstantCost;
+ total_prefix_savings_ += pair.first.length() * pair.second;
}
}
@@ -277,8 +326,17 @@
net_savings -= total_prefix_dict_;
net_savings -= total_prefix_table_;
net_savings -= total_prefix_index_cost_;
- os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
+ os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n";
+ os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
+ if (verbose_level_ >= VerboseLevel::kEverything) {
+ std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end());
+ // Sort lexicographically.
+ std::sort(pairs.begin(), pairs.end());
+ for (const auto& pair : pairs) {
+ os << pair.first << " : " << pair.second << "\n";
+ }
+ }
}
void CountDexIndices::ProcessDexFiles(
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 312330b..4e66b3c 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -21,6 +21,7 @@
#include <iosfwd>
#include <memory>
#include <set>
+#include <unordered_map>
#include <vector>
#include "base/macros.h"
@@ -64,7 +65,7 @@
// Analyze string data and strings accessed from code.
class AnalyzeStrings : public Experiment {
public:
- void ProcessDexFile(const DexFile& dex_file) OVERRIDE;
+ void ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) OVERRIDE;
void Dump(std::ostream& os, uint64_t total_size) const OVERRIDE;
private:
@@ -76,6 +77,8 @@
int64_t total_prefix_table_ = 0u;
int64_t total_prefix_index_cost_ = 0u;
int64_t total_num_prefixes_ = 0u;
+ int64_t optimization_savings_ = 0u;
+ std::unordered_map<std::string, size_t> prefixes_;
};
// Analyze debug info sizes.