Move string analysis to its own file
Bug: 112311352
Bug: 77709234
Test: test-art-host
Change-Id: I9ed6f5ae3534584ced594fe348ec52d91b52a98f
diff --git a/tools/dexanalyze/Android.bp b/tools/dexanalyze/Android.bp
index 9515ca5..a85bf56 100644
--- a/tools/dexanalyze/Android.bp
+++ b/tools/dexanalyze/Android.bp
@@ -22,6 +22,7 @@
"dexanalyze.cc",
"dexanalyze_bytecode.cc",
"dexanalyze_experiments.cc",
+ "dexanalyze_strings.cc",
],
target: {
android: {
diff --git a/tools/dexanalyze/dexanalyze.cc b/tools/dexanalyze/dexanalyze.cc
index 841719b..040f41b 100644
--- a/tools/dexanalyze/dexanalyze.cc
+++ b/tools/dexanalyze/dexanalyze.cc
@@ -23,6 +23,7 @@
#include "dexanalyze_bytecode.h"
#include "dexanalyze_experiments.h"
+#include "dexanalyze_strings.h"
#include "dex/code_item_accessors-inl.h"
#include "dex/dex_file.h"
#include "dex/dex_file_loader.h"
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index 1f6fe46..b124f43 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -208,137 +208,6 @@
<< Percent(total_unique_non_header_bytes_, total_size) << "\n";
}
-void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
- std::set<std::string> unique_strings;
- for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
- for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
- uint32_t length = 0;
- const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
- // Analyze if the string has any UTF16 chars.
- bool have_wide_char = false;
- const char* ptr = data;
- for (size_t j = 0; j < length; ++j) {
- have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
- }
- if (have_wide_char) {
- wide_string_bytes_ += 2 * length;
- } else {
- ascii_string_bytes_ += length;
- }
- string_data_bytes_ += ptr - data;
- unique_strings.insert(data);
- }
- }
- // Unique strings only since we want to exclude savings from multidex duplication.
- std::vector<std::string> strings(unique_strings.begin(), unique_strings.end());
- unique_strings.clear();
-
- // Tunable parameters.
- static const size_t kMinPrefixLen = 1;
- static const size_t kMaxPrefixLen = 255;
- static const size_t kPrefixConstantCost = 4;
- static const size_t kPrefixIndexCost = 2;
-
- // Calculate total shared prefix.
- std::vector<size_t> shared_len;
- prefixes_.clear();
- for (size_t i = 0; i < strings.size(); ++i) {
- size_t best_len = 0;
- if (i > 0) {
- best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
- }
- if (i < strings.size() - 1) {
- best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
- }
- best_len = std::min(best_len, kMaxPrefixLen);
- std::string prefix;
- if (best_len >= kMinPrefixLen) {
- prefix = strings[i].substr(0, best_len);
- ++prefixes_[prefix];
- }
- total_prefix_index_cost_ += kPrefixIndexCost;
- }
- // Optimize the result by moving long prefixes to shorter ones if it causes savings.
- while (true) {
- bool have_savings = false;
- auto it = prefixes_.begin();
- std::vector<std::string> longest;
- for (const auto& pair : prefixes_) {
- longest.push_back(pair.first);
- }
- std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) {
- return a.length() > b.length();
- });
- // Do longest first since this provides the best results.
- for (const std::string& s : longest) {
- it = prefixes_.find(s);
- CHECK(it != prefixes_.end());
- const std::string& prefix = it->first;
- int64_t best_savings = 0u;
- int64_t best_len = -1;
- for (int64_t len = prefix.length() - 1; len >= 0; --len) {
- auto found = prefixes_.find(prefix.substr(0, len));
- if (len != 0 && found == prefixes_.end()) {
- continue;
- }
- // Calculate savings from downgrading the prefix.
- int64_t savings = kPrefixConstantCost + prefix.length() -
- (prefix.length() - len) * it->second;
- if (savings > best_savings) {
- best_savings = savings;
- best_len = len;
- break;
- }
- }
- if (best_len != -1) {
- prefixes_[prefix.substr(0, best_len)] += it->second;
- it = prefixes_.erase(it);
- optimization_savings_ += best_savings;
- have_savings = true;
- } else {
- ++it;
- }
- }
- if (!have_savings) {
- break;
- }
- }
- total_num_prefixes_ += prefixes_.size();
- for (const auto& pair : prefixes_) {
- // 4 bytes for an offset, one for length.
- total_prefix_dict_ += pair.first.length();
- total_prefix_table_ += kPrefixConstantCost;
- total_prefix_savings_ += pair.first.length() * pair.second;
- }
-}
-
-void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
- os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
- os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
- os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
-
- // Prefix based strings.
- os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
- os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
- os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
- os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
- int64_t net_savings = total_prefix_savings_;
- net_savings -= total_prefix_dict_;
- net_savings -= total_prefix_table_;
- net_savings -= total_prefix_index_cost_;
- os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
- os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n";
- os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
- if (verbose_level_ >= VerboseLevel::kEverything) {
- std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end());
- // Sort lexicographically.
- std::sort(pairs.begin(), pairs.end());
- for (const auto& pair : pairs) {
- os << pair.first << " : " << pair.second << "\n";
- }
- }
-}
-
void CountDexIndices::ProcessDexFiles(
const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
std::set<std::string> unique_field_names;
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 4e66b3c..3542d95 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -62,25 +62,6 @@
VerboseLevel verbose_level_ = VerboseLevel::kNormal;
};
-// Analyze string data and strings accessed from code.
-class AnalyzeStrings : public Experiment {
- public:
- void ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) OVERRIDE;
- void Dump(std::ostream& os, uint64_t total_size) const OVERRIDE;
-
- private:
- int64_t wide_string_bytes_ = 0u;
- int64_t ascii_string_bytes_ = 0u;
- int64_t string_data_bytes_ = 0u;
- int64_t total_prefix_savings_ = 0u;
- int64_t total_prefix_dict_ = 0u;
- int64_t total_prefix_table_ = 0u;
- int64_t total_prefix_index_cost_ = 0u;
- int64_t total_num_prefixes_ = 0u;
- int64_t optimization_savings_ = 0u;
- std::unordered_map<std::string, size_t> prefixes_;
-};
-
// Analyze debug info sizes.
class AnalyzeDebugInfo : public Experiment {
public:
diff --git a/tools/dexanalyze/dexanalyze_strings.cc b/tools/dexanalyze/dexanalyze_strings.cc
new file mode 100644
index 0000000..9f67ff4
--- /dev/null
+++ b/tools/dexanalyze/dexanalyze_strings.cc
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dexanalyze_strings.h"
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+
+#include "dex/class_accessor-inl.h"
+#include "dex/code_item_accessors-inl.h"
+#include "dex/dex_instruction-inl.h"
+
+namespace art {
+namespace dexanalyze {
+
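+// Estimates how many bytes of dex string data could be saved by sharing common prefixes.
+// Strings are deduplicated into sorted order and each one is compared against its
+// neighbors to find the best shared prefix (e.g. "Ljava/lang/Object;" and
+// "Ljava/lang/String;" share the prefix "Ljava/lang/"). The prefix dictionary is then
+// shrunk by merging long prefixes into shorter ones where that lowers the estimated cost.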
+void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
+ std::set<std::string> unique_strings;
+ for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
+ for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
+ uint32_t length = 0;
+ const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+      // Check whether the string has any characters that do not fit in one byte.
+ bool have_wide_char = false;
+ const char* ptr = data;
+ for (size_t j = 0; j < length; ++j) {
+ have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+ }
+ if (have_wide_char) {
+ wide_string_bytes_ += 2 * length;
+ } else {
+ ascii_string_bytes_ += length;
+ }
+ string_data_bytes_ += ptr - data;
+ unique_strings.insert(data);
+ }
+ }
+ // Unique strings only since we want to exclude savings from multidex duplication.
+ std::vector<std::string> strings(unique_strings.begin(), unique_strings.end());
+ unique_strings.clear();
+
+ // Tunable parameters.
+  static const size_t kMinPrefixLen = 1;  // Shortest prefix considered for sharing.
+  static const size_t kMaxPrefixLen = 255;  // Longest prefix considered for sharing.
+  static const size_t kPrefixConstantCost = 4;  // Assumed fixed cost of each prefix table entry.
+  static const size_t kPrefixIndexCost = 2;  // Assumed per-string cost of referencing a prefix.
+
+  // Calculate the best shared prefix for each string. Since the strings are sorted, the
+  // longest prefix a string shares with any other string is shared with one of its neighbors.
+ std::vector<size_t> shared_len;
+ prefixes_.clear();
+ for (size_t i = 0; i < strings.size(); ++i) {
+ size_t best_len = 0;
+ if (i > 0) {
+ best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
+ }
+ if (i < strings.size() - 1) {
+ best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
+ }
+ best_len = std::min(best_len, kMaxPrefixLen);
+ std::string prefix;
+ if (best_len >= kMinPrefixLen) {
+ prefix = strings[i].substr(0, best_len);
+ ++prefixes_[prefix];
+ }
+ total_prefix_index_cost_ += kPrefixIndexCost;
+ }
+ // Optimize the result by moving long prefixes to shorter ones if it causes savings.
+ while (true) {
+ bool have_savings = false;
+ auto it = prefixes_.begin();
+ std::vector<std::string> longest;
+ for (const auto& pair : prefixes_) {
+ longest.push_back(pair.first);
+ }
+ std::sort(longest.begin(), longest.end(), [](const std::string& a, const std::string& b) {
+ return a.length() > b.length();
+ });
+ // Do longest first since this provides the best results.
+ for (const std::string& s : longest) {
+ it = prefixes_.find(s);
+ CHECK(it != prefixes_.end());
+ const std::string& prefix = it->first;
+ int64_t best_savings = 0u;
+ int64_t best_len = -1;
+ for (int64_t len = prefix.length() - 1; len >= 0; --len) {
+ auto found = prefixes_.find(prefix.substr(0, len));
+ if (len != 0 && found == prefixes_.end()) {
+ continue;
+ }
+ // Calculate savings from downgrading the prefix.
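+        // Dropping this entry frees its table slot (kPrefixConstantCost) plus its
+        // dictionary bytes (prefix.length()), while each of its it->second users keeps
+        // only 'len' shared characters instead of the full prefix.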
+ int64_t savings = kPrefixConstantCost + prefix.length() -
+ (prefix.length() - len) * it->second;
+ if (savings > best_savings) {
+ best_savings = savings;
+ best_len = len;
+ break;
+ }
+ }
+ if (best_len != -1) {
+ prefixes_[prefix.substr(0, best_len)] += it->second;
+ it = prefixes_.erase(it);
+ optimization_savings_ += best_savings;
+ have_savings = true;
+ } else {
+ ++it;
+ }
+ }
+ if (!have_savings) {
+ break;
+ }
+ }
+ total_num_prefixes_ += prefixes_.size();
+ for (const auto& pair : prefixes_) {
+ // 4 bytes for an offset, one for length.
+ total_prefix_dict_ += pair.first.length();
+ total_prefix_table_ += kPrefixConstantCost;
+ total_prefix_savings_ += pair.first.length() * pair.second;
+ }
+}
+
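+// Reports the collected statistics, mostly as percentages of total_size.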
+void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
+ os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
+ os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
+ os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
+
+ // Prefix based strings.
+ os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
+ os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
+ os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
+ os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
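+  // Net savings: shared prefix bytes removed from the string data, minus the cost of the
+  // prefix dictionary, its table entries, and the per-string prefix indices.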
+ int64_t net_savings = total_prefix_savings_;
+ net_savings -= total_prefix_dict_;
+ net_savings -= total_prefix_table_;
+ net_savings -= total_prefix_index_cost_;
+ os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
+ os << "Optimization savings " << Percent(optimization_savings_, total_size) << "\n";
+ os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
+ if (verbose_level_ >= VerboseLevel::kEverything) {
+ std::vector<std::pair<std::string, size_t>> pairs(prefixes_.begin(), prefixes_.end());
+ // Sort lexicographically.
+ std::sort(pairs.begin(), pairs.end());
+ for (const auto& pair : pairs) {
+ os << pair.first << " : " << pair.second << "\n";
+ }
+ }
+}
+
+} // namespace dexanalyze
+} // namespace art
diff --git a/tools/dexanalyze/dexanalyze_strings.h b/tools/dexanalyze/dexanalyze_strings.h
new file mode 100644
index 0000000..a5c202e
--- /dev/null
+++ b/tools/dexanalyze/dexanalyze_strings.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_TOOLS_DEXANALYZE_DEXANALYZE_STRINGS_H_
+#define ART_TOOLS_DEXANALYZE_DEXANALYZE_STRINGS_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "base/safe_map.h"
+#include "dexanalyze_experiments.h"
+#include "dex/code_item_accessors.h"
+#include "dex/utf-inl.h"
+
+namespace art {
+namespace dexanalyze {
+
+// Analyze string data and strings accessed from code.
+class AnalyzeStrings : public Experiment {
+ public:
+ void ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) OVERRIDE;
+ void Dump(std::ostream& os, uint64_t total_size) const OVERRIDE;
+
+ private:
+ int64_t wide_string_bytes_ = 0u;
+ int64_t ascii_string_bytes_ = 0u;
+ int64_t string_data_bytes_ = 0u;
+ int64_t total_prefix_savings_ = 0u;
+ int64_t total_prefix_dict_ = 0u;
+ int64_t total_prefix_table_ = 0u;
+ int64_t total_prefix_index_cost_ = 0u;
+ int64_t total_num_prefixes_ = 0u;
+ int64_t optimization_savings_ = 0u;
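+  // Maps each candidate prefix to the number of unique strings that use it.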
+ std::unordered_map<std::string, size_t> prefixes_;
+};
+
+} // namespace dexanalyze
+} // namespace art
+
+#endif // ART_TOOLS_DEXANALYZE_DEXANALYZE_STRINGS_H_