Add ASCII vs UTF-16 string data analysis
Measure how much string data is in strings made up entirely of ASCII characters versus strings that contain UTF-16 characters.
Bug: 77721545
Test: test-art-host-gtest-dexanalyze_test
Change-Id: Ic59613596c1542c693bcf49cee379310414c44cd
diff --git a/tools/dexanalyze/dexanalyze.cc b/tools/dexanalyze/dexanalyze.cc
index 58b1fc7..46c4852 100644
--- a/tools/dexanalyze/dexanalyze.cc
+++ b/tools/dexanalyze/dexanalyze.cc
@@ -30,7 +30,10 @@
namespace art {
class DexAnalyze {
- static const int kExitCodeUsageError = 1;
+ static constexpr int kExitCodeUsageError = 1;
+ static constexpr int kExitCodeFailedToOpenFile = 2;
+ static constexpr int kExitCodeFailedToOpenDex = 3;
+ static constexpr int kExitCodeFailedToProcessDex = 4;
static void StdoutLogger(android::base::LogId,
android::base::LogSeverity,
@@ -135,10 +138,10 @@
Analysis cumulative(&options);
for (const std::string& filename : options.filenames_) {
std::string content;
- // TODO: once added, use an api to android::base to read a std::vector<uint8_t>.
+ // TODO: once added, use an API to android::base to read a std::vector<uint8_t>.
if (!android::base::ReadFileToString(filename.c_str(), &content)) {
LOG(ERROR) << "ReadFileToString failed for " + filename << std::endl;
- return 2;
+ return kExitCodeFailedToOpenFile;
}
std::vector<std::unique_ptr<const DexFile>> dex_files;
const DexFileLoader dex_file_loader;
@@ -150,14 +153,14 @@
&error_msg,
&dex_files)) {
LOG(ERROR) << "OpenAll failed for " + filename << " with " << error_msg << std::endl;
- return 3;
+ return kExitCodeFailedToOpenDex;
}
for (std::unique_ptr<const DexFile>& dex_file : dex_files) {
if (options.dump_per_input_dex_) {
Analysis current(&options);
if (!current.ProcessDexFile(*dex_file)) {
LOG(ERROR) << "Failed to process " << filename << " with error " << error_msg;
- return 4;
+ return kExitCodeFailedToProcessDex;
}
LOG(INFO) << "Analysis for " << dex_file->GetLocation() << std::endl;
current.Dump(LOG_STREAM(INFO));
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index bfeb4b9..adc5154 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -26,6 +26,7 @@
#include "dex/code_item_accessors-inl.h"
#include "dex/dex_instruction-inl.h"
#include "dex/standard_dex_file.h"
+#include "dex/utf-inl.h"
namespace art {
@@ -48,8 +49,20 @@
std::vector<std::string> strings;
for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
uint32_t length = 0;
- const char* data =
- dex_file.GetStringDataAndUtf16Length(dex_file.GetStringId(dex::StringIndex(i)), &length);
+ const char* data = dex_file.StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
+ // Analyze if the string has any UTF16 chars.
+ bool have_wide_char = false;
+ const char* ptr = data;
+ for (size_t j = 0; j < length; ++j) {
+ have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
+ }
+ if (have_wide_char) {
+ wide_string_bytes_ += 2 * length;
+ } else {
+ ascii_string_bytes_ += length;
+ }
+ string_data_bytes_ += ptr - data;
+
strings.push_back(data);
}
// Note that the strings are probably already sorted.
@@ -88,6 +101,11 @@
}
void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
+ os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
+ os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
+ os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
+
+ // Prefix based strings.
os << "Total shared prefix bytes " << Percent(total_prefix_savings_, total_size) << "\n";
os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 6f70f5d..0fb4d32 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -41,6 +41,9 @@
void Dump(std::ostream& os, uint64_t total_size) const;
private:
+ int64_t wide_string_bytes_ = 0u;
+ int64_t ascii_string_bytes_ = 0u;
+ int64_t string_data_bytes_ = 0u;
int64_t total_prefix_savings_ = 0u;
int64_t total_prefix_dict_ = 0u;
int64_t total_prefix_table_ = 0u;