tools/dexanalyze/dexanalyze_strings.cc - LeafOS-Project/android_art - Gitiles

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "dexanalyze_strings.h"

 #include <algorithm>
 #include <iomanip>
 #include <iostream>
 #include <queue>

 #include "base/time_utils.h"
 #include "dex/class_accessor-inl.h"
 #include "dex/code_item_accessors-inl.h"
 #include "dex/dex_instruction-inl.h"

 namespace art {
 namespace dexanalyze {

 // Tunable parameters.
 static const size_t kMinPrefixLen = 1;
 static const size_t kMaxPrefixLen = 255;
 static const size_t kPrefixConstantCost = 4;
 static const size_t kPrefixIndexCost = 2;

 class PrefixDictionary {
  public:
   // Add prefix data and return the offset to the start of the added data.
   size_t AddPrefixData(const uint8_t* data, size_t len) {
     const size_t offset = prefix_data_.size();
     prefix_data_.insert(prefix_data_.end(), data, data + len);
     return offset;
   }

   static constexpr size_t kLengthBits = 8;
   static constexpr size_t kLengthMask = (1u << kLengthBits) - 1;

   // Return the prefix offset and length.
   ALWAYS_INLINE void GetOffset(uint32_t prefix_index, uint32_t* offset, uint32_t* length) const {
     CHECK_LT(prefix_index, offsets_.size());
     const uint32_t data = offsets_[prefix_index];
     *length = data & kLengthMask;
     *offset = data >> kLengthBits;
   }

   uint32_t AddOffset(uint32_t offset, uint32_t length) {
     CHECK_LE(length, kLengthMask);
     offsets_.push_back((offset << kLengthBits) | length);
     return offsets_.size() - 1;
   }

  public:
   std::vector<uint32_t> offsets_;
   std::vector<uint8_t> prefix_data_;
 };

 class PrefixStrings {
  public:
   class Builder {
    public:
     explicit Builder(PrefixStrings* output) : output_(output) {}
     void Build(const std::vector<std::string>& strings);

    private:
     PrefixStrings* const output_;
   };

   // Return the string index that was added.
   size_t AddString(uint16_t prefix, const std::string& str) {
     const size_t string_offset = chars_.size();
     chars_.push_back(static_cast<uint8_t>(prefix >> 8));
     chars_.push_back(static_cast<uint8_t>(prefix >> 0));
     EncodeUnsignedLeb128(&chars_, str.length());
     const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
     chars_.insert(chars_.end(), ptr, ptr + str.length());
     string_offsets_.push_back(string_offset);
     return string_offsets_.size() - 1;
   }

   std::string GetString(uint32_t string_idx) const {
     const size_t offset = string_offsets_[string_idx];
     const uint8_t* suffix_data = &chars_[offset];
     uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
         suffix_data[1];
     suffix_data += 2;
     uint32_t prefix_offset;
     uint32_t prefix_len;
     dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
     const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
     std::string ret(prefix_data, prefix_data + prefix_len);
     uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
     ret.insert(ret.end(), suffix_data, suffix_data + suffix_len);
     return ret;
   }

   ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
     const size_t offset = string_offsets_[string_idx];
     const uint8_t* suffix_data = &chars_[offset];
     uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
         suffix_data[1];
     suffix_data += 2;
     uint32_t prefix_offset;
     uint32_t prefix_len;
     dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
     uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
     if (prefix_len + suffix_len != len) {
       return false;
     }
     const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
     if ((true)) {
       return memcmp(prefix_data, data, prefix_len) == 0u &&
           memcmp(suffix_data, data + prefix_len, len - prefix_len) == 0u;
     } else {
       len -= prefix_len;
       while (prefix_len != 0u) {
         if (*prefix_data++ != *data++) {
           return false;
         }
         --prefix_len;
       }
       while (len != 0u) {
         if (*suffix_data++ != *data++) {
           return false;
         }
         --len;
       }
       return true;
     }
   }

  public:
   PrefixDictionary dictionary_;
   std::vector<uint8_t> chars_;
   std::vector<uint32_t> string_offsets_;
 };

 // Normal non prefix strings.
 class NormalStrings {
  public:
   // Return the string index that was added.
   size_t AddString(const std::string& str) {
     const size_t string_offset = chars_.size();
     EncodeUnsignedLeb128(&chars_, str.length());
     const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
     chars_.insert(chars_.end(), ptr, ptr + str.length());
     string_offsets_.push_back(string_offset);
     return string_offsets_.size() - 1;
   }

   std::string GetString(uint32_t string_idx) const {
     const size_t offset = string_offsets_[string_idx];
     const uint8_t* data = &chars_[offset];
     uint32_t len = DecodeUnsignedLeb128(&data);
     return std::string(data, data + len);
   }

   ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
     const size_t offset = string_offsets_[string_idx];
     const uint8_t* str_data = &chars_[offset];
     uint32_t str_len = DecodeUnsignedLeb128(&str_data);
     if (str_len != len) {
       return false;
     }
     return memcmp(data, str_data, len) == 0u;
   }

  public:
   std::vector<uint8_t> chars_;
   std::vector<uint32_t> string_offsets_;
 };

 // Node value = (distance from root) * (occurrences - 1).
 class MatchTrie {
  public:
   MatchTrie* Add(const std::string& str) {
     MatchTrie* node = this;
     size_t depth = 0u;
     for (uint8_t c : str) {
       ++depth;
       if (node->nodes_[c] == nullptr) {
         MatchTrie* new_node = new MatchTrie();
         node->nodes_[c].reset(new_node);
         new_node->parent_ = node;
         new_node->depth_ = depth;
         new_node->incoming_ = c;
         node = new_node;
       } else {
         node = node->nodes_[c].get();
       }
       ++node->count_;
     }
     return node;
   }

   // Returns the length of the longest prefix and if it's a leaf node.
   MatchTrie* LongestPrefix(const std::string& str) {
     MatchTrie* node = this;
     for (uint8_t c : str) {
       if (node->nodes_[c] == nullptr) {
         break;
       }
       node = node->nodes_[c].get();
     }
     return node;
   }

   bool IsLeaf() const {
     for (const std::unique_ptr<MatchTrie>& cur_node : nodes_) {
       if (cur_node != nullptr) {
         return false;
       }
     }
     return true;
   }

   int32_t Savings() const {
     int32_t cost = kPrefixConstantCost;
     int32_t first_used = 0u;
     if (chosen_suffix_count_ == 0u) {
       cost += depth_;
     }
     uint32_t extra_savings = 0u;
     for (MatchTrie* cur = parent_; cur != nullptr; cur = cur->parent_) {
       if (cur->chosen_) {
         first_used = cur->depth_;
         if (cur->chosen_suffix_count_ == 0u) {
           // First suffix for the chosen parent, remove the cost of the dictionary entry.
           extra_savings += first_used;
         }
         break;
       }
     }
     return count_ * (depth_ - first_used) - cost + extra_savings;
   }

   template <typename T, typename... Args, template <typename...> class Queue>
   T PopRealTop(Queue<T, Args...>& queue) {
     auto pair = queue.top();
     queue.pop();
     // Keep updating values until one sticks.
     while (pair.second->Savings() != pair.first) {
       pair.first = pair.second->Savings();
       queue.push(pair);
       pair = queue.top();
       queue.pop();
     }
     return pair;
   }

   std::vector<std::string> ExtractPrefixes(size_t max) {
     std::vector<std::string> ret;
     // Make priority queue and adaptively update it. Each node value is the savings from picking
     // it. Insert all of the interesting nodes in the queue (children != 1).
     std::priority_queue<std::pair<int32_t, MatchTrie*>> queue;
     // Add all of the nodes to the queue.
     std::vector<MatchTrie*> work(1, this);
     while (!work.empty()) {
       MatchTrie* elem = work.back();
       work.pop_back();
       size_t num_childs = 0u;
       for (const std::unique_ptr<MatchTrie>& child : elem->nodes_) {
         if (child != nullptr) {
           work.push_back(child.get());
           ++num_childs;
         }
       }
       if (num_childs > 1u || elem->value_ != 0u) {
         queue.emplace(elem->Savings(), elem);
       }
     }
     std::priority_queue<std::pair<int32_t, MatchTrie*>> prefixes;
     // The savings can only ever go down for a given node, never up.
     while (max != 0u && !queue.empty()) {
       std::pair<int32_t, MatchTrie*> pair = PopRealTop(queue);
       if (pair.second != this && pair.first > 0) {
         // Pick this node.
         uint32_t count = pair.second->count_;
         pair.second->chosen_ = true;
         for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
           if (cur->chosen_) {
             break;
           }
           cur->count_ -= count;
         }
         for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
           ++cur->chosen_suffix_count_;
         }
         prefixes.emplace(pair.first, pair.second);
         --max;
       } else {
         // Negative or no EV, just delete the node.
       }
     }
     while (!prefixes.empty()) {
       std::pair<int32_t, MatchTrie*> pair = PopRealTop(prefixes);
       if (pair.first <= 0) {
         continue;
       }
       ret.push_back(pair.second->GetString());
     }
     return ret;
   }

   std::string GetString() const {
     std::vector<uint8_t> chars;
     for (const MatchTrie* cur = this; cur->parent_ != nullptr; cur = cur->parent_) {
       chars.push_back(cur->incoming_);
     }
     return std::string(chars.rbegin(), chars.rend());
   }

   std::unique_ptr<MatchTrie> nodes_[256];
   MatchTrie* parent_ = nullptr;
   uint32_t count_ = 0u;
   uint32_t depth_ = 0u;
   int32_t savings_ = 0u;
   uint8_t incoming_ = 0u;
   // Value of the current node, non zero if the node is chosen.
   uint32_t value_ = 0u;
   // If the current node is chosen to be a used prefix.
   bool chosen_ = false;
   // If the current node is a prefix of a longer chosen prefix.
   uint32_t chosen_suffix_count_ = 0u;
 };

 void PrefixStrings::Builder::Build(const std::vector<std::string>& strings) {
   std::unique_ptr<MatchTrie> prefixe_trie(new MatchTrie());
   for (size_t i = 0; i < strings.size(); ++i) {
     size_t len = 0u;
     if (i > 0u) {
       CHECK_GT(strings[i], strings[i - 1]);
       len = std::max(len, PrefixLen(strings[i], strings[i - 1]));
     }
     if (i < strings.size() - 1) {
       len = std::max(len, PrefixLen(strings[i], strings[i + 1]));
     }
     len = std::min(len, kMaxPrefixLen);
     if (len >= kMinPrefixLen) {
       prefixe_trie->Add(strings[i].substr(0, len))->value_ = 1u;
     }
   }

   // Build prefixes.
   {
     static constexpr size_t kPrefixBits = 15;
     std::vector<std::string> prefixes(prefixe_trie->ExtractPrefixes(1 << kPrefixBits));
     // Add longest prefixes first so that subprefixes can share data.
     std::sort(prefixes.begin(), prefixes.end(), [](const std::string& a, const std::string& b) {
       return a.length() > b.length();
     });
     prefixe_trie.reset();
     prefixe_trie.reset(new MatchTrie());
     uint32_t prefix_idx = 0u;
     CHECK_EQ(output_->dictionary_.AddOffset(0u, 0u), prefix_idx++);
     for (const std::string& str : prefixes) {
       uint32_t prefix_offset = 0u;
       MatchTrie* node = prefixe_trie->LongestPrefix(str);
       if (node != nullptr && node->depth_ == str.length() && node->value_ != 0u) {
         CHECK_EQ(node->GetString(), str);
         uint32_t existing_len = 0u;
         output_->dictionary_.GetOffset(node->value_, &prefix_offset, &existing_len);
         // Make sure to register the current node.
         prefixe_trie->Add(str)->value_ = prefix_idx;
       } else {
         auto add_str = [&](const std::string& s) {
           node = prefixe_trie->Add(s);
           node->value_ = prefix_idx;
           while (node != nullptr) {
             node->value_ = prefix_idx;
             node = node->parent_;
           }
         };
         static constexpr size_t kNumSubstrings = 1u;
         // Increasing kNumSubstrings provides savings since it enables common substrings and not
         // only prefixes to share data. The problem is that it's slow.
         for (size_t i = 0; i < std::min(str.length(), kNumSubstrings); ++i) {
           add_str(str.substr(i));
         }
         prefix_offset = output_->dictionary_.AddPrefixData(
             reinterpret_cast<const uint8_t*>(&str[0]),
             str.length());
       }
       // TODO: Validiate the prefix offset.
       CHECK_EQ(output_->dictionary_.AddOffset(prefix_offset, str.length()), prefix_idx);
       ++prefix_idx;
     }
   }

   // Add strings to the dictionary.
   for (const std::string& str : strings) {
     MatchTrie* node = prefixe_trie->LongestPrefix(str);
     uint32_t prefix_idx = 0u;
     uint32_t best_length = 0u;
     while (node != nullptr) {
       uint32_t offset = 0u;
       uint32_t length = 0u;
       output_->dictionary_.GetOffset(node->value_, &offset, &length);
       if (node->depth_ == length) {
         prefix_idx = node->value_;
         best_length = node->depth_;
         break;
         // Actually the prefix we want.
       }
       node = node->parent_;
     }
     output_->AddString(prefix_idx, str.substr(best_length));
   }
 }

 void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
   std::set<std::string> unique_strings;
   // Accumulate the strings.
   for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
     for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
       uint32_t length = 0;
       const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
       // Analyze if the string has any UTF16 chars.
       bool have_wide_char = false;
       const char* ptr = data;
       for (size_t j = 0; j < length; ++j) {
         have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
       }
       if (have_wide_char) {
         wide_string_bytes_ += 2 * length;
       } else {
         ascii_string_bytes_ += length;
       }
       string_data_bytes_ += ptr - data;
       unique_strings.insert(data);
     }
   }
   // Unique strings only since we want to exclude savings from multi-dex duplication.
   ProcessStrings(std::vector<std::string>(unique_strings.begin(), unique_strings.end()));
 }

 void AnalyzeStrings::ProcessStrings(const std::vector<std::string>& strings) {
   // Calculate total shared prefix.
   size_t prefix_index_cost_ = 0u;
   for (size_t i = 0; i < strings.size(); ++i) {
     size_t best_len = 0;
     if (i > 0) {
       best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
     }
     if (i < strings.size() - 1) {
       best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
     }
     best_len = std::min(best_len, kMaxPrefixLen);
     if (best_len >= kMinPrefixLen) {
       total_shared_prefix_bytes_ += best_len;
     }
     prefix_index_cost_ += kPrefixIndexCost;
     if (strings[i].length() < 64) {
       ++short_strings_;
     } else {
       ++long_strings_;
     }
   }
   total_prefix_index_cost_ += prefix_index_cost_;

   PrefixStrings prefix_strings;
   {
     PrefixStrings::Builder prefix_builder(&prefix_strings);
     prefix_builder.Build(strings);
   }
   Benchmark(prefix_strings, strings, &prefix_timings_);
   const size_t num_prefixes = prefix_strings.dictionary_.offsets_.size();
   total_num_prefixes_ += num_prefixes;
   total_prefix_table_ += num_prefixes * sizeof(prefix_strings.dictionary_.offsets_[0]);
   total_prefix_dict_ += prefix_strings.dictionary_.prefix_data_.size();

   {
     NormalStrings normal_strings;
     for (const std::string& s : strings) {
       normal_strings.AddString(s);
     }
     const uint64_t unique_string_data_bytes = normal_strings.chars_.size();
     total_unique_string_data_bytes_ += unique_string_data_bytes;
     total_prefix_savings_ += unique_string_data_bytes - prefix_strings.chars_.size() +
         prefix_index_cost_;
     Benchmark(normal_strings, strings, &normal_timings_);
   }
 }

 template <typename Strings>
 void AnalyzeStrings::Benchmark(const Strings& strings,
                                const std::vector<std::string>& reference,
                                StringTimings* timings) {
   const size_t kIterations = 100;
   timings->num_comparisons_ += reference.size() * kIterations;

   uint64_t start = NanoTime();
   for (size_t j = 0; j < kIterations; ++j) {
     for (size_t i = 0; i < reference.size(); ++i) {
       CHECK(strings.Equal(
           i,
           reinterpret_cast<const uint8_t*>(&reference[i][0]),
           reference[i].length()))
           << i << ": " << strings.GetString(i) << " vs " << reference[i];
     }
   }
   timings->time_equal_comparisons_ += NanoTime() - start;

   start = NanoTime();
   for (size_t j = 0; j < kIterations; ++j) {
     size_t count = 0u;
     for (size_t i = 0; i < reference.size(); ++i) {
       count += strings.Equal(
           reference.size() - 1 - i,
           reinterpret_cast<const uint8_t*>(&reference[i][0]),
           reference[i].length());
     }
     CHECK_LT(count, 2u);
   }
   timings->time_non_equal_comparisons_ += NanoTime() - start;
 }

 template void AnalyzeStrings::Benchmark(const PrefixStrings&,
                                         const std::vector<std::string>&,
                                         StringTimings* timings);
 template void AnalyzeStrings::Benchmark(const NormalStrings&,
                                         const std::vector<std::string>&,
                                         StringTimings* timings);

 void StringTimings::Dump(std::ostream& os) const {
   const double comparisons = static_cast<double>(num_comparisons_);
   os << "Compare equal " << static_cast<double>(time_equal_comparisons_) / comparisons << "\n";
   os << "Compare not equal " << static_cast<double>(time_non_equal_comparisons_) / comparisons << "\n";
 }

 void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
   os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
   os << "Total unique string data bytes "
      << Percent(total_unique_string_data_bytes_, total_size) << "\n";
   os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
   os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";

   os << "Prefix string timings\n";
   prefix_timings_.Dump(os);
   os << "Normal string timings\n";
   normal_timings_.Dump(os);

   // Prefix based strings.
   os << "Total shared prefix bytes " << Percent(total_shared_prefix_bytes_, total_size) << "\n";
   os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
   os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
   os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
   int64_t net_savings = total_prefix_savings_;
   net_savings -= total_prefix_dict_;
   net_savings -= total_prefix_table_;
   net_savings -= total_prefix_index_cost_;
   os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
   os << "Prefix base savings " << Percent(total_prefix_savings_, total_size) << "\n";
   os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
   os << "Strings using prefix "
      << Percent(strings_used_prefixed_, total_prefix_index_cost_ / kPrefixIndexCost) << "\n";
   os << "Short strings " << Percent(short_strings_, short_strings_ + long_strings_) << "\n";
   if (verbose_level_ >= VerboseLevel::kEverything) {
     std::vector<std::pair<std::string, size_t>> pairs;  // (prefixes_.begin(), prefixes_.end());
     // Sort lexicographically.
     std::sort(pairs.begin(), pairs.end());
     for (const auto& pair : pairs) {
       os << pair.first << " : " << pair.second << "\n";
     }
   }
 }

 }  // namespace dexanalyze
 }  // namespace art
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "dexanalyze_strings.h"

	#include <algorithm>
	#include <iomanip>
	#include <iostream>
	#include <queue>

	#include "base/time_utils.h"
	#include "dex/class_accessor-inl.h"
	#include "dex/code_item_accessors-inl.h"
	#include "dex/dex_instruction-inl.h"

	namespace art {
	namespace dexanalyze {

	// Tunable parameters.
	static const size_t kMinPrefixLen = 1;
	static const size_t kMaxPrefixLen = 255;
	static const size_t kPrefixConstantCost = 4;
	static const size_t kPrefixIndexCost = 2;

	class PrefixDictionary {
	public:
	// Add prefix data and return the offset to the start of the added data.
	size_t AddPrefixData(const uint8_t* data, size_t len) {
	const size_t offset = prefix_data_.size();
	prefix_data_.insert(prefix_data_.end(), data, data + len);
	return offset;
	}

	static constexpr size_t kLengthBits = 8;
	static constexpr size_t kLengthMask = (1u << kLengthBits) - 1;

	// Return the prefix offset and length.
	ALWAYS_INLINE void GetOffset(uint32_t prefix_index, uint32_t* offset, uint32_t* length) const {
	CHECK_LT(prefix_index, offsets_.size());
	const uint32_t data = offsets_[prefix_index];
	*length = data & kLengthMask;
	*offset = data >> kLengthBits;
	}

	uint32_t AddOffset(uint32_t offset, uint32_t length) {
	CHECK_LE(length, kLengthMask);
	offsets_.push_back((offset << kLengthBits) \| length);
	return offsets_.size() - 1;
	}

	public:
	std::vector<uint32_t> offsets_;
	std::vector<uint8_t> prefix_data_;
	};

	class PrefixStrings {
	public:
	class Builder {
	public:
	explicit Builder(PrefixStrings* output) : output_(output) {}
	void Build(const std::vector<std::string>& strings);

	private:
	PrefixStrings* const output_;
	};

	// Return the string index that was added.
	size_t AddString(uint16_t prefix, const std::string& str) {
	const size_t string_offset = chars_.size();
	chars_.push_back(static_cast<uint8_t>(prefix >> 8));
	chars_.push_back(static_cast<uint8_t>(prefix >> 0));
	EncodeUnsignedLeb128(&chars_, str.length());
	const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
	chars_.insert(chars_.end(), ptr, ptr + str.length());
	string_offsets_.push_back(string_offset);
	return string_offsets_.size() - 1;
	}

	std::string GetString(uint32_t string_idx) const {
	const size_t offset = string_offsets_[string_idx];
	const uint8_t* suffix_data = &chars_[offset];
	uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
	suffix_data[1];
	suffix_data += 2;
	uint32_t prefix_offset;
	uint32_t prefix_len;
	dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
	const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
	std::string ret(prefix_data, prefix_data + prefix_len);
	uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
	ret.insert(ret.end(), suffix_data, suffix_data + suffix_len);
	return ret;
	}

	ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
	const size_t offset = string_offsets_[string_idx];
	const uint8_t* suffix_data = &chars_[offset];
	uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
	suffix_data[1];
	suffix_data += 2;
	uint32_t prefix_offset;
	uint32_t prefix_len;
	dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
	uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
	if (prefix_len + suffix_len != len) {
	return false;
	}
	const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
	if ((true)) {
	return memcmp(prefix_data, data, prefix_len) == 0u &&
	memcmp(suffix_data, data + prefix_len, len - prefix_len) == 0u;
	} else {
	len -= prefix_len;
	while (prefix_len != 0u) {
	if (prefix_data++ != data++) {
	return false;
	}
	--prefix_len;
	}
	while (len != 0u) {
	if (suffix_data++ != data++) {
	return false;
	}
	--len;
	}
	return true;
	}
	}

	public:
	PrefixDictionary dictionary_;
	std::vector<uint8_t> chars_;
	std::vector<uint32_t> string_offsets_;
	};

	// Normal non prefix strings.
	class NormalStrings {
	public:
	// Return the string index that was added.
	size_t AddString(const std::string& str) {
	const size_t string_offset = chars_.size();
	EncodeUnsignedLeb128(&chars_, str.length());
	const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
	chars_.insert(chars_.end(), ptr, ptr + str.length());
	string_offsets_.push_back(string_offset);
	return string_offsets_.size() - 1;
	}

	std::string GetString(uint32_t string_idx) const {
	const size_t offset = string_offsets_[string_idx];
	const uint8_t* data = &chars_[offset];
	uint32_t len = DecodeUnsignedLeb128(&data);
	return std::string(data, data + len);
	}

	ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
	const size_t offset = string_offsets_[string_idx];
	const uint8_t* str_data = &chars_[offset];
	uint32_t str_len = DecodeUnsignedLeb128(&str_data);
	if (str_len != len) {
	return false;
	}
	return memcmp(data, str_data, len) == 0u;
	}

	public:
	std::vector<uint8_t> chars_;
	std::vector<uint32_t> string_offsets_;
	};

	// Node value = (distance from root) * (occurrences - 1).
	class MatchTrie {
	public:
	MatchTrie* Add(const std::string& str) {
	MatchTrie* node = this;
	size_t depth = 0u;
	for (uint8_t c : str) {
	++depth;
	if (node->nodes_[c] == nullptr) {
	MatchTrie* new_node = new MatchTrie();
	node->nodes_[c].reset(new_node);
	new_node->parent_ = node;
	new_node->depth_ = depth;
	new_node->incoming_ = c;
	node = new_node;
	} else {
	node = node->nodes_[c].get();
	}
	++node->count_;
	}
	return node;
	}

	// Returns the length of the longest prefix and if it's a leaf node.
	MatchTrie* LongestPrefix(const std::string& str) {
	MatchTrie* node = this;
	for (uint8_t c : str) {
	if (node->nodes_[c] == nullptr) {
	break;
	}
	node = node->nodes_[c].get();
	}
	return node;
	}

	bool IsLeaf() const {
	for (const std::unique_ptr<MatchTrie>& cur_node : nodes_) {
	if (cur_node != nullptr) {
	return false;
	}
	}
	return true;
	}

	int32_t Savings() const {
	int32_t cost = kPrefixConstantCost;
	int32_t first_used = 0u;
	if (chosen_suffix_count_ == 0u) {
	cost += depth_;
	}
	uint32_t extra_savings = 0u;
	for (MatchTrie* cur = parent_; cur != nullptr; cur = cur->parent_) {
	if (cur->chosen_) {
	first_used = cur->depth_;
	if (cur->chosen_suffix_count_ == 0u) {
	// First suffix for the chosen parent, remove the cost of the dictionary entry.
	extra_savings += first_used;
	}
	break;
	}
	}
	return count_ * (depth_ - first_used) - cost + extra_savings;
	}

	template <typename T, typename... Args, template <typename...> class Queue>
	T PopRealTop(Queue<T, Args...>& queue) {
	auto pair = queue.top();
	queue.pop();
	// Keep updating values until one sticks.
	while (pair.second->Savings() != pair.first) {
	pair.first = pair.second->Savings();
	queue.push(pair);
	pair = queue.top();
	queue.pop();
	}
	return pair;
	}

	std::vector<std::string> ExtractPrefixes(size_t max) {
	std::vector<std::string> ret;
	// Make priority queue and adaptively update it. Each node value is the savings from picking
	// it. Insert all of the interesting nodes in the queue (children != 1).
	std::priority_queue<std::pair<int32_t, MatchTrie*>> queue;
	// Add all of the nodes to the queue.
	std::vector<MatchTrie*> work(1, this);
	while (!work.empty()) {
	MatchTrie* elem = work.back();
	work.pop_back();
	size_t num_childs = 0u;
	for (const std::unique_ptr<MatchTrie>& child : elem->nodes_) {
	if (child != nullptr) {
	work.push_back(child.get());
	++num_childs;
	}
	}
	if (num_childs > 1u \|\| elem->value_ != 0u) {
	queue.emplace(elem->Savings(), elem);
	}
	}
	std::priority_queue<std::pair<int32_t, MatchTrie*>> prefixes;
	// The savings can only ever go down for a given node, never up.
	while (max != 0u && !queue.empty()) {
	std::pair<int32_t, MatchTrie*> pair = PopRealTop(queue);
	if (pair.second != this && pair.first > 0) {
	// Pick this node.
	uint32_t count = pair.second->count_;
	pair.second->chosen_ = true;
	for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
	if (cur->chosen_) {
	break;
	}
	cur->count_ -= count;
	}
	for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
	++cur->chosen_suffix_count_;
	}
	prefixes.emplace(pair.first, pair.second);
	--max;
	} else {
	// Negative or no EV, just delete the node.
	}
	}
	while (!prefixes.empty()) {
	std::pair<int32_t, MatchTrie*> pair = PopRealTop(prefixes);
	if (pair.first <= 0) {
	continue;
	}
	ret.push_back(pair.second->GetString());
	}
	return ret;
	}

	std::string GetString() const {
	std::vector<uint8_t> chars;
	for (const MatchTrie* cur = this; cur->parent_ != nullptr; cur = cur->parent_) {
	chars.push_back(cur->incoming_);
	}
	return std::string(chars.rbegin(), chars.rend());
	}

	std::unique_ptr<MatchTrie> nodes_[256];
	MatchTrie* parent_ = nullptr;
	uint32_t count_ = 0u;
	uint32_t depth_ = 0u;
	int32_t savings_ = 0u;
	uint8_t incoming_ = 0u;
	// Value of the current node, non zero if the node is chosen.
	uint32_t value_ = 0u;
	// If the current node is chosen to be a used prefix.
	bool chosen_ = false;
	// If the current node is a prefix of a longer chosen prefix.
	uint32_t chosen_suffix_count_ = 0u;
	};

	void PrefixStrings::Builder::Build(const std::vector<std::string>& strings) {
	std::unique_ptr<MatchTrie> prefixe_trie(new MatchTrie());
	for (size_t i = 0; i < strings.size(); ++i) {
	size_t len = 0u;
	if (i > 0u) {
	CHECK_GT(strings[i], strings[i - 1]);
	len = std::max(len, PrefixLen(strings[i], strings[i - 1]));
	}
	if (i < strings.size() - 1) {
	len = std::max(len, PrefixLen(strings[i], strings[i + 1]));
	}
	len = std::min(len, kMaxPrefixLen);
	if (len >= kMinPrefixLen) {
	prefixe_trie->Add(strings[i].substr(0, len))->value_ = 1u;
	}
	}

	// Build prefixes.
	{
	static constexpr size_t kPrefixBits = 15;
	std::vector<std::string> prefixes(prefixe_trie->ExtractPrefixes(1 << kPrefixBits));
	// Add longest prefixes first so that subprefixes can share data.
	std::sort(prefixes.begin(), prefixes.end(), [](const std::string& a, const std::string& b) {
	return a.length() > b.length();
	});
	prefixe_trie.reset();
	prefixe_trie.reset(new MatchTrie());
	uint32_t prefix_idx = 0u;
	CHECK_EQ(output_->dictionary_.AddOffset(0u, 0u), prefix_idx++);
	for (const std::string& str : prefixes) {
	uint32_t prefix_offset = 0u;
	MatchTrie* node = prefixe_trie->LongestPrefix(str);
	if (node != nullptr && node->depth_ == str.length() && node->value_ != 0u) {
	CHECK_EQ(node->GetString(), str);
	uint32_t existing_len = 0u;
	output_->dictionary_.GetOffset(node->value_, &prefix_offset, &existing_len);
	// Make sure to register the current node.
	prefixe_trie->Add(str)->value_ = prefix_idx;
	} else {
	auto add_str = [&](const std::string& s) {
	node = prefixe_trie->Add(s);
	node->value_ = prefix_idx;
	while (node != nullptr) {
	node->value_ = prefix_idx;
	node = node->parent_;
	}
	};
	static constexpr size_t kNumSubstrings = 1u;
	// Increasing kNumSubstrings provides savings since it enables common substrings and not
	// only prefixes to share data. The problem is that it's slow.
	for (size_t i = 0; i < std::min(str.length(), kNumSubstrings); ++i) {
	add_str(str.substr(i));
	}
	prefix_offset = output_->dictionary_.AddPrefixData(
	reinterpret_cast<const uint8_t*>(&str[0]),
	str.length());
	}
	// TODO: Validiate the prefix offset.
	CHECK_EQ(output_->dictionary_.AddOffset(prefix_offset, str.length()), prefix_idx);
	++prefix_idx;
	}
	}

	// Add strings to the dictionary.
	for (const std::string& str : strings) {
	MatchTrie* node = prefixe_trie->LongestPrefix(str);
	uint32_t prefix_idx = 0u;
	uint32_t best_length = 0u;
	while (node != nullptr) {
	uint32_t offset = 0u;
	uint32_t length = 0u;
	output_->dictionary_.GetOffset(node->value_, &offset, &length);
	if (node->depth_ == length) {
	prefix_idx = node->value_;
	best_length = node->depth_;
	break;
	// Actually the prefix we want.
	}
	node = node->parent_;
	}
	output_->AddString(prefix_idx, str.substr(best_length));
	}
	}

	void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
	std::set<std::string> unique_strings;
	// Accumulate the strings.
	for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
	for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
	uint32_t length = 0;
	const char* data = dex_file->StringDataAndUtf16LengthByIdx(dex::StringIndex(i), &length);
	// Analyze if the string has any UTF16 chars.
	bool have_wide_char = false;
	const char* ptr = data;
	for (size_t j = 0; j < length; ++j) {
	have_wide_char = have_wide_char \|\| GetUtf16FromUtf8(&ptr) >= 0x100;
	}
	if (have_wide_char) {
	wide_string_bytes_ += 2 * length;
	} else {
	ascii_string_bytes_ += length;
	}
	string_data_bytes_ += ptr - data;
	unique_strings.insert(data);
	}
	}
	// Unique strings only since we want to exclude savings from multi-dex duplication.
	ProcessStrings(std::vector<std::string>(unique_strings.begin(), unique_strings.end()));
	}

	void AnalyzeStrings::ProcessStrings(const std::vector<std::string>& strings) {
	// Calculate total shared prefix.
	size_t prefix_index_cost_ = 0u;
	for (size_t i = 0; i < strings.size(); ++i) {
	size_t best_len = 0;
	if (i > 0) {
	best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
	}
	if (i < strings.size() - 1) {
	best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
	}
	best_len = std::min(best_len, kMaxPrefixLen);
	if (best_len >= kMinPrefixLen) {
	total_shared_prefix_bytes_ += best_len;
	}
	prefix_index_cost_ += kPrefixIndexCost;
	if (strings[i].length() < 64) {
	++short_strings_;
	} else {
	++long_strings_;
	}
	}
	total_prefix_index_cost_ += prefix_index_cost_;

	PrefixStrings prefix_strings;
	{
	PrefixStrings::Builder prefix_builder(&prefix_strings);
	prefix_builder.Build(strings);
	}
	Benchmark(prefix_strings, strings, &prefix_timings_);
	const size_t num_prefixes = prefix_strings.dictionary_.offsets_.size();
	total_num_prefixes_ += num_prefixes;
	total_prefix_table_ += num_prefixes * sizeof(prefix_strings.dictionary_.offsets_[0]);
	total_prefix_dict_ += prefix_strings.dictionary_.prefix_data_.size();

	{
	NormalStrings normal_strings;
	for (const std::string& s : strings) {
	normal_strings.AddString(s);
	}
	const uint64_t unique_string_data_bytes = normal_strings.chars_.size();
	total_unique_string_data_bytes_ += unique_string_data_bytes;
	total_prefix_savings_ += unique_string_data_bytes - prefix_strings.chars_.size() +
	prefix_index_cost_;
	Benchmark(normal_strings, strings, &normal_timings_);
	}
	}

	template <typename Strings>
	void AnalyzeStrings::Benchmark(const Strings& strings,
	const std::vector<std::string>& reference,
	StringTimings* timings) {
	const size_t kIterations = 100;
	timings->num_comparisons_ += reference.size() * kIterations;

	uint64_t start = NanoTime();
	for (size_t j = 0; j < kIterations; ++j) {
	for (size_t i = 0; i < reference.size(); ++i) {
	CHECK(strings.Equal(
	i,
	reinterpret_cast<const uint8_t*>(&reference[i][0]),
	reference[i].length()))
	<< i << ": " << strings.GetString(i) << " vs " << reference[i];
	}
	}
	timings->time_equal_comparisons_ += NanoTime() - start;

	start = NanoTime();
	for (size_t j = 0; j < kIterations; ++j) {
	size_t count = 0u;
	for (size_t i = 0; i < reference.size(); ++i) {
	count += strings.Equal(
	reference.size() - 1 - i,
	reinterpret_cast<const uint8_t*>(&reference[i][0]),
	reference[i].length());
	}
	CHECK_LT(count, 2u);
	}
	timings->time_non_equal_comparisons_ += NanoTime() - start;
	}

	template void AnalyzeStrings::Benchmark(const PrefixStrings&,
	const std::vector<std::string>&,
	StringTimings* timings);
	template void AnalyzeStrings::Benchmark(const NormalStrings&,
	const std::vector<std::string>&,
	StringTimings* timings);

	void StringTimings::Dump(std::ostream& os) const {
	const double comparisons = static_cast<double>(num_comparisons_);
	os << "Compare equal " << static_cast<double>(time_equal_comparisons_) / comparisons << "\n";
	os << "Compare not equal " << static_cast<double>(time_non_equal_comparisons_) / comparisons << "\n";
	}

	void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
	os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
	os << "Total unique string data bytes "
	<< Percent(total_unique_string_data_bytes_, total_size) << "\n";
	os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
	os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";

	os << "Prefix string timings\n";
	prefix_timings_.Dump(os);
	os << "Normal string timings\n";
	normal_timings_.Dump(os);

	// Prefix based strings.
	os << "Total shared prefix bytes " << Percent(total_shared_prefix_bytes_, total_size) << "\n";
	os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
	os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
	os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
	int64_t net_savings = total_prefix_savings_;
	net_savings -= total_prefix_dict_;
	net_savings -= total_prefix_table_;
	net_savings -= total_prefix_index_cost_;
	os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
	os << "Prefix base savings " << Percent(total_prefix_savings_, total_size) << "\n";
	os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
	os << "Strings using prefix "
	<< Percent(strings_used_prefixed_, total_prefix_index_cost_ / kPrefixIndexCost) << "\n";
	os << "Short strings " << Percent(short_strings_, short_strings_ + long_strings_) << "\n";
	if (verbose_level_ >= VerboseLevel::kEverything) {
	std::vector<std::pair<std::string, size_t>> pairs; // (prefixes_.begin(), prefixes_.end());
	// Sort lexicographically.
	std::sort(pairs.begin(), pairs.end());
	for (const auto& pair : pairs) {
	os << pair.first << " : " << pair.second << "\n";
	}
	}
	}

	} // namespace dexanalyze
	} // namespace art