Add command line tool for creating mini-debug-info for native code.

Mini-debug-info for native code is currently created with bash script,
which works, but misses some optimizations that the ART compiler does.

This CL adds tool based on ART code-base, with the following features:
 * CIE entries are deduplicated to save space (usually one is needed).
 * FDE entries are sorted, which significantly improves compression.
 * Non-function and zero-sized function symbols are excluded.
 * Symbols are sorted by address to allow binary search.
 * Compressed data is split to blocks to allow random-access reads.

The space optimizations and the better random access balances,
so the overall generated file size remains approximately same.

Bug: 110133331
Test: manually check the generated data using readelf
Change-Id: I4ed8deaee647d5ee4dfb0846f316e888f060b98e
diff --git a/libelffile/elf/elf_builder.h b/libelffile/elf/elf_builder.h
index a76bf92..10541ec 100644
--- a/libelffile/elf/elf_builder.h
+++ b/libelffile/elf/elf_builder.h
@@ -310,7 +310,7 @@
       last_offset_ = 0;
     }
 
-    Elf_Word Write(const std::string& name) {
+    Elf_Word Write(std::string_view name) {
       if (current_offset_ == 0) {
         DCHECK(name.empty());
       } else if (name == last_name_) {
@@ -318,7 +318,9 @@
       }
       last_name_ = name;
       last_offset_ = current_offset_;
-      this->WriteFully(name.c_str(), name.length() + 1);
+      this->WriteFully(name.data(), name.length());
+      char null_terminator = '\0';
+      this->WriteFully(&null_terminator, sizeof(null_terminator));
       current_offset_ += name.length() + 1;
       return last_offset_;
     }
@@ -798,6 +800,21 @@
      return stream_.Seek(RoundUp(stream_.Seek(0, kSeekCurrent), alignment), kSeekSet);
   }
 
+  static InstructionSet GetIsaFromHeader(const Elf_Ehdr& header) {
+    switch (header.e_machine) {
+      case EM_ARM:
+        return InstructionSet::kThumb2;
+      case EM_AARCH64:
+        return InstructionSet::kArm64;
+      case EM_386:
+        return InstructionSet::kX86;
+      case EM_X86_64:
+        return InstructionSet::kX86_64;
+    }
+    LOG(FATAL) << "Unknown architecture: " << header.e_machine;
+    UNREACHABLE();
+  }
+
  private:
   static Elf_Ehdr MakeElfHeader(InstructionSet isa) {
     Elf_Ehdr elf_header = Elf_Ehdr();
@@ -832,6 +849,7 @@
         LOG(FATAL) << "Unknown instruction set " << isa;
       }
     }
+    DCHECK_EQ(GetIsaFromHeader(elf_header), isa);
 
     elf_header.e_ident[EI_MAG0]       = ELFMAG0;
     elf_header.e_ident[EI_MAG1]       = ELFMAG1;
diff --git a/libelffile/elf/elf_debug_reader.h b/libelffile/elf/elf_debug_reader.h
index 2b03037..266c638 100644
--- a/libelffile/elf/elf_debug_reader.h
+++ b/libelffile/elf/elf_debug_reader.h
@@ -36,6 +36,7 @@
  public:
   // Note that the input buffer might be misaligned.
   typedef typename ElfTypes::Ehdr ALIGNED(1) Elf_Ehdr;
+  typedef typename ElfTypes::Phdr ALIGNED(1) Elf_Phdr;
   typedef typename ElfTypes::Shdr ALIGNED(1) Elf_Shdr;
   typedef typename ElfTypes::Sym ALIGNED(1) Elf_Sym;
   typedef typename ElfTypes::Addr ALIGNED(1) Elf_Addr;
@@ -65,10 +66,11 @@
     CHECK_EQ(header_->e_ident[1], ELFMAG1);
     CHECK_EQ(header_->e_ident[2], ELFMAG2);
     CHECK_EQ(header_->e_ident[3], ELFMAG3);
+    CHECK_EQ(header_->e_ident[4], sizeof(Elf_Addr) / sizeof(uint32_t));
     CHECK_EQ(header_->e_ehsize, sizeof(Elf_Ehdr));
-    CHECK_EQ(header_->e_shentsize, sizeof(Elf_Shdr));
 
     // Find all ELF sections.
+    CHECK_EQ(header_->e_shentsize, sizeof(Elf_Shdr));
     sections_ = Read<Elf_Shdr>(header_->e_shoff, header_->e_shnum);
     for (const Elf_Shdr& section : sections_) {
       const char* name = Read<char>(sections_[header_->e_shstrndx].sh_offset + section.sh_name);
@@ -84,16 +86,38 @@
     }
   }
 
-  explicit ElfDebugReader(std::vector<uint8_t>& file)
+  explicit ElfDebugReader(const std::vector<uint8_t>& file)
       : ElfDebugReader(ArrayRef<const uint8_t>(file)) {
   }
 
+  // Check that ELF signature is present at the start of the files,
+  // and that the ELF bitness matches the ElfTypes template arguments.
+  static bool IsValidElfHeader(const std::vector<uint8_t>& data) {
+    static constexpr bool kIs64Bit = sizeof(Elf_Addr) == sizeof(uint64_t);
+    static constexpr char kMagic[] = { 0x7f, 'E', 'L', 'F', kIs64Bit ? 2 : 1 };
+    return data.size() >= sizeof(kMagic) && memcmp(data.data(), kMagic, sizeof(kMagic)) == 0;
+  }
+
   const Elf_Ehdr* GetHeader() { return header_; }
 
   ArrayRef<Elf_Shdr> GetSections() { return sections_; }
 
   const Elf_Shdr* GetSection(const char* name) { return section_map_[name]; }
 
+  // Find the base address where the ELF file wants to be loaded.
+  // This is generally zero (therefore always requiring relocation).
+  Elf_Addr GetLoadAddress() {
+    std::optional<Elf_Addr> addr;
+    CHECK_EQ(header_->e_phentsize, sizeof(Elf_Phdr));
+    for (const Elf_Phdr& phdr : Read<Elf_Phdr>(header_->e_phoff, header_->e_phnum)) {
+      if (phdr.p_type == PT_LOAD) {
+        addr = addr.has_value() ? std::min(addr.value(), phdr.p_vaddr) : phdr.p_vaddr;
+      }
+    }
+    CHECK(addr.has_value());
+    return addr.value();
+  }
+
   template <typename VisitSym>
   void VisitFunctionSymbols(VisitSym visit_sym) {
     const Elf_Shdr* symtab = GetSection(".symtab");
diff --git a/libelffile/elf/xz_utils.cc b/libelffile/elf/xz_utils.cc
index 87c9a7b..f064cb0 100644
--- a/libelffile/elf/xz_utils.cc
+++ b/libelffile/elf/xz_utils.cc
@@ -32,8 +32,6 @@
 
 namespace art {
 
-constexpr size_t kChunkSize = 16 * KB;
-
 static void XzInitCrc() {
   static std::once_flag crc_initialized;
   std::call_once(crc_initialized, []() {
@@ -42,14 +40,17 @@
   });
 }
 
-void XzCompress(ArrayRef<const uint8_t> src, std::vector<uint8_t>* dst, int level) {
+void XzCompress(ArrayRef<const uint8_t> src,
+                std::vector<uint8_t>* dst,
+                int level,
+                size_t block_size) {
   // Configure the compression library.
   XzInitCrc();
   CLzma2EncProps lzma2Props;
   Lzma2EncProps_Init(&lzma2Props);
   lzma2Props.lzmaProps.level = level;
   lzma2Props.lzmaProps.reduceSize = src.size();  // Size of data that will be compressed.
-  lzma2Props.blockSize = kChunkSize;
+  lzma2Props.blockSize = block_size;
   Lzma2EncProps_Normalize(&lzma2Props);
   CXzProps props;
   XzProps_Init(&props);
diff --git a/libelffile/elf/xz_utils.h b/libelffile/elf/xz_utils.h
index df5cb56..b1903ff 100644
--- a/libelffile/elf/xz_utils.h
+++ b/libelffile/elf/xz_utils.h
@@ -20,10 +20,17 @@
 #include <vector>
 
 #include "base/array_ref.h"
+#include "base/bit_utils.h"
 
 namespace art {
 
-void XzCompress(ArrayRef<const uint8_t> src, std::vector<uint8_t>* dst, int level = 1 /* speed */);
+constexpr size_t kXzDefaultBlockSize = 16 * KB;
+
+void XzCompress(ArrayRef<const uint8_t> src,
+                std::vector<uint8_t>* dst,
+                int level = 1 /* speed */,
+                size_t block_size = kXzDefaultBlockSize);
+
 void XzDecompress(ArrayRef<const uint8_t> src, std::vector<uint8_t>* dst);
 
 }  // namespace art
diff --git a/tools/create_minidebuginfo/Android.bp b/tools/create_minidebuginfo/Android.bp
new file mode 100644
index 0000000..a9d6b8d
--- /dev/null
+++ b/tools/create_minidebuginfo/Android.bp
@@ -0,0 +1,35 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+art_cc_binary {
+    name: "create_minidebuginfo",
+    defaults: [
+        "art_debug_defaults",
+        "art_defaults",
+    ],
+    host_supported: true,
+    device_supported: false,
+    srcs: [
+        "create_minidebuginfo.cc",
+    ],
+    static_libs: [
+        "libartbase",
+        "libbase",
+        "libelffile",
+        "liblzma",
+        "liblog",
+    ],
+}
diff --git a/tools/create_minidebuginfo/create_minidebuginfo.cc b/tools/create_minidebuginfo/create_minidebuginfo.cc
new file mode 100644
index 0000000..aad36a8
--- /dev/null
+++ b/tools/create_minidebuginfo/create_minidebuginfo.cc
@@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "android-base/logging.h"
+
+#include "base/os.h"
+#include "base/unix_file/fd_file.h"
+#include "elf/elf_builder.h"
+#include "elf/elf_debug_reader.h"
+#include "elf/xz_utils.h"
+#include "stream/file_output_stream.h"
+#include "stream/vector_output_stream.h"
+
+#include <algorithm>
+#include <deque>
+#include <map>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace art {
+
+static constexpr size_t kBlockSize = 32 * KB;
+
+template<typename ElfTypes>
+static void WriteMinidebugInfo(const std::vector<uint8_t>& input, std::vector<uint8_t>* output) {
+  using Elf_Addr = typename ElfTypes::Addr;
+  using Elf_Ehdr = typename ElfTypes::Ehdr;
+  using Elf_Shdr = typename ElfTypes::Shdr;
+  using Elf_Sym = typename ElfTypes::Sym;
+  using Elf_Word = typename ElfTypes::Word;
+  using CIE = typename ElfDebugReader<ElfTypes>::CIE;
+  using FDE = typename ElfDebugReader<ElfTypes>::FDE;
+
+  ElfDebugReader<ElfTypes> reader(input);
+
+  std::vector<uint8_t> output_elf_data;
+  VectorOutputStream output_stream("Output ELF", &output_elf_data);
+  InstructionSet isa = ElfBuilder<ElfTypes>::GetIsaFromHeader(*reader.GetHeader());
+  std::unique_ptr<ElfBuilder<ElfTypes>> builder(new ElfBuilder<ElfTypes>(isa, &output_stream));
+  builder->Start(/*write_program_headers=*/ false);
+
+  auto* rodata = builder->GetRoData();
+  auto* text = builder->GetText();
+  const Elf_Shdr* original_text = reader.GetSection(".text");
+  CHECK(original_text != nullptr);
+  CHECK_EQ(reader.GetLoadAddress(), 0u);
+  rodata->AllocateVirtualMemory(original_text->sh_addr - sizeof(Elf_Ehdr));
+  text->AllocateVirtualMemory(original_text->sh_addr, original_text->sh_size);
+
+  auto* strtab = builder->GetStrTab();
+  auto* symtab = builder->GetSymTab();
+  strtab->Start();
+  {
+    strtab->Write("");  // strtab should start with empty string.
+    std::multimap<std::string_view, Elf_Sym> syms;
+    reader.VisitFunctionSymbols([&](Elf_Sym sym, const char* name) {
+      // Exclude non-function or empty symbols.
+      if (ELF32_ST_TYPE(sym.st_info) == STT_FUNC && sym.st_size != 0) {
+        syms.emplace(name, sym);
+      }
+    });
+    reader.VisitDynamicSymbols([&](Elf_Sym sym, const char* name) {
+      // Exclude symbols which will be preserved in the dynamic table anyway.
+      auto it = syms.find(name);
+      if (it != syms.end() && it->second.st_value == sym.st_value) {
+        syms.erase(it);
+      }
+    });
+    for (auto& entry : syms) {
+      std::string_view name = entry.first;
+      const Elf_Sym& sym = entry.second;
+      Elf_Word name_idx = strtab->Write(name);
+      symtab->Add(name_idx, text, sym.st_value, sym.st_size, STB_GLOBAL, STT_FUNC);
+    }
+  }
+  strtab->End();
+  symtab->WriteCachedSection();
+
+  auto* debug_frame = builder->GetDebugFrame();
+  debug_frame->Start();
+  {
+    std::map<std::basic_string_view<uint8_t>, Elf_Addr> cie_dedup;
+    std::unordered_map<const CIE*, Elf_Addr> new_cie_offset;
+    std::deque<std::pair<const FDE*, const CIE*>> entries;
+    // Read, de-duplicate and write CIE entries.  Read FDE entries.
+    reader.VisitDebugFrame(
+        [&](const CIE* cie) {
+          std::basic_string_view<uint8_t> key(cie->data(), cie->size());
+          auto it = cie_dedup.emplace(key, debug_frame->GetPosition());
+          if (/* inserted */ it.second) {
+            debug_frame->WriteFully(cie->data(), cie->size());
+          }
+          new_cie_offset[cie] = it.first->second;
+        },
+        [&](const FDE* fde, const CIE* cie) {
+          entries.emplace_back(std::make_pair(fde, cie));
+        });
+    // Sort FDE entries by opcodes to improve locality for compression (saves ~25%).
+    std::stable_sort(entries.begin(), entries.end(), [](const auto& lhs, const auto& rhs) {
+      constexpr size_t opcode_offset = sizeof(FDE);
+      return std::lexicographical_compare(
+          lhs.first->data() + opcode_offset, lhs.first->data() + lhs.first->size(),
+          rhs.first->data() + opcode_offset, rhs.first->data() + rhs.first->size());
+    });
+    // Write all FDE entries while adjusting the CIE offsets to the new locations.
+    for (const auto& entry : entries) {
+      const FDE* fde = entry.first;
+      const CIE* cie = entry.second;
+      FDE new_header = *fde;
+      new_header.cie_pointer = new_cie_offset[cie];
+      debug_frame->WriteFully(&new_header, sizeof(FDE));
+      debug_frame->WriteFully(fde->data() + sizeof(FDE), fde->size() - sizeof(FDE));
+    }
+  }
+  debug_frame->End();
+
+  builder->End();
+  CHECK(builder->Good());
+
+  XzCompress(ArrayRef<const uint8_t>(output_elf_data), output, 9 /*size*/, kBlockSize);
+}
+
+static int Main(int argc, char** argv) {
+  // Check command like arguments.
+  if (argc != 3) {
+    printf("Usage: create_minidebuginfo ELF_FILE OUT_FILE\n");
+    printf("  ELF_FILE: The path to an ELF file with full symbols (before being stripped).\n");
+    printf("  OUT_FILE: The path for the generated mini-debug-info data (not an elf file).\n");
+    return 1;
+  }
+  const char* input_filename = argv[1];
+  const char* output_filename = argv[2];
+
+  // Read input file.
+  std::unique_ptr<File> input_file(OS::OpenFileForReading(input_filename));
+  CHECK(input_file.get() != nullptr) << "Failed to open input file";
+  std::vector<uint8_t> elf(input_file->GetLength());
+  CHECK(input_file->ReadFully(elf.data(), elf.size())) << "Failed to read input file";
+
+  // Write output file.
+  std::vector<uint8_t> output;
+  if (ElfDebugReader<ElfTypes32>::IsValidElfHeader(elf)) {
+    WriteMinidebugInfo<ElfTypes32>(elf, &output);
+  } else if (ElfDebugReader<ElfTypes64>::IsValidElfHeader(elf)) {
+    WriteMinidebugInfo<ElfTypes64>(elf, &output);
+  } else {
+    LOG(FATAL) << "Invalid ELF file header " << input_filename;
+  }
+  std::unique_ptr<File> output_file(OS::CreateEmptyFile(output_filename));
+  if (!output_file->WriteFully(output.data(), output.size()) || output_file->FlushClose() != 0) {
+    LOG(FATAL) << "Failed to write " << output_filename;
+  }
+  return 0;
+}
+
+}  // namespace art
+
+int main(int argc, char** argv) {
+  return art::Main(argc, argv);
+}