Diffstat (limited to 'compiler')
-rw-r--r--   compiler/debug/elf_debug_frame_writer.h       |   7
-rw-r--r--   compiler/debug/elf_debug_info_writer.h        |   4
-rw-r--r--   compiler/debug/elf_debug_line_writer.h        |   4
-rw-r--r--   compiler/debug/elf_debug_loc_writer.h         |   5
-rw-r--r--   compiler/debug/elf_debug_writer.cc            |  40
-rw-r--r--   compiler/debug/elf_debug_writer.h             |   7
-rw-r--r--   compiler/debug/elf_gnu_debugdata_writer.h     |  10
-rw-r--r--   compiler/debug/elf_symtab_writer.h            |   5
-rw-r--r--   compiler/linker/elf_builder.h                 | 224
-rw-r--r--   compiler/optimizing/code_generator_x86.cc     |  67
-rw-r--r--   compiler/optimizing/code_generator_x86.h      |   6
-rw-r--r--   compiler/optimizing/code_generator_x86_64.cc  |  92
-rw-r--r--   compiler/optimizing/code_generator_x86_64.h   |   5
-rw-r--r--   compiler/optimizing/nodes_vector.h            |  10
-rw-r--r--   compiler/optimizing/optimizing_compiler.cc    |  18
-rw-r--r--   compiler/optimizing/scheduler.h               |   5
-rw-r--r--   compiler/optimizing/scheduler_arm64.h         |  14
17 files changed, 299 insertions, 224 deletions
diff --git a/compiler/debug/elf_debug_frame_writer.h b/compiler/debug/elf_debug_frame_writer.h index d0c98a7b79..27b70c8caa 100644 --- a/compiler/debug/elf_debug_frame_writer.h +++ b/compiler/debug/elf_debug_frame_writer.h @@ -207,13 +207,12 @@ void WriteCFISection(linker::ElfBuilder<ElfTypes>* builder, } // Write .eh_frame/.debug_frame section. - auto* cfi_section = (format == dwarf::DW_DEBUG_FRAME_FORMAT - ? builder->GetDebugFrame() - : builder->GetEhFrame()); + const bool is_debug_frame = format == dwarf::DW_DEBUG_FRAME_FORMAT; + auto* cfi_section = (is_debug_frame ? builder->GetDebugFrame() : builder->GetEhFrame()); { cfi_section->Start(); const bool is64bit = Is64BitInstructionSet(builder->GetIsa()); - const Elf_Addr cfi_address = cfi_section->GetAddress(); + const Elf_Addr cfi_address = (is_debug_frame ? 0 : cfi_section->GetAddress()); const Elf_Addr cie_address = cfi_address; Elf_Addr buffer_address = cfi_address; std::vector<uint8_t> buffer; // Small temporary buffer. diff --git a/compiler/debug/elf_debug_info_writer.h b/compiler/debug/elf_debug_info_writer.h index 81a0a69bfa..107ed488cd 100644 --- a/compiler/debug/elf_debug_info_writer.h +++ b/compiler/debug/elf_debug_info_writer.h @@ -298,7 +298,7 @@ class ElfCompilationUnitWriter { CHECK_EQ(info_.Depth(), 0); std::vector<uint8_t> buffer; buffer.reserve(info_.data()->size() + KB); - const size_t offset = owner_->builder_->GetDebugInfo()->GetSize(); + const size_t offset = owner_->builder_->GetDebugInfo()->GetPosition(); // All compilation units share single table which is at the start of .debug_abbrev. const size_t debug_abbrev_offset = 0; WriteDebugInfoCU(debug_abbrev_offset, info_, offset, &buffer, &owner_->debug_info_patches_); @@ -463,7 +463,7 @@ class ElfCompilationUnitWriter { CHECK_EQ(info_.Depth(), 0); std::vector<uint8_t> buffer; buffer.reserve(info_.data()->size() + KB); - const size_t offset = owner_->builder_->GetDebugInfo()->GetSize(); + const size_t offset = owner_->builder_->GetDebugInfo()->GetPosition(); // All compilation units share single table which is at the start of .debug_abbrev. const size_t debug_abbrev_offset = 0; WriteDebugInfoCU(debug_abbrev_offset, info_, offset, &buffer, &owner_->debug_info_patches_); diff --git a/compiler/debug/elf_debug_line_writer.h b/compiler/debug/elf_debug_line_writer.h index c7224fc94a..d7fd52448c 100644 --- a/compiler/debug/elf_debug_line_writer.h +++ b/compiler/debug/elf_debug_line_writer.h @@ -60,7 +60,7 @@ class ElfDebugLineWriter { ? 
builder_->GetText()->GetAddress() : 0; - compilation_unit.debug_line_offset = builder_->GetDebugLine()->GetSize(); + compilation_unit.debug_line_offset = builder_->GetDebugLine()->GetPosition(); std::vector<dwarf::FileEntry> files; std::unordered_map<std::string, size_t> files_map; @@ -268,7 +268,7 @@ class ElfDebugLineWriter { } std::vector<uint8_t> buffer; buffer.reserve(opcodes.data()->size() + KB); - size_t offset = builder_->GetDebugLine()->GetSize(); + size_t offset = builder_->GetDebugLine()->GetPosition(); WriteDebugLineTable(directories, files, opcodes, offset, &buffer, &debug_line_patches_); builder_->GetDebugLine()->WriteFully(buffer.data(), buffer.size()); return buffer.size(); diff --git a/compiler/debug/elf_debug_loc_writer.h b/compiler/debug/elf_debug_loc_writer.h index bb856b29f4..1d609af4e6 100644 --- a/compiler/debug/elf_debug_loc_writer.h +++ b/compiler/debug/elf_debug_loc_writer.h @@ -251,7 +251,10 @@ static void WriteDebugLocEntry(const MethodDebugInfo* method_info, // kInStackLargeOffset and kConstantLargeValue are hidden by GetKind(). // kInRegisterHigh and kInFpuRegisterHigh should be handled by // the special cases above and they should not occur alone. - LOG(ERROR) << "Unexpected register location kind: " << kind; + LOG(WARNING) << "Unexpected register location: " << kind + << " (This can indicate either a bug in the dexer when generating" + << " local variable information, or a bug in ART compiler." + << " Please file a bug at go/art-bug)"; break; } if (is64bitValue) { diff --git a/compiler/debug/elf_debug_writer.cc b/compiler/debug/elf_debug_writer.cc index 33c46d7e1f..a6267292bf 100644 --- a/compiler/debug/elf_debug_writer.cc +++ b/compiler/debug/elf_debug_writer.cc @@ -108,29 +108,32 @@ void WriteDebugInfo(linker::ElfBuilder<ElfTypes>* builder, std::vector<uint8_t> MakeMiniDebugInfo( InstructionSet isa, const InstructionSetFeatures* features, - size_t rodata_size, + uint64_t text_address, size_t text_size, const ArrayRef<const MethodDebugInfo>& method_infos) { if (Is64BitInstructionSet(isa)) { return MakeMiniDebugInfoInternal<ElfTypes64>(isa, features, - rodata_size, + text_address, text_size, method_infos); } else { return MakeMiniDebugInfoInternal<ElfTypes32>(isa, features, - rodata_size, + text_address, text_size, method_infos); } } template <typename ElfTypes> -static std::vector<uint8_t> WriteDebugElfFileForMethodsInternal( +static std::vector<uint8_t> MakeElfFileForJITInternal( InstructionSet isa, const InstructionSetFeatures* features, - const ArrayRef<const MethodDebugInfo>& method_infos) { + bool mini_debug_info, + const MethodDebugInfo& mi) { + CHECK_EQ(mi.is_code_address_text_relative, false); + ArrayRef<const MethodDebugInfo> method_infos(&mi, 1); std::vector<uint8_t> buffer; buffer.reserve(KB); linker::VectorOutputStream out("Debug ELF file", &buffer); @@ -138,23 +141,34 @@ static std::vector<uint8_t> WriteDebugElfFileForMethodsInternal( new linker::ElfBuilder<ElfTypes>(isa, features, &out)); // No program headers since the ELF file is not linked and has no allocated sections. 
builder->Start(false /* write_program_headers */); - WriteDebugInfo(builder.get(), - method_infos, - dwarf::DW_DEBUG_FRAME_FORMAT, - false /* write_oat_patches */); + if (mini_debug_info) { + std::vector<uint8_t> mdi = MakeMiniDebugInfo(isa, + features, + mi.code_address, + mi.code_size, + method_infos); + builder->WriteSection(".gnu_debugdata", &mdi); + } else { + builder->GetText()->AllocateVirtualMemory(mi.code_address, mi.code_size); + WriteDebugInfo(builder.get(), + method_infos, + dwarf::DW_DEBUG_FRAME_FORMAT, + false /* write_oat_patches */); + } builder->End(); CHECK(builder->Good()); return buffer; } -std::vector<uint8_t> WriteDebugElfFileForMethods( +std::vector<uint8_t> MakeElfFileForJIT( InstructionSet isa, const InstructionSetFeatures* features, - const ArrayRef<const MethodDebugInfo>& method_infos) { + bool mini_debug_info, + const MethodDebugInfo& method_info) { if (Is64BitInstructionSet(isa)) { - return WriteDebugElfFileForMethodsInternal<ElfTypes64>(isa, features, method_infos); + return MakeElfFileForJITInternal<ElfTypes64>(isa, features, mini_debug_info, method_info); } else { - return WriteDebugElfFileForMethodsInternal<ElfTypes32>(isa, features, method_infos); + return MakeElfFileForJITInternal<ElfTypes32>(isa, features, mini_debug_info, method_info); } } diff --git a/compiler/debug/elf_debug_writer.h b/compiler/debug/elf_debug_writer.h index d24ca9b203..a47bf076b9 100644 --- a/compiler/debug/elf_debug_writer.h +++ b/compiler/debug/elf_debug_writer.h @@ -43,14 +43,15 @@ void WriteDebugInfo( std::vector<uint8_t> MakeMiniDebugInfo( InstructionSet isa, const InstructionSetFeatures* features, - size_t rodata_section_size, + uint64_t text_section_address, size_t text_section_size, const ArrayRef<const MethodDebugInfo>& method_infos); -std::vector<uint8_t> WriteDebugElfFileForMethods( +std::vector<uint8_t> MakeElfFileForJIT( InstructionSet isa, const InstructionSetFeatures* features, - const ArrayRef<const MethodDebugInfo>& method_infos); + bool mini_debug_info, + const MethodDebugInfo& method_info); std::vector<uint8_t> WriteDebugElfFileForClasses( InstructionSet isa, diff --git a/compiler/debug/elf_gnu_debugdata_writer.h b/compiler/debug/elf_gnu_debugdata_writer.h index 1cdf6b0ad1..78b8e2780c 100644 --- a/compiler/debug/elf_gnu_debugdata_writer.h +++ b/compiler/debug/elf_gnu_debugdata_writer.h @@ -80,7 +80,7 @@ template <typename ElfTypes> static std::vector<uint8_t> MakeMiniDebugInfoInternal( InstructionSet isa, const InstructionSetFeatures* features, - size_t rodata_section_size, + typename ElfTypes::Addr text_section_address, size_t text_section_size, const ArrayRef<const MethodDebugInfo>& method_infos) { std::vector<uint8_t> buffer; @@ -88,11 +88,9 @@ static std::vector<uint8_t> MakeMiniDebugInfoInternal( linker::VectorOutputStream out("Mini-debug-info ELF file", &buffer); std::unique_ptr<linker::ElfBuilder<ElfTypes>> builder( new linker::ElfBuilder<ElfTypes>(isa, features, &out)); - builder->Start(); - // Mirror .rodata and .text as NOBITS sections. - // It is needed to detected relocations after compression. - builder->GetRoData()->WriteNoBitsSection(rodata_section_size); - builder->GetText()->WriteNoBitsSection(text_section_size); + builder->Start(false /* write_program_headers */); + // Mirror .text as NOBITS section since the added symbols will reference it. 
+ builder->GetText()->AllocateVirtualMemory(text_section_address, text_section_size); WriteDebugSymbols(builder.get(), method_infos, false /* with_signature */); WriteCFISection(builder.get(), method_infos, diff --git a/compiler/debug/elf_symtab_writer.h b/compiler/debug/elf_symtab_writer.h index 0907e102a0..57e010f232 100644 --- a/compiler/debug/elf_symtab_writer.h +++ b/compiler/debug/elf_symtab_writer.h @@ -79,8 +79,9 @@ static void WriteDebugSymbols(linker::ElfBuilder<ElfTypes>* builder, last_name_offset = name_offset; } - const auto* text = info.is_code_address_text_relative ? builder->GetText() : nullptr; - uint64_t address = info.code_address + (text != nullptr ? text->GetAddress() : 0); + const auto* text = builder->GetText(); + uint64_t address = info.code_address; + address += info.is_code_address_text_relative ? text->GetAddress() : 0; // Add in code delta, e.g., thumb bit 0 for Thumb2 code. address += CompiledMethod::CodeDelta(info.isa); symtab->Add(name_offset, text, address, info.code_size, STB_GLOBAL, STT_FUNC); diff --git a/compiler/linker/elf_builder.h b/compiler/linker/elf_builder.h index b30b55e9b4..aa3cd98595 100644 --- a/compiler/linker/elf_builder.h +++ b/compiler/linker/elf_builder.h @@ -108,8 +108,6 @@ class ElfBuilder FINAL { section_index_(0), name_(name), link_(link), - started_(false), - finished_(false), phdr_flags_(PF_R), phdr_type_(0) { DCHECK_GE(align, 1u); @@ -120,90 +118,62 @@ class ElfBuilder FINAL { header_.sh_entsize = entsize; } - // Start writing of this section. - void Start() { - CHECK(!started_); - CHECK(!finished_); - started_ = true; - auto& sections = owner_->sections_; - // Check that the previous section is complete. - CHECK(sections.empty() || sections.back()->finished_); - // The first ELF section index is 1. Index 0 is reserved for NULL. - section_index_ = sections.size() + 1; - // Page-align if we switch between allocated and non-allocated sections, - // or if we change the type of allocation (e.g. executable vs non-executable). - if (!sections.empty()) { - if (header_.sh_flags != sections.back()->header_.sh_flags) { - header_.sh_addralign = kPageSize; - } - } - // Align file position. - if (header_.sh_type != SHT_NOBITS) { - header_.sh_offset = owner_->AlignFileOffset(header_.sh_addralign); - } else { - header_.sh_offset = 0; - } - // Align virtual memory address. - if ((header_.sh_flags & SHF_ALLOC) != 0) { - header_.sh_addr = owner_->AlignVirtualAddress(header_.sh_addralign); - } else { - header_.sh_addr = 0; - } - // Push this section on the list of written sections. - sections.push_back(this); + // Allocate chunk of virtual memory for this section from the owning ElfBuilder. + // This must be done at the start for all SHF_ALLOC sections (i.e. mmaped by linker). + // It is fine to allocate section but never call Start/End() (e.g. the .bss section). + void AllocateVirtualMemory(Elf_Word size) { + AllocateVirtualMemory(owner_->virtual_address_, size); } - // Finish writing of this section. - void End() { - CHECK(started_); - CHECK(!finished_); - finished_ = true; - if (header_.sh_type == SHT_NOBITS) { - CHECK_GT(header_.sh_size, 0u); - } else { - // Use the current file position to determine section size. 
- off_t file_offset = owner_->stream_.Seek(0, kSeekCurrent); - CHECK_GE(file_offset, (off_t)header_.sh_offset); - header_.sh_size = file_offset - header_.sh_offset; - } - if ((header_.sh_flags & SHF_ALLOC) != 0) { - owner_->virtual_address_ += header_.sh_size; - } + void AllocateVirtualMemory(Elf_Addr addr, Elf_Word size) { + CHECK_NE(header_.sh_flags & SHF_ALLOC, 0u); + Elf_Word align = AddSection(); + CHECK_EQ(header_.sh_addr, 0u); + header_.sh_addr = RoundUp(addr, align); + CHECK(header_.sh_size == 0u || header_.sh_size == size); + header_.sh_size = size; + CHECK_LE(owner_->virtual_address_, header_.sh_addr); + owner_->virtual_address_ = header_.sh_addr + header_.sh_size; } - // Get the location of this section in virtual memory. - Elf_Addr GetAddress() const { - CHECK(started_); - return header_.sh_addr; + // Start writing file data of this section. + void Start() { + CHECK(owner_->current_section_ == nullptr); + Elf_Word align = AddSection(); + CHECK_EQ(header_.sh_offset, 0u); + header_.sh_offset = owner_->AlignFileOffset(align); + owner_->current_section_ = this; } - // Returns the size of the content of this section. - Elf_Word GetSize() const { - if (finished_) { - return header_.sh_size; - } else { - CHECK(started_); - CHECK_NE(header_.sh_type, (Elf_Word)SHT_NOBITS); - return owner_->stream_.Seek(0, kSeekCurrent) - header_.sh_offset; - } + // Finish writing file data of this section. + void End() { + CHECK(owner_->current_section_ == this); + Elf_Word position = GetPosition(); + CHECK(header_.sh_size == 0u || header_.sh_size == position); + header_.sh_size = position; + owner_->current_section_ = nullptr; + } + + // Get the number of bytes written so far. + // Only valid while writing the section. + Elf_Word GetPosition() const { + CHECK(owner_->current_section_ == this); + off_t file_offset = owner_->stream_.Seek(0, kSeekCurrent); + DCHECK_GE(file_offset, (off_t)header_.sh_offset); + return file_offset - header_.sh_offset; } - // Write this section as "NOBITS" section. (used for the .bss section) - // This means that the ELF file does not contain the initial data for this section - // and it will be zero-initialized when the ELF file is loaded in the running program. - void WriteNoBitsSection(Elf_Word size) { + // Get the location of this section in virtual memory. + Elf_Addr GetAddress() const { DCHECK_NE(header_.sh_flags & SHF_ALLOC, 0u); - header_.sh_type = SHT_NOBITS; - Start(); - header_.sh_size = size; - End(); + DCHECK_NE(header_.sh_addr, 0u); + return header_.sh_addr; } // This function always succeeds to simplify code. // Use builder's Good() to check the actual status. bool WriteFully(const void* buffer, size_t byte_count) OVERRIDE { - CHECK(started_); - CHECK(!finished_); + CHECK(owner_->current_section_ == this); return owner_->stream_.WriteFully(buffer, byte_count); } @@ -221,19 +191,32 @@ class ElfBuilder FINAL { } Elf_Word GetSectionIndex() const { - DCHECK(started_); DCHECK_NE(section_index_, 0u); return section_index_; } private: + // Add this section to the list of generated ELF sections (if not there already). + // It also ensures the alignment is sufficient to generate valid program headers, + // since that depends on the previous section. It returns the required alignment. + Elf_Word AddSection() { + if (section_index_ == 0) { + std::vector<Section*>& sections = owner_->sections_; + Elf_Word last = sections.empty() ? PF_R : sections.back()->phdr_flags_; + if (phdr_flags_ != last) { + header_.sh_addralign = kPageSize; // Page-align if R/W/X flags changed. 
+ } + sections.push_back(this); + section_index_ = sections.size(); // First ELF section has index 1. + } + return owner_->write_program_headers_ ? header_.sh_addralign : 1; + } + ElfBuilder<ElfTypes>* owner_; Elf_Shdr header_; Elf_Word section_index_; const std::string name_; const Section* const link_; - bool started_; - bool finished_; Elf_Word phdr_flags_; Elf_Word phdr_type_; @@ -370,7 +353,7 @@ class ElfBuilder FINAL { Elf_Word section_index; if (section != nullptr) { DCHECK_LE(section->GetAddress(), addr); - DCHECK_LE(addr, section->GetAddress() + section->GetSize()); + DCHECK_LE(addr, section->GetAddress() + section->header_.sh_size); section_index = section->GetSectionIndex(); } else { section_index = static_cast<Elf_Word>(SHN_ABS); @@ -479,6 +462,10 @@ class ElfBuilder FINAL { digest_start_(-1) { } + Elf_Word GetSize() { + return 16 + kBuildIdLen; + } + void Write() { // The size fields are 32-bit on both 32-bit and 64-bit systems, confirmed // with the 64-bit linker and libbfd code. The size of name and desc must @@ -490,6 +477,7 @@ class ElfBuilder FINAL { digest_start_ = this->Seek(0, kSeekCurrent); static_assert(kBuildIdLen % 4 == 0, "expecting a mutliple of 4 for build ID length"); this->WriteFully(std::string(kBuildIdLen, '\0').c_str(), kBuildIdLen); // desc. + DCHECK_EQ(this->GetPosition(), GetSize()); } off_t GetDigestStart() { @@ -530,6 +518,7 @@ class ElfBuilder FINAL { abiflags_(this, ".MIPS.abiflags", SHT_MIPS_ABIFLAGS, SHF_ALLOC, nullptr, 0, kPageSize, 0, isa, features), build_id_(this, ".note.gnu.build-id", SHT_NOTE, SHF_ALLOC, nullptr, 0, 4, 0), + current_section_(nullptr), started_(false), write_program_headers_(false), loaded_size_(0u), @@ -545,6 +534,7 @@ class ElfBuilder FINAL { ~ElfBuilder() {} InstructionSet GetIsa() { return isa_; } + BuildIdSection* GetBuildId() { return &build_id_; } Section* GetRoData() { return &rodata_; } Section* GetText() { return &text_; } Section* GetBss() { return &bss_; } @@ -622,6 +612,9 @@ class ElfBuilder FINAL { if (section->link_ != nullptr) { section->header_.sh_link = section->link_->GetSectionIndex(); } + if (section->header_.sh_offset == 0) { + section->header_.sh_type = SHT_NOBITS; + } } shstrtab_.End(); @@ -680,65 +673,57 @@ class ElfBuilder FINAL { soname = soname.substr(directory_separator_pos + 1); } - // Calculate addresses of .text, .bss and .dynstr. - DCHECK_EQ(rodata_.header_.sh_addralign, static_cast<Elf_Word>(kPageSize)); - DCHECK_EQ(text_.header_.sh_addralign, static_cast<Elf_Word>(kPageSize)); - DCHECK_EQ(bss_.header_.sh_addralign, static_cast<Elf_Word>(kPageSize)); - DCHECK_EQ(dynstr_.header_.sh_addralign, static_cast<Elf_Word>(kPageSize)); - Elf_Word rodata_address = rodata_.GetAddress(); - Elf_Word text_address = RoundUp(rodata_address + rodata_size, kPageSize); - Elf_Word bss_address = RoundUp(text_address + text_size, kPageSize); - Elf_Word abiflags_address = RoundUp(bss_address + bss_size, kPageSize); - Elf_Word abiflags_size = 0; + // Allocate all pre-dynamic sections. + rodata_.AllocateVirtualMemory(rodata_size); + text_.AllocateVirtualMemory(text_size); + if (bss_size != 0) { + bss_.AllocateVirtualMemory(bss_size); + } if (isa_ == InstructionSet::kMips || isa_ == InstructionSet::kMips64) { - abiflags_size = abiflags_.GetSize(); + abiflags_.AllocateVirtualMemory(abiflags_.GetSize()); } - Elf_Word dynstr_address = RoundUp(abiflags_address + abiflags_size, kPageSize); // Cache .dynstr, .dynsym and .hash data. dynstr_.Add(""); // dynstr should start with empty string. 
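The hand-computed RoundUp chain removed above is replaced by a running cursor maintained inside AllocateVirtualMemory. A minimal standalone model of that allocation scheme (illustrative only; not code from this patch, and it assumes power-of-two alignments):

    #include <cstdint>

    // Illustrative model of the running virtual-address cursor: each SHF_ALLOC section
    // is placed at the cursor rounded up to its alignment, and the cursor then advances
    // past the end of the section.
    uint64_t RoundUp(uint64_t value, uint64_t alignment) {
      return (value + alignment - 1) & ~(alignment - 1);  // Power-of-two alignment only.
    }

    uint64_t AllocateAddress(uint64_t* cursor, uint64_t size, uint64_t alignment) {
      uint64_t address = RoundUp(*cursor, alignment);
      *cursor = address + size;  // The next section starts after this one.
      return address;
    }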
- Elf_Word rodata_index = rodata_.GetSectionIndex(); Elf_Word oatdata = dynstr_.Add("oatdata"); - dynsym_.Add(oatdata, rodata_index, rodata_address, rodata_size, STB_GLOBAL, STT_OBJECT); + dynsym_.Add(oatdata, &rodata_, rodata_.GetAddress(), rodata_size, STB_GLOBAL, STT_OBJECT); if (text_size != 0u) { - Elf_Word text_index = rodata_index + 1u; Elf_Word oatexec = dynstr_.Add("oatexec"); - dynsym_.Add(oatexec, text_index, text_address, text_size, STB_GLOBAL, STT_OBJECT); + dynsym_.Add(oatexec, &text_, text_.GetAddress(), text_size, STB_GLOBAL, STT_OBJECT); Elf_Word oatlastword = dynstr_.Add("oatlastword"); - Elf_Word oatlastword_address = text_address + text_size - 4; - dynsym_.Add(oatlastword, text_index, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT); + Elf_Word oatlastword_address = text_.GetAddress() + text_size - 4; + dynsym_.Add(oatlastword, &text_, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT); } else if (rodata_size != 0) { // rodata_ can be size 0 for dwarf_test. Elf_Word oatlastword = dynstr_.Add("oatlastword"); - Elf_Word oatlastword_address = rodata_address + rodata_size - 4; - dynsym_.Add(oatlastword, rodata_index, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT); + Elf_Word oatlastword_address = rodata_.GetAddress() + rodata_size - 4; + dynsym_.Add(oatlastword, &rodata_, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT); } DCHECK_LE(bss_roots_offset, bss_size); if (bss_size != 0u) { - Elf_Word bss_index = rodata_index + 1u + (text_size != 0 ? 1u : 0u); Elf_Word oatbss = dynstr_.Add("oatbss"); - dynsym_.Add(oatbss, bss_index, bss_address, bss_roots_offset, STB_GLOBAL, STT_OBJECT); + dynsym_.Add(oatbss, &bss_, bss_.GetAddress(), bss_roots_offset, STB_GLOBAL, STT_OBJECT); DCHECK_LE(bss_methods_offset, bss_roots_offset); DCHECK_LE(bss_roots_offset, bss_size); // Add a symbol marking the start of the methods part of the .bss, if not empty. if (bss_methods_offset != bss_roots_offset) { - Elf_Word bss_methods_address = bss_address + bss_methods_offset; + Elf_Word bss_methods_address = bss_.GetAddress() + bss_methods_offset; Elf_Word bss_methods_size = bss_roots_offset - bss_methods_offset; Elf_Word oatbssroots = dynstr_.Add("oatbssmethods"); dynsym_.Add( - oatbssroots, bss_index, bss_methods_address, bss_methods_size, STB_GLOBAL, STT_OBJECT); + oatbssroots, &bss_, bss_methods_address, bss_methods_size, STB_GLOBAL, STT_OBJECT); } // Add a symbol marking the start of the GC roots part of the .bss, if not empty. if (bss_roots_offset != bss_size) { - Elf_Word bss_roots_address = bss_address + bss_roots_offset; + Elf_Word bss_roots_address = bss_.GetAddress() + bss_roots_offset; Elf_Word bss_roots_size = bss_size - bss_roots_offset; Elf_Word oatbssroots = dynstr_.Add("oatbssroots"); dynsym_.Add( - oatbssroots, bss_index, bss_roots_address, bss_roots_size, STB_GLOBAL, STT_OBJECT); + oatbssroots, &bss_, bss_roots_address, bss_roots_size, STB_GLOBAL, STT_OBJECT); } Elf_Word oatbsslastword = dynstr_.Add("oatbsslastword"); - Elf_Word bsslastword_address = bss_address + bss_size - 4; - dynsym_.Add(oatbsslastword, bss_index, bsslastword_address, 4, STB_GLOBAL, STT_OBJECT); + Elf_Word bsslastword_address = bss_.GetAddress() + bss_size - 4; + dynsym_.Add(oatbsslastword, &bss_, bsslastword_address, 4, STB_GLOBAL, STT_OBJECT); } Elf_Word soname_offset = dynstr_.Add(soname); @@ -759,28 +744,24 @@ class ElfBuilder FINAL { hash.push_back(0); // Last symbol terminates the chain. hash_.Add(hash.data(), hash.size() * sizeof(hash[0])); - // Calculate addresses of .dynsym, .hash and .dynamic. 
- DCHECK_EQ(dynstr_.header_.sh_flags, dynsym_.header_.sh_flags); - DCHECK_EQ(dynsym_.header_.sh_flags, hash_.header_.sh_flags); - Elf_Word dynsym_address = - RoundUp(dynstr_address + dynstr_.GetCacheSize(), dynsym_.header_.sh_addralign); - Elf_Word hash_address = - RoundUp(dynsym_address + dynsym_.GetCacheSize(), hash_.header_.sh_addralign); - DCHECK_EQ(dynamic_.header_.sh_addralign, static_cast<Elf_Word>(kPageSize)); - Elf_Word dynamic_address = RoundUp(hash_address + dynsym_.GetCacheSize(), kPageSize); + // Allocate all remaining sections. + dynstr_.AllocateVirtualMemory(dynstr_.GetCacheSize()); + dynsym_.AllocateVirtualMemory(dynsym_.GetCacheSize()); + hash_.AllocateVirtualMemory(hash_.GetCacheSize()); Elf_Dyn dyns[] = { - { DT_HASH, { hash_address } }, - { DT_STRTAB, { dynstr_address } }, - { DT_SYMTAB, { dynsym_address } }, + { DT_HASH, { hash_.GetAddress() } }, + { DT_STRTAB, { dynstr_.GetAddress() } }, + { DT_SYMTAB, { dynsym_.GetAddress() } }, { DT_SYMENT, { sizeof(Elf_Sym) } }, { DT_STRSZ, { dynstr_.GetCacheSize() } }, { DT_SONAME, { soname_offset } }, { DT_NULL, { 0 } }, }; dynamic_.Add(&dyns, sizeof(dyns)); + dynamic_.AllocateVirtualMemory(dynamic_.GetCacheSize()); - loaded_size_ = RoundUp(dynamic_address + dynamic_.GetCacheSize(), kPageSize); + loaded_size_ = RoundUp(virtual_address_, kPageSize); } void WriteDynamicSection() { @@ -788,8 +769,6 @@ class ElfBuilder FINAL { dynsym_.WriteCachedSection(); hash_.WriteCachedSection(); dynamic_.WriteCachedSection(); - - CHECK_EQ(loaded_size_, RoundUp(dynamic_.GetAddress() + dynamic_.GetSize(), kPageSize)); } Elf_Word GetLoadedSize() { @@ -828,10 +807,6 @@ class ElfBuilder FINAL { return stream_.Seek(RoundUp(stream_.Seek(0, kSeekCurrent), alignment), kSeekSet); } - Elf_Addr AlignVirtualAddress(size_t alignment) { - return virtual_address_ = RoundUp(virtual_address_, alignment); - } - private: static Elf_Ehdr MakeElfHeader(InstructionSet isa, const InstructionSetFeatures* features) { Elf_Ehdr elf_header = Elf_Ehdr(); @@ -902,7 +877,6 @@ class ElfBuilder FINAL { elf_header.e_ehsize = sizeof(Elf_Ehdr); elf_header.e_phentsize = sizeof(Elf_Phdr); elf_header.e_shentsize = sizeof(Elf_Shdr); - elf_header.e_phoff = sizeof(Elf_Ehdr); return elf_header; } @@ -933,6 +907,7 @@ class ElfBuilder FINAL { for (auto* section : sections_) { const Elf_Shdr& shdr = section->header_; if ((shdr.sh_flags & SHF_ALLOC) != 0 && shdr.sh_size != 0) { + DCHECK(shdr.sh_addr != 0u) << "Allocate virtual memory for the section"; // PT_LOAD tells the linker to mmap part of the file. // The linker can only mmap page-aligned sections. // Single PT_LOAD may contain several ELF sections. @@ -1010,6 +985,7 @@ class ElfBuilder FINAL { // List of used section in the order in which they were written. std::vector<Section*> sections_; + Section* current_section_; // The section which is currently being written. 
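Putting the reworked Section interface together, the intended call order for an SHF_ALLOC section is roughly the following. This is a sketch only: text stands for builder->GetText(), and text_address, text_size and code are placeholders, but the methods are the ones introduced above.

    // Sketch of the new Section write lifecycle (illustrative; error handling elided).
    text->AllocateVirtualMemory(text_address, text_size);  // Reserve the address range up front.
    text->Start();                                         // Open the section for file data.
    text->WriteFully(code.data(), code.size());
    Elf_Word written = text->GetPosition();                // Bytes written so far; valid only between Start() and End().
    text->End();                                           // Checks the written size against any size fixed by AllocateVirtualMemory.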
bool started_; bool write_program_headers_; diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 2e8170ecc4..42ee9db167 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -5732,24 +5732,18 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const { return codegen_->GetAssembler(); } -void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) { +void ParallelMoveResolverX86::MoveMemoryToMemory(int dst, int src, int number_of_words) { ScratchRegisterScope ensure_scratch( this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); -} -void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); - __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize)); - __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg); + // Now that temp register is available (possibly spilled), move blocks of memory. + for (int i = 0; i < number_of_words; i++) { + __ movl(temp_reg, Address(ESP, src + stack_offset)); + __ movl(Address(ESP, dst + stack_offset), temp_reg); + stack_offset += kX86WordSize; + } } void ParallelMoveResolverX86::EmitMove(size_t index) { @@ -5800,7 +5794,7 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ movss(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); } else { DCHECK(destination.IsStackSlot()); - MoveMemoryToMemory32(destination.GetStackIndex(), source.GetStackIndex()); + MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 1); } } else if (source.IsDoubleStackSlot()) { if (destination.IsRegisterPair()) { @@ -5811,11 +5805,15 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); } else { DCHECK(destination.IsDoubleStackSlot()) << destination; - MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex()); + MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 2); } } else if (source.IsSIMDStackSlot()) { - DCHECK(destination.IsFpuRegister()); - __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); + if (destination.IsFpuRegister()) { + __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); + } else { + DCHECK(destination.IsSIMDStackSlot()); + MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 4); + } } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { @@ -5915,7 +5913,16 @@ void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) { __ movd(reg, temp_reg); } -void ParallelMoveResolverX86::Exchange(int mem1, int mem2) { +void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) { + size_t extra_slot = 4 * kX86WordSize; + __ subl(ESP, Immediate(extra_slot)); 
+ __ movups(Address(ESP, 0), XmmRegister(reg)); + ExchangeMemory(0, mem + extra_slot, 4); + __ movups(XmmRegister(reg), Address(ESP, 0)); + __ addl(ESP, Immediate(extra_slot)); +} + +void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) { ScratchRegisterScope ensure_scratch1( this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); @@ -5925,10 +5932,15 @@ void ParallelMoveResolverX86::Exchange(int mem1, int mem2) { int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0; stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0; - __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset)); - __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset)); - __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister())); - __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister())); + + // Now that temp registers are available (possibly spilled), exchange blocks of memory. + for (int i = 0; i < number_of_words; i++) { + __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset)); + __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset)); + __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister())); + __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister())); + stack_offset += kX86WordSize; + } } void ParallelMoveResolverX86::EmitSwap(size_t index) { @@ -5947,7 +5959,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) { } else if (source.IsStackSlot() && destination.IsRegister()) { Exchange(destination.AsRegister<Register>(), source.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsStackSlot()) { - Exchange(destination.GetStackIndex(), source.GetStackIndex()); + ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 1); } else if (source.IsFpuRegister() && destination.IsFpuRegister()) { // Use XOR Swap algorithm to avoid a temporary. DCHECK_NE(source.reg(), destination.reg()); @@ -5983,8 +5995,13 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) { // Move the high double to the low double. 
__ psrldq(reg, Immediate(8)); } else if (destination.IsDoubleStackSlot() && source.IsDoubleStackSlot()) { - Exchange(destination.GetStackIndex(), source.GetStackIndex()); - Exchange(destination.GetHighStackIndex(kX86WordSize), source.GetHighStackIndex(kX86WordSize)); + ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 2); + } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) { + ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 4); + } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) { + Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex()); + } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) { + Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex()); } else { LOG(FATAL) << "Unimplemented: source: " << source << ", destination: " << destination; } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 176e4dfda0..40b7e3c54f 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -139,10 +139,10 @@ class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap { private: void Exchange(Register reg, int mem); - void Exchange(int mem1, int mem2); void Exchange32(XmmRegister reg, int mem); - void MoveMemoryToMemory32(int dst, int src); - void MoveMemoryToMemory64(int dst, int src); + void Exchange128(XmmRegister reg, int mem); + void ExchangeMemory(int mem1, int mem2, int number_of_words); + void MoveMemoryToMemory(int dst, int src, int number_of_words); CodeGeneratorX86* const codegen_; diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index e25688c9a3..02fbf234c1 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -5220,9 +5220,17 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); } } else if (source.IsSIMDStackSlot()) { - DCHECK(destination.IsFpuRegister()); - __ movups(destination.AsFpuRegister<XmmRegister>(), - Address(CpuRegister(RSP), source.GetStackIndex())); + if (destination.IsFpuRegister()) { + __ movups(destination.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), source.GetStackIndex())); + } else { + DCHECK(destination.IsSIMDStackSlot()); + size_t high = kX86_64WordSize; + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex())); + __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex() + high)); + __ movq(Address(CpuRegister(RSP), destination.GetStackIndex() + high), CpuRegister(TMP)); + } } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { @@ -5290,19 +5298,6 @@ void ParallelMoveResolverX86_64::Exchange32(CpuRegister reg, int mem) { __ movl(reg, CpuRegister(TMP)); } -void ParallelMoveResolverX86_64::Exchange32(int mem1, int mem2) { - ScratchRegisterScope ensure_scratch( - this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); - - int stack_offset = ensure_scratch.IsSpilled() ? 
kX86_64WordSize : 0; - __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); - __ movl(CpuRegister(ensure_scratch.GetRegister()), - Address(CpuRegister(RSP), mem2 + stack_offset)); - __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); - __ movl(Address(CpuRegister(RSP), mem1 + stack_offset), - CpuRegister(ensure_scratch.GetRegister())); -} - void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) { __ movq(CpuRegister(TMP), reg1); __ movq(reg1, reg2); @@ -5315,19 +5310,6 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) { __ movq(reg, CpuRegister(TMP)); } -void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) { - ScratchRegisterScope ensure_scratch( - this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); - - int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0; - __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); - __ movq(CpuRegister(ensure_scratch.GetRegister()), - Address(CpuRegister(RSP), mem2 + stack_offset)); - __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); - __ movq(Address(CpuRegister(RSP), mem1 + stack_offset), - CpuRegister(ensure_scratch.GetRegister())); -} - void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) { __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem)); __ movss(Address(CpuRegister(RSP), mem), reg); @@ -5340,6 +5322,48 @@ void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) { __ movd(reg, CpuRegister(TMP)); } +void ParallelMoveResolverX86_64::Exchange128(XmmRegister reg, int mem) { + size_t extra_slot = 2 * kX86_64WordSize; + __ subq(CpuRegister(RSP), Immediate(extra_slot)); + __ movups(Address(CpuRegister(RSP), 0), XmmRegister(reg)); + ExchangeMemory64(0, mem + extra_slot, 2); + __ movups(XmmRegister(reg), Address(CpuRegister(RSP), 0)); + __ addq(CpuRegister(RSP), Immediate(extra_slot)); +} + +void ParallelMoveResolverX86_64::ExchangeMemory32(int mem1, int mem2) { + ScratchRegisterScope ensure_scratch( + this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); + + int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0; + __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); + __ movl(CpuRegister(ensure_scratch.GetRegister()), + Address(CpuRegister(RSP), mem2 + stack_offset)); + __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); + __ movl(Address(CpuRegister(RSP), mem1 + stack_offset), + CpuRegister(ensure_scratch.GetRegister())); +} + +void ParallelMoveResolverX86_64::ExchangeMemory64(int mem1, int mem2, int num_of_qwords) { + ScratchRegisterScope ensure_scratch( + this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); + + int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0; + + // Now that temp registers are available (possibly spilled), exchange blocks of memory. 
+ for (int i = 0; i < num_of_qwords; i++) { + __ movq(CpuRegister(TMP), + Address(CpuRegister(RSP), mem1 + stack_offset)); + __ movq(CpuRegister(ensure_scratch.GetRegister()), + Address(CpuRegister(RSP), mem2 + stack_offset)); + __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), + CpuRegister(TMP)); + __ movq(Address(CpuRegister(RSP), mem1 + stack_offset), + CpuRegister(ensure_scratch.GetRegister())); + stack_offset += kX86_64WordSize; + } +} + void ParallelMoveResolverX86_64::EmitSwap(size_t index) { MoveOperands* move = moves_[index]; Location source = move->GetSource(); @@ -5352,13 +5376,13 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) { } else if (source.IsStackSlot() && destination.IsRegister()) { Exchange32(destination.AsRegister<CpuRegister>(), source.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsStackSlot()) { - Exchange32(destination.GetStackIndex(), source.GetStackIndex()); + ExchangeMemory32(destination.GetStackIndex(), source.GetStackIndex()); } else if (source.IsRegister() && destination.IsDoubleStackSlot()) { Exchange64(source.AsRegister<CpuRegister>(), destination.GetStackIndex()); } else if (source.IsDoubleStackSlot() && destination.IsRegister()) { Exchange64(destination.AsRegister<CpuRegister>(), source.GetStackIndex()); } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) { - Exchange64(destination.GetStackIndex(), source.GetStackIndex()); + ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 1); } else if (source.IsFpuRegister() && destination.IsFpuRegister()) { __ movd(CpuRegister(TMP), source.AsFpuRegister<XmmRegister>()); __ movaps(source.AsFpuRegister<XmmRegister>(), destination.AsFpuRegister<XmmRegister>()); @@ -5371,6 +5395,12 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) { Exchange64(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex()); } else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) { Exchange64(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex()); + } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) { + ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 2); + } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) { + Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex()); + } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) { + Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex()); } else { LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination; } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 00c5c27470..e86123ef01 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -139,11 +139,12 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolverWithSwap { private: void Exchange32(CpuRegister reg, int mem); void Exchange32(XmmRegister reg, int mem); - void Exchange32(int mem1, int mem2); void Exchange64(CpuRegister reg1, CpuRegister reg2); void Exchange64(CpuRegister reg, int mem); void Exchange64(XmmRegister reg, int mem); - void Exchange64(int mem1, int mem2); + void Exchange128(XmmRegister reg, int mem); + void ExchangeMemory32(int mem1, int mem2); + void ExchangeMemory64(int mem1, int mem2, int num_of_qwords); CodeGeneratorX86_64* const codegen_; diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 096349fd73..87dff8403b 100644 
--- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -109,6 +109,16 @@ class HVecOperation : public HVariableInputSizeInstruction { // Assumes vector nodes cannot be moved by default. Each concrete implementation // that can be moved should override this method and return true. + // + // Note: similar approach is used for instruction scheduling (if it is turned on for the target): + // by default HScheduler::IsSchedulable returns false for a particular HVecOperation. + // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see + // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also + // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction. + // + // Note: For newly introduced vector instructions HScheduler${ARCH}::IsSchedulingBarrier must be + // altered to return true if the instruction might reside outside the SIMD loop body since SIMD + // registers are not kept alive across vector loop boundaries (yet). bool CanBeMoved() const OVERRIDE { return false; } // Tests if all data of a vector node (vector length and packed type) is equal. diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 73c72fc57a..24b1a123ee 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -1224,7 +1224,7 @@ bool OptimizingCompiler::JitCompile(Thread* self, } const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions(); - if (compiler_options.GetGenerateDebugInfo()) { + if (compiler_options.GenerateAnyDebugInfo()) { const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code); const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode()); debug::MethodDebugInfo info = {}; @@ -1244,10 +1244,13 @@ bool OptimizingCompiler::JitCompile(Thread* self, info.frame_size_in_bytes = method_header->GetFrameSizeInBytes(); info.code_info = nullptr; info.cfi = jni_compiled_method.GetCfi(); - std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods( + // If both flags are passed, generate full debug info. + const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo(); + std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT( GetCompilerDriver()->GetInstructionSet(), GetCompilerDriver()->GetInstructionSetFeatures(), - ArrayRef<const debug::MethodDebugInfo>(&info, 1)); + mini_debug_info, + info); CreateJITCodeEntryForAddress(code_address, std::move(elf_file)); } @@ -1352,7 +1355,7 @@ bool OptimizingCompiler::JitCompile(Thread* self, } const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions(); - if (compiler_options.GetGenerateDebugInfo()) { + if (compiler_options.GenerateAnyDebugInfo()) { const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code); const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode()); debug::MethodDebugInfo info = {}; @@ -1372,10 +1375,13 @@ bool OptimizingCompiler::JitCompile(Thread* self, info.frame_size_in_bytes = method_header->GetFrameSizeInBytes(); info.code_info = stack_map_size == 0 ? nullptr : stack_map_data; info.cfi = ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()); - std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods( + // If both flags are passed, generate full debug info. 
+ const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo(); + std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT( GetCompilerDriver()->GetInstructionSet(), GetCompilerDriver()->GetInstructionSetFeatures(), - ArrayRef<const debug::MethodDebugInfo>(&info, 1)); + mini_debug_info, + info); CreateJITCodeEntryForAddress(code_address, std::move(elf_file)); } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index bb7c353bc2..dfa077f7de 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -462,6 +462,11 @@ class HScheduler { // containing basic block from being scheduled. // This method is used to restrict scheduling to instructions that we know are // safe to handle. + // + // For newly introduced instructions by default HScheduler::IsSchedulable returns false. + // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see + // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also + // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction. virtual bool IsSchedulable(const HInstruction* instruction) const; bool IsSchedulable(const HBasicBlock* block) const; diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h index 32f161f26a..f71cb5b784 100644 --- a/compiler/optimizing/scheduler_arm64.h +++ b/compiler/optimizing/scheduler_arm64.h @@ -151,6 +151,20 @@ class HSchedulerARM64 : public HScheduler { #undef CASE_INSTRUCTION_KIND } + // Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized + // loop boundaries. This is a workaround for the lack of notion of SIMD register in the compiler; + // around a call we have to save/restore all live SIMD&FP registers (only lower 64 bits of + // SIMD&FP registers are callee saved) so don't reorder such vector instructions. + // + // TODO: remove this when a proper support of SIMD registers is introduced to the compiler. + bool IsSchedulingBarrier(const HInstruction* instr) const OVERRIDE { + return HScheduler::IsSchedulingBarrier(instr) || + instr->IsVecReduce() || + instr->IsVecExtractScalar() || + instr->IsVecSetScalars() || + instr->IsVecReplicateScalar(); + } + private: SchedulingLatencyVisitorARM64 arm64_latency_visitor_; DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64); |
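To make the scheduling contract described in the new comments concrete, here is a hedged sketch of what whitelisting one more vector instruction on a hypothetical backend would look like. The class name HSchedulerSomeArch and the choice of HVecAdd are illustrative only and not part of this change.

    // Illustrative only: a backend that has verified HVecAdd is safe to reorder would
    // whitelist it in IsSchedulable() and revisit IsSchedulingBarrier() at the same time.
    bool HSchedulerSomeArch::IsSchedulable(const HInstruction* instruction) const {
      if (instruction->IsVecAdd()) {
        return true;  // In this illustration, HVecAdd stays inside the SIMD loop body.
      }
      return HScheduler::IsSchedulable(instruction);
    }

    bool HSchedulerSomeArch::IsSchedulingBarrier(const HInstruction* instr) const {
      // HVecAdd is assumed not to be live across the vector loop boundary, so it is not
      // added here; instructions whose values do cross the boundary must be.
      return HScheduler::IsSchedulingBarrier(instr);
    }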