Merge "Ignore vdex files for addr2line."
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 40c676c..c4d538f 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -256,7 +256,14 @@
         instrumentation: true,
         profile_file: "art/dex2oat.profdata",
         benchmarks: ["dex2oat"],
-    }
+    },
+    target: {
+        android: {
+            lto: {
+                thin: true,
+            },
+        },
+    },
 }
 
 art_cc_library {
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 52a0796..7230f11 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -60,6 +60,7 @@
     case LinkerPatch::Type::kCallRelative:
     case LinkerPatch::Type::kBakerReadBarrierBranch:
       return false;
+    case LinkerPatch::Type::kDataBimgRelRo:
     case LinkerPatch::Type::kMethodRelative:
     case LinkerPatch::Type::kMethodBssEntry:
     case LinkerPatch::Type::kTypeRelative:
@@ -271,7 +272,8 @@
       shift = 0u;  // No shift for ADD.
     } else {
       // LDR/STR 32-bit or 64-bit with imm12 == 0 (unset).
-      DCHECK(patch.GetType() == LinkerPatch::Type::kMethodBssEntry ||
+      DCHECK(patch.GetType() == LinkerPatch::Type::kDataBimgRelRo ||
+             patch.GetType() == LinkerPatch::Type::kMethodBssEntry ||
              patch.GetType() == LinkerPatch::Type::kTypeClassTable ||
              patch.GetType() == LinkerPatch::Type::kTypeBssEntry ||
              patch.GetType() == LinkerPatch::Type::kStringInternTable ||
diff --git a/compiler/linker/elf_builder.h b/compiler/linker/elf_builder.h
index a5f6099..3da7a43 100644
--- a/compiler/linker/elf_builder.h
+++ b/compiler/linker/elf_builder.h
@@ -529,6 +529,8 @@
         stream_(output),
         rodata_(this, ".rodata", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
         text_(this, ".text", SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, nullptr, 0, kPageSize, 0),
+        data_bimg_rel_ro_(
+            this, ".data.bimg.rel.ro", SHT_PROGBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
         bss_(this, ".bss", SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
         dex_(this, ".dex", SHT_NOBITS, SHF_ALLOC, nullptr, 0, kPageSize, 0),
         dynstr_(this, ".dynstr", SHF_ALLOC, kPageSize),
@@ -552,6 +554,7 @@
         loaded_size_(0u),
         virtual_address_(0) {
     text_.phdr_flags_ = PF_R | PF_X;
+    data_bimg_rel_ro_.phdr_flags_ = PF_R | PF_W;  // Shall be made read-only at run time.
     bss_.phdr_flags_ = PF_R | PF_W;
     dex_.phdr_flags_ = PF_R;
     dynamic_.phdr_flags_ = PF_R | PF_W;
@@ -566,6 +569,7 @@
   BuildIdSection* GetBuildId() { return &build_id_; }
   Section* GetRoData() { return &rodata_; }
   Section* GetText() { return &text_; }
+  Section* GetDataBimgRelRo() { return &data_bimg_rel_ro_; }
   Section* GetBss() { return &bss_; }
   Section* GetDex() { return &dex_; }
   StringSection* GetStrTab() { return &strtab_; }
@@ -694,6 +698,7 @@
   void PrepareDynamicSection(const std::string& elf_file_path,
                              Elf_Word rodata_size,
                              Elf_Word text_size,
+                             Elf_Word data_bimg_rel_ro_size,
                              Elf_Word bss_size,
                              Elf_Word bss_methods_offset,
                              Elf_Word bss_roots_offset,
@@ -707,6 +712,9 @@
     // Allocate all pre-dynamic sections.
     rodata_.AllocateVirtualMemory(rodata_size);
     text_.AllocateVirtualMemory(text_size);
+    if (data_bimg_rel_ro_size != 0) {
+      data_bimg_rel_ro_.AllocateVirtualMemory(data_bimg_rel_ro_size);
+    }
     if (bss_size != 0) {
       bss_.AllocateVirtualMemory(bss_size);
     }
@@ -735,6 +743,24 @@
       Elf_Word oatlastword_address = rodata_.GetAddress() + rodata_size - 4;
       dynsym_.Add(oatlastword, &rodata_, oatlastword_address, 4, STB_GLOBAL, STT_OBJECT);
     }
+    if (data_bimg_rel_ro_size != 0u) {
+      Elf_Word oatdatabimgrelro = dynstr_.Add("oatdatabimgrelro");
+      dynsym_.Add(oatdatabimgrelro,
+                  &data_bimg_rel_ro_,
+                  data_bimg_rel_ro_.GetAddress(),
+                  data_bimg_rel_ro_size,
+                  STB_GLOBAL,
+                  STT_OBJECT);
+      Elf_Word oatdatabimgrelrolastword = dynstr_.Add("oatdatabimgrelrolastword");
+      Elf_Word oatdatabimgrelrolastword_address =
+          data_bimg_rel_ro_.GetAddress() + data_bimg_rel_ro_size - 4;
+      dynsym_.Add(oatdatabimgrelrolastword,
+                  &data_bimg_rel_ro_,
+                  oatdatabimgrelrolastword_address,
+                  4,
+                  STB_GLOBAL,
+                  STT_OBJECT);
+    }
     DCHECK_LE(bss_roots_offset, bss_size);
     if (bss_size != 0u) {
       Elf_Word oatbss = dynstr_.Add("oatbss");
@@ -1010,6 +1036,7 @@
 
   Section rodata_;
   Section text_;
+  Section data_bimg_rel_ro_;
   Section bss_;
   Section dex_;
   CachedStringSection dynstr_;
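
Side note on the "Shall be made read-only at run time" comment above: the oatdatabimgrelro / oatdatabimgrelrolastword symbols bracket the new section so a loader can locate it, relocate its entries, and then seal it. A minimal loader-side sketch, assuming each entry holds a 32-bit offset into the boot image; RelocateAndSealBootImageRelRo and its parameters are hypothetical, not ART's actual runtime code:

#include <cstdint>
#include <cstring>
#include <sys/mman.h>

// `begin`/`end` would come from resolving the oatdatabimgrelro and
// oatdatabimgrelrolastword dynamic symbols emitted above (end = last word + 4).
void RelocateAndSealBootImageRelRo(uint8_t* begin, uint8_t* end,
                                   uint32_t boot_image_begin) {
  for (uint8_t* p = begin; p != end; p += sizeof(uint32_t)) {
    uint32_t entry;
    std::memcpy(&entry, p, sizeof(entry));
    entry += boot_image_begin;  // Boot image offset -> absolute address.
    std::memcpy(p, &entry, sizeof(entry));
  }
  // Mapped PF_R | PF_W above; seal read-only once patched. `begin` is
  // page-aligned because the section is created with kPageSize alignment.
  mprotect(begin, static_cast<size_t>(end - begin), PROT_READ);
}
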
diff --git a/compiler/linker/linker_patch.h b/compiler/linker/linker_patch.h
index 6f4e774..a3c737c 100644
--- a/compiler/linker/linker_patch.h
+++ b/compiler/linker/linker_patch.h
@@ -41,6 +41,7 @@
   // choose to squeeze the Type into fewer than 8 bits, we'll have to declare
   // patch_type_ as an uintN_t and do explicit static_cast<>s.
   enum class Type : uint8_t {
+    kDataBimgRelRo,           // NOTE: Actual patching is instruction_set-dependent.
     kMethodRelative,          // NOTE: Actual patching is instruction_set-dependent.
     kMethodBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
     kCall,
@@ -54,6 +55,15 @@
     kBakerReadBarrierBranch,  // NOTE: Actual patching is instruction_set-dependent.
   };
 
+  static LinkerPatch DataBimgRelRoPatch(size_t literal_offset,
+                                        uint32_t pc_insn_offset,
+                                        uint32_t boot_image_offset) {
+    LinkerPatch patch(literal_offset, Type::kDataBimgRelRo, /* target_dex_file */ nullptr);
+    patch.boot_image_offset_ = boot_image_offset;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
   static LinkerPatch RelativeMethodPatch(size_t literal_offset,
                                          const DexFile* target_dex_file,
                                          uint32_t pc_insn_offset,
@@ -172,6 +182,7 @@
 
   bool IsPcRelative() const {
     switch (GetType()) {
+      case Type::kDataBimgRelRo:
       case Type::kMethodRelative:
       case Type::kMethodBssEntry:
       case Type::kCallRelative:
@@ -188,6 +199,11 @@
     }
   }
 
+  uint32_t BootImageOffset() const {
+    DCHECK(patch_type_ == Type::kDataBimgRelRo);
+    return boot_image_offset_;
+  }
+
   MethodReference TargetMethod() const {
     DCHECK(patch_type_ == Type::kMethodRelative ||
            patch_type_ == Type::kMethodBssEntry ||
@@ -225,7 +241,8 @@
   }
 
   uint32_t PcInsnOffset() const {
-    DCHECK(patch_type_ == Type::kMethodRelative ||
+    DCHECK(patch_type_ == Type::kDataBimgRelRo ||
+           patch_type_ == Type::kMethodRelative ||
            patch_type_ == Type::kMethodBssEntry ||
            patch_type_ == Type::kTypeRelative ||
            patch_type_ == Type::kTypeClassTable ||
@@ -263,10 +280,11 @@
   uint32_t literal_offset_ : 24;  // Method code size up to 16MiB.
   Type patch_type_ : 8;
   union {
-    uint32_t cmp1_;             // Used for relational operators.
-    uint32_t method_idx_;       // Method index for Call/Method patches.
-    uint32_t type_idx_;         // Type index for Type patches.
-    uint32_t string_idx_;       // String index for String patches.
+    uint32_t cmp1_;               // Used for relational operators.
+    uint32_t boot_image_offset_;  // Data to write to the .data.bimg.rel.ro entry.
+    uint32_t method_idx_;         // Method index for Call/Method patches.
+    uint32_t type_idx_;           // Type index for Type patches.
+    uint32_t string_idx_;         // String index for String patches.
     uint32_t baker_custom_value1_;
     static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators");
     static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators");
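
A hedged usage sketch of the additions above, with the behavior read straight off the declarations (the offsets are arbitrary illustrative values):

// Sketch only; exercises the new kDataBimgRelRo factory and accessors.
linker::LinkerPatch patch = linker::LinkerPatch::DataBimgRelRoPatch(
    /* literal_offset */ 8u,
    /* pc_insn_offset */ 4u,
    /* boot_image_offset */ 0x1234u);
CHECK(patch.GetType() == linker::LinkerPatch::Type::kDataBimgRelRo);
CHECK(patch.IsPcRelative());                 // kDataBimgRelRo is PC-relative.
CHECK_EQ(patch.BootImageOffset(), 0x1234u);  // Data for the .data.bimg.rel.ro entry.
CHECK_EQ(patch.PcInsnOffset(), 4u);          // The anchor insn this load pairs with.
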
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b0ddd8e..bc687e8 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -4460,12 +4460,23 @@
       // Load method address from literal pool.
       __ Ldr(XRegisterFrom(temp), DeduplicateUint64Literal(invoke->GetMethodAddress()));
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      // Add ADRP with its PC-relative .data.bimg.rel.ro patch.
+      uint32_t boot_image_offset = invoke->GetDispatchInfo().method_load_data;
+      vixl::aarch64::Label* adrp_label = NewBootImageRelRoPatch(boot_image_offset);
+      EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp));
+      // Add LDR with its PC-relative .data.bimg.rel.ro patch.
+      vixl::aarch64::Label* ldr_label = NewBootImageRelRoPatch(boot_image_offset, adrp_label);
+      // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load.
+      EmitLdrOffsetPlaceholder(ldr_label, WRegisterFrom(temp), XRegisterFrom(temp));
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
-      // Add ADRP with its PC-relative DexCache access patch.
+      // Add ADRP with its PC-relative .bss entry patch.
       MethodReference target_method(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex());
       vixl::aarch64::Label* adrp_label = NewMethodBssEntryPatch(target_method);
       EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp));
-      // Add LDR with its PC-relative DexCache access patch.
+      // Add LDR with its PC-relative .bss entry patch.
       vixl::aarch64::Label* ldr_label =
           NewMethodBssEntryPatch(target_method, adrp_label);
       EmitLdrOffsetPlaceholder(ldr_label, XRegisterFrom(temp), XRegisterFrom(temp));
@@ -4560,6 +4571,13 @@
   codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__);
 }
 
+vixl::aarch64::Label* CodeGeneratorARM64::NewBootImageRelRoPatch(
+    uint32_t boot_image_offset,
+    vixl::aarch64::Label* adrp_label) {
+  return NewPcRelativePatch(
+      /* dex_file */ nullptr, boot_image_offset, adrp_label, &boot_image_method_patches_);
+}
+
 vixl::aarch64::Label* CodeGeneratorARM64::NewBootImageMethodPatch(
     MethodReference target_method,
     vixl::aarch64::Label* adrp_label) {
@@ -4682,6 +4700,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -4701,7 +4727,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
@@ -5484,9 +5511,9 @@
   }
 }
 
-void InstructionCodeGeneratorARM64::GenerateMinMax(LocationSummary* locations,
-                                                   bool is_min,
-                                                   DataType::Type type) {
+void InstructionCodeGeneratorARM64::GenerateMinMaxInt(LocationSummary* locations,
+                                                      bool is_min,
+                                                      DataType::Type type) {
   Location op1 = locations->InAt(0);
   Location op2 = locations->InAt(1);
   Location out = locations->Out();
@@ -5537,24 +5564,29 @@
   }
 }
 
+// TODO: integrate with HandleBinaryOp?
+void InstructionCodeGeneratorARM64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min, type);
+      break;
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      GenerateMinMaxFP(minmax->GetLocations(), is_min, type);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderARM64::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
-// TODO: integrate with HandleBinaryOp?
 void InstructionCodeGeneratorARM64::VisitMin(HMin* min) {
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderARM64::VisitMax(HMax* max) {
@@ -5562,18 +5594,7 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitMax(HMax* max) {
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderARM64::VisitAbs(HAbs* abs) {
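
The DataBimgRelRoPatchAdapter defined in this file (and repeated verbatim in each backend below) exists only to give DataBimgRelRoPatch() the four-argument shape that the EmitPcRelativeLinkerPatches template expects; the target_dex_file slot is unused and asserted null. A compile-time sketch of that shape, assuming the template's factory signature (the template itself is outside this diff):

#include <type_traits>
// Assumed factory signature; the real template parameter lives elsewhere in
// the code generators and is not shown in this change.
using PcRelativePatchFactory =
    linker::LinkerPatch (*)(size_t, const DexFile*, uint32_t, uint32_t);
static_assert(std::is_same<decltype(&DataBimgRelRoPatchAdapter),
                           PcRelativePatchFactory>::value,
              "the adapter must match the factory shape the template expects");
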
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 70f5500..cb61b69 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -273,8 +273,9 @@
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleCondition(HCondition* instruction);
 
-  void GenerateMinMax(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
 
   // Generate a heap reference load using one register `out`:
   //
@@ -564,7 +565,14 @@
     UNIMPLEMENTED(FATAL);
   }
 
-  // Add a new PC-relative method patch for an instruction and return the label
+  // Add a new boot image relocation patch for an instruction and return the label
+  // to be bound before the instruction. The instruction will be either the
+  // ADRP (pass `adrp_label = null`) or the LDR (pass `adrp_label` pointing
+  // to the associated ADRP patch label).
+  vixl::aarch64::Label* NewBootImageRelRoPatch(uint32_t boot_image_offset,
+                                               vixl::aarch64::Label* adrp_label = nullptr);
+
+  // Add a new boot image method patch for an instruction and return the label
   // to be bound before the instruction. The instruction will be either the
   // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
   // to the associated ADRP patch label).
@@ -578,7 +586,7 @@
   vixl::aarch64::Label* NewMethodBssEntryPatch(MethodReference target_method,
                                                vixl::aarch64::Label* adrp_label = nullptr);
 
-  // Add a new PC-relative type patch for an instruction and return the label
+  // Add a new boot image type patch for an instruction and return the label
   // to be bound before the instruction. The instruction will be either the
   // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
   // to the associated ADRP patch label).
@@ -594,7 +602,7 @@
                                              dex::TypeIndex type_index,
                                              vixl::aarch64::Label* adrp_label = nullptr);
 
-  // Add a new PC-relative string patch for an instruction and return the label
+  // Add a new boot image string patch for an instruction and return the label
   // to be bound before the instruction. The instruction will be either the
   // ADRP (pass `adrp_label = null`) or the ADD (pass `adrp_label` pointing
   // to the associated ADRP patch label).
@@ -823,7 +831,7 @@
   Uint32ToLiteralMap uint32_literals_;
   // Deduplication map for 64-bit literals, used for non-patchable method address or method code.
   Uint64ToLiteralMap uint64_literals_;
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_;
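
A hedged usage sketch of the two-step label protocol documented above, mirroring the kBootImageRelRo invoke path earlier in this change; `codegen`, `boot_image_offset`, and `temp` stand in for the values available at that call site:

void EmitBootImageRelRoLoadSketch(CodeGeneratorARM64* codegen,
                                  uint32_t boot_image_offset,
                                  Location temp) {
  // First call (adrp_label defaulted to nullptr) registers the ADRP patch.
  vixl::aarch64::Label* adrp_label = codegen->NewBootImageRelRoPatch(boot_image_offset);
  codegen->EmitAdrpPlaceholder(adrp_label, XRegisterFrom(temp));
  // Second call ties the LDR patch to the ADRP so both target the same entry.
  vixl::aarch64::Label* ldr_label =
      codegen->NewBootImageRelRoPatch(boot_image_offset, adrp_label);
  // 32-bit load: the entry is 4 bytes, as the boot image lives below 4GiB.
  codegen->EmitLdrOffsetPlaceholder(ldr_label, WRegisterFrom(temp), XRegisterFrom(temp));
}
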
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 4fef027..94438df 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -4719,7 +4719,7 @@
   }
 }
 
-void InstructionCodeGeneratorARMVIXL::GenerateMinMax(LocationSummary* locations, bool is_min) {
+void InstructionCodeGeneratorARMVIXL::GenerateMinMaxInt(LocationSummary* locations, bool is_min) {
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
   Location out_loc = locations->Out();
@@ -4780,8 +4780,8 @@
   }
 }
 
-void InstructionCodeGeneratorARMVIXL::GenerateMinMaxFloat(HInstruction* min_max, bool is_min) {
-  LocationSummary* locations = min_max->GetLocations();
+void InstructionCodeGeneratorARMVIXL::GenerateMinMaxFloat(HInstruction* minmax, bool is_min) {
+  LocationSummary* locations = minmax->GetLocations();
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
   Location out_loc = locations->Out();
@@ -4800,7 +4800,7 @@
   const vixl32::Register temp1 = temps.Acquire();
   vixl32::Register temp2 = RegisterFrom(locations->GetTemp(0));
   vixl32::Label nan, done;
-  vixl32::Label* final_label = codegen_->GetFinalLabel(min_max, &done);
+  vixl32::Label* final_label = codegen_->GetFinalLabel(minmax, &done);
 
   DCHECK(op1.Is(out));
 
@@ -4841,8 +4841,8 @@
   }
 }
 
-void InstructionCodeGeneratorARMVIXL::GenerateMinMaxDouble(HInstruction* min_max, bool is_min) {
-  LocationSummary* locations = min_max->GetLocations();
+void InstructionCodeGeneratorARMVIXL::GenerateMinMaxDouble(HInstruction* minmax, bool is_min) {
+  LocationSummary* locations = minmax->GetLocations();
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
   Location out_loc = locations->Out();
@@ -4857,7 +4857,7 @@
   vixl32::DRegister op2 = DRegisterFrom(op2_loc);
   vixl32::DRegister out = DRegisterFrom(out_loc);
   vixl32::Label handle_nan_eq, done;
-  vixl32::Label* final_label = codegen_->GetFinalLabel(min_max, &done);
+  vixl32::Label* final_label = codegen_->GetFinalLabel(minmax, &done);
 
   DCHECK(op1.Is(out));
 
@@ -4892,27 +4892,32 @@
   }
 }
 
+void InstructionCodeGeneratorARMVIXL::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min);
+      break;
+    case DataType::Type::kInt64:
+      GenerateMinMaxLong(minmax->GetLocations(), is_min);
+      break;
+    case DataType::Type::kFloat32:
+      GenerateMinMaxFloat(minmax, is_min);
+      break;
+    case DataType::Type::kFloat64:
+      GenerateMinMaxDouble(minmax, is_min);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderARMVIXL::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitMin(HMin* min) {
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true);
-      break;
-    case DataType::Type::kInt64:
-      GenerateMinMaxLong(min->GetLocations(), /*is_min*/ true);
-      break;
-    case DataType::Type::kFloat32:
-      GenerateMinMaxFloat(min, /*is_min*/ true);
-      break;
-    case DataType::Type::kFloat64:
-      GenerateMinMaxDouble(min, /*is_min*/ true);
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderARMVIXL::VisitMax(HMax* max) {
@@ -4920,22 +4925,7 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitMax(HMax* max) {
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false);
-      break;
-    case DataType::Type::kInt64:
-      GenerateMinMaxLong(max->GetLocations(), /*is_min*/ false);
-      break;
-    case DataType::Type::kFloat32:
-      GenerateMinMaxFloat(max, /*is_min*/ false);
-      break;
-    case DataType::Type::kFloat64:
-      GenerateMinMaxDouble(max, /*is_min*/ false);
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderARMVIXL::VisitAbs(HAbs* abs) {
@@ -9204,6 +9194,14 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ Mov(RegisterFrom(temp), Operand::From(invoke->GetMethodAddress()));
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      uint32_t boot_image_offset = invoke->GetDispatchInfo().method_load_data;
+      PcRelativePatchInfo* labels = NewBootImageRelRoPatch(boot_image_offset);
+      vixl32::Register temp_reg = RegisterFrom(temp);
+      EmitMovwMovtPlaceholder(labels, temp_reg);
+      GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, temp_reg, /* offset */ 0);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
       PcRelativePatchInfo* labels = NewMethodBssEntryPatch(
           MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex()));
@@ -9301,6 +9299,13 @@
   }
 }
 
+CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewBootImageRelRoPatch(
+    uint32_t boot_image_offset) {
+  return NewPcRelativePatch(/* dex_file */ nullptr,
+                            boot_image_offset,
+                            &boot_image_method_patches_);
+}
+
 CodeGeneratorARMVIXL::PcRelativePatchInfo* CodeGeneratorARMVIXL::NewBootImageMethodPatch(
     MethodReference target_method) {
   return NewPcRelativePatch(
@@ -9391,6 +9396,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -9410,7 +9423,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 726a2f9..054acbc 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -349,10 +349,11 @@
                       bool value_can_be_null);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
-  void GenerateMinMax(LocationSummary* locations, bool is_min);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min);
   void GenerateMinMaxLong(LocationSummary* locations, bool is_min);
-  void GenerateMinMaxFloat(HInstruction* min_max, bool is_min);
-  void GenerateMinMaxDouble(HInstruction* min_max, bool is_min);
+  void GenerateMinMaxFloat(HInstruction* minmax, bool is_min);
+  void GenerateMinMaxDouble(HInstruction* minmax, bool is_min);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
 
   // Generate a heap reference load using one register `out`:
   //
@@ -579,6 +580,7 @@
     vixl::aarch32::Label add_pc_label;
   };
 
+  PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset);
   PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method);
   PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method);
   PcRelativePatchInfo* NewBootImageTypePatch(const DexFile& dex_file, dex::TypeIndex type_index);
@@ -803,7 +805,7 @@
 
   // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
   Uint32ToLiteralMap uint32_literals_;
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index ae42bbc..11c1163 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1597,6 +1597,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -1615,7 +1623,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
@@ -1630,6 +1639,13 @@
   DCHECK_EQ(size, linker_patches->size());
 }
 
+CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewBootImageRelRoPatch(
+    uint32_t boot_image_offset,
+    const PcRelativePatchInfo* info_high) {
+  return NewPcRelativePatch(
+      /* dex_file */ nullptr, boot_image_offset, info_high, &boot_image_method_patches_);
+}
+
 CodeGeneratorMIPS::PcRelativePatchInfo* CodeGeneratorMIPS::NewBootImageMethodPatch(
     MethodReference target_method,
     const PcRelativePatchInfo* info_high) {
@@ -7835,6 +7851,15 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ LoadConst32(temp.AsRegister<Register>(), invoke->GetMethodAddress());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      uint32_t boot_image_offset = invoke->GetDispatchInfo().method_load_data;
+      PcRelativePatchInfo* info_high = NewBootImageRelRoPatch(boot_image_offset);
+      PcRelativePatchInfo* info_low = NewBootImageRelRoPatch(boot_image_offset, info_high);
+      Register temp_reg = temp.AsRegister<Register>();
+      EmitPcRelativeAddressPlaceholderHigh(info_high, TMP, base_reg);
+      __ Lw(temp_reg, TMP, /* placeholder */ 0x5678, &info_low->label);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
       PcRelativePatchInfo* info_high = NewMethodBssEntryPatch(
           MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex()));
@@ -8799,10 +8824,10 @@
   }
 }
 
-void InstructionCodeGeneratorMIPS::GenerateMinMax(LocationSummary* locations,
-                                                  bool is_min,
-                                                  bool isR6,
-                                                  DataType::Type type) {
+void InstructionCodeGeneratorMIPS::GenerateMinMaxInt(LocationSummary* locations,
+                                                     bool is_min,
+                                                     bool isR6,
+                                                     DataType::Type type) {
   if (isR6) {
     // Some architectures, such as ARM and MIPS (prior to r6), have a
     // conditional move instruction which only changes the target
@@ -9130,24 +9155,29 @@
   }
 }
 
+void InstructionCodeGeneratorMIPS::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min, isR6, type);
+      break;
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      GenerateMinMaxFP(minmax->GetLocations(), is_min, isR6, type);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderMIPS::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
 void InstructionCodeGeneratorMIPS::VisitMin(HMin* min) {
-  bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true, isR6, min->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(min->GetLocations(), /*is_min*/ true, isR6, min->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderMIPS::VisitMax(HMax* max) {
@@ -9155,19 +9185,7 @@
 }
 
 void InstructionCodeGeneratorMIPS::VisitMax(HMax* max) {
-  bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false, isR6, max->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(max->GetLocations(), /*is_min*/ false, isR6, max->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderMIPS::VisitAbs(HAbs* abs) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index ae5fe5b..2be8e2e 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -246,8 +246,9 @@
                       bool value_can_be_null);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc);
 
-  void GenerateMinMax(LocationSummary* locations, bool is_min, bool isR6, DataType::Type type);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min, bool isR6, DataType::Type type);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, bool isR6, DataType::Type type);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
   void GenerateAbsFP(LocationSummary* locations, DataType::Type type, bool isR2OrNewer, bool isR6);
 
   // Generate a heap reference load using one register `out`:
@@ -619,6 +620,8 @@
     DISALLOW_COPY_AND_ASSIGN(PcRelativePatchInfo);
   };
 
+  PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset,
+                                              const PcRelativePatchInfo* info_high = nullptr);
   PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method,
                                                const PcRelativePatchInfo* info_high = nullptr);
   PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method,
@@ -693,7 +696,7 @@
 
   // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
   Uint32ToLiteralMap uint32_literals_;
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 8031cca..d08a065 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1509,6 +1509,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -1527,7 +1535,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
@@ -1542,6 +1551,13 @@
   DCHECK_EQ(size, linker_patches->size());
 }
 
+CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewBootImageRelRoPatch(
+    uint32_t boot_image_offset,
+    const PcRelativePatchInfo* info_high) {
+  return NewPcRelativePatch(
+      /* dex_file */ nullptr, boot_image_offset, info_high, &boot_image_method_patches_);
+}
+
 CodeGeneratorMIPS64::PcRelativePatchInfo* CodeGeneratorMIPS64::NewBootImageMethodPatch(
     MethodReference target_method,
     const PcRelativePatchInfo* info_high) {
@@ -5926,6 +5942,15 @@
                      kLoadDoubleword,
                      DeduplicateUint64Literal(invoke->GetMethodAddress()));
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      uint32_t boot_image_offset = invoke->GetDispatchInfo().method_load_data;
+      PcRelativePatchInfo* info_high = NewBootImageRelRoPatch(boot_image_offset);
+      PcRelativePatchInfo* info_low = NewBootImageRelRoPatch(boot_image_offset, info_high);
+      EmitPcRelativeAddressPlaceholderHigh(info_high, AT, info_low);
+      // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load.
+      __ Lwu(temp.AsRegister<GpuRegister>(), AT, /* placeholder */ 0x5678);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
       PcRelativePatchInfo* info_high = NewMethodBssEntryPatch(
           MethodReference(&GetGraph()->GetDexFile(), invoke->GetDexMethodIndex()));
@@ -6685,7 +6710,7 @@
   }
 }
 
-void InstructionCodeGeneratorMIPS64::GenerateMinMax(LocationSummary* locations, bool is_min) {
+void InstructionCodeGeneratorMIPS64::GenerateMinMaxInt(LocationSummary* locations, bool is_min) {
   GpuRegister lhs = locations->InAt(0).AsRegister<GpuRegister>();
   GpuRegister rhs = locations->InAt(1).AsRegister<GpuRegister>();
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
@@ -6809,23 +6834,28 @@
   __ Bind(&done);
 }
 
+void InstructionCodeGeneratorMIPS64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min);
+      break;
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      GenerateMinMaxFP(minmax->GetLocations(), is_min, type);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderMIPS64::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
 void InstructionCodeGeneratorMIPS64::VisitMin(HMin* min) {
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true);
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderMIPS64::VisitMax(HMax* max) {
@@ -6833,18 +6863,7 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitMax(HMax* max) {
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false);
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderMIPS64::VisitAbs(HAbs* abs) {
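
The "low 4GiB" note recurs across this change (Ldr to a W register on arm64, Lwu above, movl on x86-64): each .data.bimg.rel.ro entry stores a 32-bit boot image address, so a zero-extending 32-bit load is enough to rebuild the full pointer. A hedged host-side illustration (not ART code):

#include <cstdint>
// Illustrative only: the entry is 4 bytes; zero-extension recovers the
// 64-bit pointer because the boot image is mapped below 4GiB.
inline void* LoadBootImageRelRoEntry(const uint32_t* entry) {
  return reinterpret_cast<void*>(static_cast<uintptr_t>(*entry));
}
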
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 5d925d5..5d40307 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -242,8 +242,9 @@
                       bool value_can_be_null);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
-  void GenerateMinMax(LocationSummary* locations, bool is_min);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
 
   // Generate a heap reference load using one register `out`:
   //
@@ -589,6 +590,8 @@
     DISALLOW_COPY_AND_ASSIGN(PcRelativePatchInfo);
   };
 
+  PcRelativePatchInfo* NewBootImageRelRoPatch(uint32_t boot_image_offset,
+                                              const PcRelativePatchInfo* info_high = nullptr);
   PcRelativePatchInfo* NewBootImageMethodPatch(MethodReference target_method,
                                                const PcRelativePatchInfo* info_high = nullptr);
   PcRelativePatchInfo* NewMethodBssEntryPatch(MethodReference target_method,
@@ -658,7 +661,7 @@
   // Deduplication map for 64-bit literals, used for non-patchable method address or method code
   // address.
   Uint64ToLiteralMap uint64_literals_;
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<PcRelativePatchInfo> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<PcRelativePatchInfo> method_bss_entry_patches_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 536909a..528930a 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -3836,9 +3836,9 @@
   }
 }
 
-void InstructionCodeGeneratorX86::GenerateMinMax(LocationSummary* locations,
-                                                 bool is_min,
-                                                 DataType::Type type) {
+void InstructionCodeGeneratorX86::GenerateMinMaxInt(LocationSummary* locations,
+                                                    bool is_min,
+                                                    DataType::Type type) {
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
 
@@ -3978,23 +3978,28 @@
   __ Bind(&done);
 }
 
+void InstructionCodeGeneratorX86::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min, type);
+      break;
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      GenerateMinMaxFP(minmax->GetLocations(), is_min, type);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderX86::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
 void InstructionCodeGeneratorX86::VisitMin(HMin* min) {
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderX86::VisitMax(HMax* max) {
@@ -4002,18 +4007,7 @@
 }
 
 void InstructionCodeGeneratorX86::VisitMax(HMax* max) {
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderX86::VisitAbs(HAbs* abs) {
@@ -4838,6 +4832,15 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       __ movl(temp.AsRegister<Register>(), Immediate(invoke->GetMethodAddress()));
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke,
+                                                                temp.AsRegister<Register>());
+      __ movl(temp.AsRegister<Register>(), Address(base_reg, kDummy32BitOffset));
+      RecordBootImageRelRoPatch(
+          invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress(),
+          invoke->GetDispatchInfo().method_load_data);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
       Register base_reg = GetInvokeStaticOrDirectExtraParameter(invoke,
                                                                 temp.AsRegister<Register>());
@@ -4899,6 +4902,13 @@
   RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 }
 
+void CodeGeneratorX86::RecordBootImageRelRoPatch(HX86ComputeBaseMethodAddress* method_address,
+                                                 uint32_t boot_image_offset) {
+  boot_image_method_patches_.emplace_back(
+      method_address, /* target_dex_file */ nullptr, boot_image_offset);
+  __ Bind(&boot_image_method_patches_.back().label);
+}
+
 void CodeGeneratorX86::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) {
   DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   HX86ComputeBaseMethodAddress* method_address =
@@ -4968,6 +4978,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -4986,7 +5004,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
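
On x86 there is no PC-relative data addressing, so the kBootImageRelRo case above reuses the HX86ComputeBaseMethodAddress base register plus a kDummy32BitOffset displacement; RecordBootImageRelRoPatch binds a label right after the movl so the linker can find and rewrite those 4 bytes. Illustrative fixup arithmetic, under the assumption that the displacement is simply the entry address minus the base address (a hypothetical helper, not ART's linker code):

#include <cstdint>
// Hypothetical: what the later fixup conceptually computes for the movl's
// 32-bit displacement slot recorded above.
inline int32_t RelRoEntryDisplacement(uint32_t entry_address, uint32_t base_address) {
  return static_cast<int32_t>(entry_address - base_address);
}
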
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 82496d1..2dc34e8 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -225,8 +225,9 @@
   void GenerateShlLong(const Location& loc, int shift);
   void GenerateShrLong(const Location& loc, int shift);
   void GenerateUShrLong(const Location& loc, int shift);
-  void GenerateMinMax(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
 
   void HandleFieldSet(HInstruction* instruction,
                       const FieldInfo& field_info,
@@ -416,6 +417,8 @@
   void GenerateVirtualCall(
       HInvokeVirtual* invoke, Location temp, SlowPathCode* slow_path = nullptr) OVERRIDE;
 
+  void RecordBootImageRelRoPatch(HX86ComputeBaseMethodAddress* method_address,
+                                 uint32_t boot_image_offset);
   void RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke);
   void RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke);
   void RecordBootImageTypePatch(HLoadClass* load_class);
@@ -633,7 +636,7 @@
   X86Assembler assembler_;
   const X86InstructionSetFeatures& isa_features_;
 
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<X86PcRelativePatchInfo> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<X86PcRelativePatchInfo> method_bss_entry_patches_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index bb1fbc5..d599724 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -998,6 +998,13 @@
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       Load64BitValue(temp.AsRegister<CpuRegister>(), invoke->GetMethodAddress());
       break;
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo: {
+      // Note: Boot image is in the low 4GiB and the entry is 32-bit, so emit a 32-bit load.
+      __ movl(temp.AsRegister<CpuRegister>(),
+              Address::Absolute(kDummy32BitOffset, /* no_rip */ false));
+      RecordBootImageRelRoPatch(invoke->GetDispatchInfo().method_load_data);
+      break;
+    }
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry: {
       __ movq(temp.AsRegister<CpuRegister>(),
               Address::Absolute(kDummy32BitOffset, /* no_rip */ false));
@@ -1059,6 +1066,11 @@
   RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 }
 
+void CodeGeneratorX86_64::RecordBootImageRelRoPatch(uint32_t boot_image_offset) {
+  boot_image_method_patches_.emplace_back(/* target_dex_file */ nullptr, boot_image_offset);
+  __ Bind(&boot_image_method_patches_.back().label);
+}
+
 void CodeGeneratorX86_64::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) {
   boot_image_method_patches_.emplace_back(
       invoke->GetTargetMethod().dex_file, invoke->GetTargetMethod().index);
@@ -1110,6 +1122,14 @@
   }
 }
 
+linker::LinkerPatch DataBimgRelRoPatchAdapter(size_t literal_offset,
+                                              const DexFile* target_dex_file,
+                                              uint32_t pc_insn_offset,
+                                              uint32_t boot_image_offset) {
+  DCHECK(target_dex_file == nullptr);  // Unused for DataBimgRelRoPatch(), should be null.
+  return linker::LinkerPatch::DataBimgRelRoPatch(literal_offset, pc_insn_offset, boot_image_offset);
+}
+
 void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
@@ -1128,7 +1148,8 @@
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
         boot_image_string_patches_, linker_patches);
   } else {
-    DCHECK(boot_image_method_patches_.empty());
+    EmitPcRelativeLinkerPatches<DataBimgRelRoPatchAdapter>(
+        boot_image_method_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
         boot_image_type_patches_, linker_patches);
     EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
@@ -3843,9 +3864,9 @@
   }
 }
 
-void InstructionCodeGeneratorX86_64::GenerateMinMax(LocationSummary* locations,
-                                                    bool is_min,
-                                                    DataType::Type type) {
+void InstructionCodeGeneratorX86_64::GenerateMinMaxInt(LocationSummary* locations,
+                                                       bool is_min,
+                                                       DataType::Type type) {
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
 
@@ -3960,23 +3981,28 @@
   __ Bind(&done);
 }
 
+void InstructionCodeGeneratorX86_64::GenerateMinMax(HBinaryOperation* minmax, bool is_min) {
+  DataType::Type type = minmax->GetResultType();
+  switch (type) {
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      GenerateMinMaxInt(minmax->GetLocations(), is_min, type);
+      break;
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      GenerateMinMaxFP(minmax->GetLocations(), is_min, type);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for HMinMax " << type;
+  }
+}
+
 void LocationsBuilderX86_64::VisitMin(HMin* min) {
   CreateMinMaxLocations(GetGraph()->GetAllocator(), min);
 }
 
 void InstructionCodeGeneratorX86_64::VisitMin(HMin* min) {
-  switch (min->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(min->GetLocations(), /*is_min*/ true, min->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMin " << min->GetResultType();
-  }
+  GenerateMinMax(min, /*is_min*/ true);
 }
 
 void LocationsBuilderX86_64::VisitMax(HMax* max) {
@@ -3984,18 +4010,7 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitMax(HMax* max) {
-  switch (max->GetResultType()) {
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      GenerateMinMax(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      GenerateMinMaxFP(max->GetLocations(), /*is_min*/ false, max->GetResultType());
-      break;
-    default:
-      LOG(FATAL) << "Unexpected type for HMax " << max->GetResultType();
-  }
+  GenerateMinMax(max, /*is_min*/ false);
 }
 
 void LocationsBuilderX86_64::VisitAbs(HAbs* abs) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 933afda..5c8ed6c 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -222,8 +222,9 @@
                       bool value_can_be_null);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
-  void GenerateMinMax(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type);
+  void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
 
   // Generate a heap reference load using one register `out`:
   //
@@ -413,6 +414,7 @@
   void GenerateVirtualCall(
       HInvokeVirtual* invoke, Location temp, SlowPathCode* slow_path = nullptr) OVERRIDE;
 
+  void RecordBootImageRelRoPatch(uint32_t boot_image_offset);
   void RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke);
   void RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke);
   void RecordBootImageTypePatch(HLoadClass* load_class);
@@ -607,7 +609,7 @@
   // Used for fixups to the constant area.
   int constant_area_start_;
 
-  // PC-relative method patch info for kBootImageLinkTimePcRelative.
+  // PC-relative method patch info for kBootImageLinkTimePcRelative/kBootImageRelRo.
   ArenaDeque<PatchInfo<Label>> boot_image_method_patches_;
   // PC-relative method patch info for kBssEntry.
   ArenaDeque<PatchInfo<Label>> method_bss_entry_patches_;
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index d108c43..bc1292b 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -58,6 +58,10 @@
   return codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint();
 }
 
+inline bool IntrinsicCodeGeneratorMIPS::HasMsa() const {
+  return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
 #define __ codegen->GetAssembler()->
 
 static void MoveFromReturnRegister(Location trg,
@@ -612,6 +616,7 @@
 static void GenBitCount(LocationSummary* locations,
                         DataType::Type type,
                         bool isR6,
+                        bool hasMsa,
                         MipsAssembler* assembler) {
   Register out = locations->Out().AsRegister<Register>();
 
@@ -637,85 +642,102 @@
   // instructions compared to a loop-based algorithm which required 47
   // instructions.
 
-  if (type == DataType::Type::kInt32) {
-    Register in = locations->InAt(0).AsRegister<Register>();
-
-    __ Srl(TMP, in, 1);
-    __ LoadConst32(AT, 0x55555555);
-    __ And(TMP, TMP, AT);
-    __ Subu(TMP, in, TMP);
-    __ LoadConst32(AT, 0x33333333);
-    __ And(out, TMP, AT);
-    __ Srl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Addu(TMP, out, TMP);
-    __ Srl(out, TMP, 4);
-    __ Addu(out, out, TMP);
-    __ LoadConst32(AT, 0x0F0F0F0F);
-    __ And(out, out, AT);
-    __ LoadConst32(TMP, 0x01010101);
-    if (isR6) {
-      __ MulR6(out, out, TMP);
+  if (hasMsa) {
+    if (type == DataType::Type::kInt32) {
+      Register in = locations->InAt(0).AsRegister<Register>();
+      __ Mtc1(in, FTMP);
+      __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
     } else {
-      __ MulR2(out, out, TMP);
+      DCHECK_EQ(type, DataType::Type::kInt64);
+      Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+      Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+      __ Mtc1(in_lo, FTMP);
+      __ Mthc1(in_hi, FTMP);
+      __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
     }
-    __ Srl(out, out, 24);
   } else {
-    DCHECK_EQ(type, DataType::Type::kInt64);
-    Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
-    Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
-    Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
-    Register out_hi = locations->GetTemp(1).AsRegister<Register>();
-    Register tmp_lo = TMP;
-    Register out_lo = out;
+    if (type == DataType::Type::kInt32) {
+      Register in = locations->InAt(0).AsRegister<Register>();
 
-    __ Srl(tmp_lo, in_lo, 1);
-    __ Srl(tmp_hi, in_hi, 1);
-
-    __ LoadConst32(AT, 0x55555555);
-
-    __ And(tmp_lo, tmp_lo, AT);
-    __ Subu(tmp_lo, in_lo, tmp_lo);
-
-    __ And(tmp_hi, tmp_hi, AT);
-    __ Subu(tmp_hi, in_hi, tmp_hi);
-
-    __ LoadConst32(AT, 0x33333333);
-
-    __ And(out_lo, tmp_lo, AT);
-    __ Srl(tmp_lo, tmp_lo, 2);
-    __ And(tmp_lo, tmp_lo, AT);
-    __ Addu(tmp_lo, out_lo, tmp_lo);
-
-    __ And(out_hi, tmp_hi, AT);
-    __ Srl(tmp_hi, tmp_hi, 2);
-    __ And(tmp_hi, tmp_hi, AT);
-    __ Addu(tmp_hi, out_hi, tmp_hi);
-
-    // Here we deviate from the original algorithm a bit. We've reached
-    // the stage where the bitfields holding the subtotals are large
-    // enough to hold the combined subtotals for both the low word, and
-    // the high word. This means that we can add the subtotals for the
-    // the high, and low words into a single word, and compute the final
-    // result for both the high, and low words using fewer instructions.
-    __ LoadConst32(AT, 0x0F0F0F0F);
-
-    __ Addu(TMP, tmp_hi, tmp_lo);
-
-    __ Srl(out, TMP, 4);
-    __ And(out, out, AT);
-    __ And(TMP, TMP, AT);
-    __ Addu(out, out, TMP);
-
-    __ LoadConst32(AT, 0x01010101);
-
-    if (isR6) {
-      __ MulR6(out, out, AT);
+      __ Srl(TMP, in, 1);
+      __ LoadConst32(AT, 0x55555555);
+      __ And(TMP, TMP, AT);
+      __ Subu(TMP, in, TMP);
+      __ LoadConst32(AT, 0x33333333);
+      __ And(out, TMP, AT);
+      __ Srl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Addu(TMP, out, TMP);
+      __ Srl(out, TMP, 4);
+      __ Addu(out, out, TMP);
+      __ LoadConst32(AT, 0x0F0F0F0F);
+      __ And(out, out, AT);
+      __ LoadConst32(TMP, 0x01010101);
+      if (isR6) {
+        __ MulR6(out, out, TMP);
+      } else {
+        __ MulR2(out, out, TMP);
+      }
+      __ Srl(out, out, 24);
     } else {
-      __ MulR2(out, out, AT);
-    }
+      DCHECK_EQ(type, DataType::Type::kInt64);
+      Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+      Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+      Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
+      Register out_hi = locations->GetTemp(1).AsRegister<Register>();
+      Register tmp_lo = TMP;
+      Register out_lo = out;
 
-    __ Srl(out, out, 24);
+      __ Srl(tmp_lo, in_lo, 1);
+      __ Srl(tmp_hi, in_hi, 1);
+
+      __ LoadConst32(AT, 0x55555555);
+
+      __ And(tmp_lo, tmp_lo, AT);
+      __ Subu(tmp_lo, in_lo, tmp_lo);
+
+      __ And(tmp_hi, tmp_hi, AT);
+      __ Subu(tmp_hi, in_hi, tmp_hi);
+
+      __ LoadConst32(AT, 0x33333333);
+
+      __ And(out_lo, tmp_lo, AT);
+      __ Srl(tmp_lo, tmp_lo, 2);
+      __ And(tmp_lo, tmp_lo, AT);
+      __ Addu(tmp_lo, out_lo, tmp_lo);
+
+      __ And(out_hi, tmp_hi, AT);
+      __ Srl(tmp_hi, tmp_hi, 2);
+      __ And(tmp_hi, tmp_hi, AT);
+      __ Addu(tmp_hi, out_hi, tmp_hi);
+
+      // Here we deviate from the original algorithm a bit. We've reached
+      // the stage where the bitfields holding the subtotals are large
+      // enough to hold the combined subtotals for both the low and the
+      // high word. This means that we can add the subtotals for the high
+      // and low words into a single word, and compute the final result
+      // for both words using fewer instructions.
+      __ LoadConst32(AT, 0x0F0F0F0F);
+
+      __ Addu(TMP, tmp_hi, tmp_lo);
+
+      __ Srl(out, TMP, 4);
+      __ And(out, out, AT);
+      __ And(TMP, TMP, AT);
+      __ Addu(out, out, TMP);
+
+      __ LoadConst32(AT, 0x01010101);
+
+      if (isR6) {
+        __ MulR6(out, out, AT);
+      } else {
+        __ MulR2(out, out, AT);
+      }
+
+      __ Srl(out, out, 24);
+    }
   }
 }
 
@@ -725,7 +747,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS::VisitIntegerBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), HasMsa(), GetAssembler());
 }
 
 // int java.lang.Long.bitCount(int)
@@ -739,7 +761,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS::VisitLongBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), HasMsa(), GetAssembler());
 }
 
 // double java.lang.Math.sqrt(double)
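
Note: the scalar fallback kept above is the classic SWAR (divide-and-conquer) population count, and the "deviate from the original algorithm" comment refers to adding the 4-bit subtotals of the low and high words before the final reduction. A standalone C++ sketch of both variants (illustrative, not ART code):

    #include <cassert>
    #include <cstdint>

    // 32-bit population count using the same mask/add steps as the scalar path.
    static uint32_t PopCount32(uint32_t x) {
      x = x - ((x >> 1) & 0x55555555u);                  // 2-bit subtotals
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // 4-bit subtotals
      x = (x + (x >> 4)) & 0x0F0F0F0Fu;                  // 8-bit subtotals
      return (x * 0x01010101u) >> 24;                    // sum the bytes into the top byte
    }

    // 64-bit count on a 32-bit register pair: run the first two reduction stages
    // on each word separately, then add the words once each 4-bit field (value at
    // most 4) can hold the combined subtotal (at most 8), saving instructions.
    static uint32_t PopCount64(uint32_t lo, uint32_t hi) {
      lo = lo - ((lo >> 1) & 0x55555555u);
      hi = hi - ((hi >> 1) & 0x55555555u);
      lo = (lo & 0x33333333u) + ((lo >> 2) & 0x33333333u);
      hi = (hi & 0x33333333u) + ((hi >> 2) & 0x33333333u);
      uint32_t x = lo + hi;                              // combined 4-bit subtotals
      x = ((x >> 4) & 0x0F0F0F0Fu) + (x & 0x0F0F0F0Fu);  // bytes hold sums up to 16
      return (x * 0x01010101u) >> 24;                    // at most 64 in the top byte
    }

    int main() {
      assert(PopCount32(0x80000001u) == 2);
      assert(PopCount32(0xFFFFFFFFu) == 32);
      assert(PopCount64(0xFFFFFFFFu, 0xFFFFFFFFu) == 64);
      assert(PopCount64(0x00000000u, 0x00000001u) == 1);
      return 0;
    }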
diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h
index 13397f1..1c1ba40 100644
--- a/compiler/optimizing/intrinsics_mips.h
+++ b/compiler/optimizing/intrinsics_mips.h
@@ -71,6 +71,7 @@
   bool IsR2OrNewer() const;
   bool IsR6() const;
   bool Is32BitFPU() const;
+  bool HasMsa() const;
 
  private:
   MipsAssembler* GetAssembler();
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 9987d05..f429afd 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -46,6 +46,10 @@
   return codegen_->GetGraph()->GetAllocator();
 }
 
+inline bool IntrinsicCodeGeneratorMIPS64::HasMsa() const {
+  return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
 #define __ codegen->GetAssembler()->
 
 static void MoveFromReturnRegister(Location trg,
@@ -386,6 +390,7 @@
 
 static void GenBitCount(LocationSummary* locations,
                         const DataType::Type type,
+                        const bool hasMsa,
                         Mips64Assembler* assembler) {
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
   GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>();
@@ -414,41 +419,52 @@
   // bits are set but the algorithm here attempts to minimize the total
   // number of instructions executed even when a large number of bits
   // are set.
-
-  if (type == DataType::Type::kInt32) {
-    __ Srl(TMP, in, 1);
-    __ LoadConst32(AT, 0x55555555);
-    __ And(TMP, TMP, AT);
-    __ Subu(TMP, in, TMP);
-    __ LoadConst32(AT, 0x33333333);
-    __ And(out, TMP, AT);
-    __ Srl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Addu(TMP, out, TMP);
-    __ Srl(out, TMP, 4);
-    __ Addu(out, out, TMP);
-    __ LoadConst32(AT, 0x0F0F0F0F);
-    __ And(out, out, AT);
-    __ LoadConst32(TMP, 0x01010101);
-    __ MulR6(out, out, TMP);
-    __ Srl(out, out, 24);
-  } else if (type == DataType::Type::kInt64) {
-    __ Dsrl(TMP, in, 1);
-    __ LoadConst64(AT, 0x5555555555555555L);
-    __ And(TMP, TMP, AT);
-    __ Dsubu(TMP, in, TMP);
-    __ LoadConst64(AT, 0x3333333333333333L);
-    __ And(out, TMP, AT);
-    __ Dsrl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Daddu(TMP, out, TMP);
-    __ Dsrl(out, TMP, 4);
-    __ Daddu(out, out, TMP);
-    __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
-    __ And(out, out, AT);
-    __ LoadConst64(TMP, 0x0101010101010101L);
-    __ Dmul(out, out, TMP);
-    __ Dsrl32(out, out, 24);
+  if (hasMsa) {
+    if (type == DataType::Type::kInt32) {
+      __ Mtc1(in, FTMP);
+      __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
+    } else {
+      __ Dmtc1(in, FTMP);
+      __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Dmfc1(out, FTMP);
+    }
+  } else {
+    if (type == DataType::Type::kInt32) {
+      __ Srl(TMP, in, 1);
+      __ LoadConst32(AT, 0x55555555);
+      __ And(TMP, TMP, AT);
+      __ Subu(TMP, in, TMP);
+      __ LoadConst32(AT, 0x33333333);
+      __ And(out, TMP, AT);
+      __ Srl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Addu(TMP, out, TMP);
+      __ Srl(out, TMP, 4);
+      __ Addu(out, out, TMP);
+      __ LoadConst32(AT, 0x0F0F0F0F);
+      __ And(out, out, AT);
+      __ LoadConst32(TMP, 0x01010101);
+      __ MulR6(out, out, TMP);
+      __ Srl(out, out, 24);
+    } else {
+      __ Dsrl(TMP, in, 1);
+      __ LoadConst64(AT, 0x5555555555555555L);
+      __ And(TMP, TMP, AT);
+      __ Dsubu(TMP, in, TMP);
+      __ LoadConst64(AT, 0x3333333333333333L);
+      __ And(out, TMP, AT);
+      __ Dsrl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Daddu(TMP, out, TMP);
+      __ Dsrl(out, TMP, 4);
+      __ Daddu(out, out, TMP);
+      __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
+      __ And(out, out, AT);
+      __ LoadConst64(TMP, 0x0101010101010101L);
+      __ Dmul(out, out, TMP);
+      __ Dsrl32(out, out, 24);
+    }
   }
 }
 
@@ -458,7 +474,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS64::VisitIntegerBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, HasMsa(), GetAssembler());
 }
 
 // int java.lang.Long.bitCount(long)
@@ -467,7 +483,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS64::VisitLongBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, HasMsa(), GetAssembler());
 }
 
 // double java.lang.Math.sqrt(double)
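
Note: on the MSA path the whole scalar sequence collapses to a single pcnt.w/pcnt.d (per-element population count). A standalone check that the 64-bit constants used in the scalar path compute an ordinary popcount (assumes a GCC/Clang toolchain for __builtin_popcountll; not ART code):

    #include <cassert>
    #include <cstdint>

    // Same steps and constants as the mips64 scalar path, in plain C++.
    static uint64_t SwarPopCount64(uint64_t x) {
      x = x - ((x >> 1) & 0x5555555555555555ULL);
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
      return (x * 0x0101010101010101ULL) >> 56;  // Dsrl32(out, out, 24) == shift by 56
    }

    int main() {
      uint64_t x = 0x123456789ABCDEF0ULL;
      for (int i = 0; i < 1000; ++i) {
        assert(SwarPopCount64(x) == static_cast<uint64_t>(__builtin_popcountll(x)));
        x = x * 6364136223846793005ULL + 1442695040888963407ULL;  // LCG step
      }
      return 0;
    }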
diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h
index 6f40d90..748b0b0 100644
--- a/compiler/optimizing/intrinsics_mips64.h
+++ b/compiler/optimizing/intrinsics_mips64.h
@@ -68,6 +68,8 @@
 #undef INTRINSICS_LIST
 #undef OPTIMIZING_INTRINSICS
 
+  bool HasMsa() const;
+
  private:
   Mips64Assembler* GetAssembler();
 
diff --git a/compiler/optimizing/loop_optimization_test.cc b/compiler/optimizing/loop_optimization_test.cc
index db83689..c21bd65 100644
--- a/compiler/optimizing/loop_optimization_test.cc
+++ b/compiler/optimizing/loop_optimization_test.cc
@@ -227,11 +227,14 @@
   graph_->ClearDominanceInformation();
   graph_->BuildDominatorTree();
 
+  // BuildDominatorTree inserts a block between the loop header and the entry block.
+  EXPECT_EQ(header->GetPredecessors()[0]->GetSinglePredecessor(), entry_block_);
+
   // Check that after optimizations in BuildDominatorTree()/SimplifyCFG() phi inputs
   // are still mapped correctly to the block predecessors.
   for (size_t i = 0, e = phi->InputCount(); i < e; i++) {
     HInstruction* input = phi->InputAt(i);
-    ASSERT_TRUE(input->GetBlock()->Dominates(header->GetPredecessors()[i]));
+    EXPECT_TRUE(input->GetBlock()->Dominates(header->GetPredecessors()[i]));
   }
 }
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index f6ba19f..a8ddb7c 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2891,6 +2891,8 @@
       return os << "BootImageLinkTimePcRelative";
     case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress:
       return os << "DirectAddress";
+    case HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo:
+      return os << "BootImageRelRo";
     case HInvokeStaticOrDirect::MethodLoadKind::kBssEntry:
       return os << "BssEntry";
     case HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall:
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 9da4620..d42f4a7 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -4431,6 +4431,10 @@
     // Used for app->boot calls with non-relocatable image and for JIT-compiled calls.
     kDirectAddress,
 
+    // Load from an entry in the .data.bimg.rel.ro section using a PC-relative load.
+    // Used for app->boot calls with relocatable image.
+    kBootImageRelRo,
+
     // Load from an entry in the .bss section using a PC-relative load.
     // Used for classes outside boot image when .bss is accessible with a PC-relative load.
     kBssEntry,
@@ -4563,6 +4567,7 @@
   bool HasMethodAddress() const { return GetMethodLoadKind() == MethodLoadKind::kDirectAddress; }
   bool HasPcRelativeMethodLoadKind() const {
     return GetMethodLoadKind() == MethodLoadKind::kBootImageLinkTimePcRelative ||
+           GetMethodLoadKind() == MethodLoadKind::kBootImageRelRo ||
            GetMethodLoadKind() == MethodLoadKind::kBssEntry;
   }
   bool HasCurrentMethodInput() const {
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index 1e49411..b65628e 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -125,8 +125,14 @@
              BootImageAOTCanEmbedMethod(callee, compiler_driver)) {
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative;
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
+  } else if (IsInBootImage(callee)) {
+    // Use PC-relative access to the .data.bimg.rel.ro methods array.
+    method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBootImageRelRo;
+    uint8_t* begin = Runtime::Current()->GetHeap()->GetBootImageSpaces().front()->Begin();
+    method_load_data = reinterpret_cast<uintptr_t>(callee) - reinterpret_cast<uintptr_t>(begin);
+    code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   } else {
-    // Use PC-relative access to the .bss methods arrays.
+    // Use PC-relative access to the .bss methods array.
     method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kBssEntry;
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   }
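
Note: in the new kBootImageRelRo branch, method_load_data is the callee's offset from the start of the boot image; at load time the corresponding .data.bimg.rel.ro slot is patched with the relocated absolute address (and the section made read-only), which the compiled code then reads with a PC-relative load. A toy model of that arithmetic (all names and addresses illustrative, not ART's actual layout):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      // Pretend the boot image is mapped at some base and a method lives inside it.
      const uintptr_t boot_image_begin = 0x70000000u;
      const uintptr_t callee_address = 0x70001234u;

      // Compile time: record only the offset, since the image may be relocated.
      const uintptr_t method_load_data = callee_address - boot_image_begin;

      // Load time: the .data.bimg.rel.ro entry is patched to base + offset,
      // then the section is made read-only.
      std::vector<uintptr_t> data_bimg_rel_ro;
      data_bimg_rel_ro.push_back(boot_image_begin + method_load_data);

      // Run time: compiled code reads the entry with a PC-relative load.
      assert(data_bimg_rel_ro[0] == callee_address);
      return 0;
    }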
diff --git a/compiler/optimizing/superblock_cloner.cc b/compiler/optimizing/superblock_cloner.cc
index a7c23be..04942f9 100644
--- a/compiler/optimizing/superblock_cloner.cc
+++ b/compiler/optimizing/superblock_cloner.cc
@@ -28,6 +28,11 @@
 using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
 using HEdgeSet = SuperblockCloner::HEdgeSet;
 
+// When peeling, we can either keep the original loop (made of the original basic blocks)
+// and form the peeled iteration from the copy blocks (preserving the header), or transfer
+// the original loop blocks to the peeled iteration and create a new loop from the copy
+// blocks. The same choice applies to unrolling.
+static const bool kPeelUnrollPreserveHeader = true;
+
 void HEdge::Dump(std::ostream& stream) const {
   stream << "(" << from_ << "->" << to_ << ")";
 }
@@ -70,20 +75,18 @@
   return true;
 }
 
-// Returns a common predecessor of loop1 and loop2 in the loop tree or nullptr if it is the whole
-// graph.
-static HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2) {
-  if (loop1 != nullptr || loop2 != nullptr) {
-    return nullptr;
+// Returns whether two edge sets are equal (ArenaHashSet doesn't have an "Equal" method).
+static bool EdgeHashSetsEqual(const HEdgeSet* set1, const HEdgeSet* set2) {
+  if (set1->Size() != set2->Size()) {
+    return false;
   }
 
-  if (loop1->IsIn(*loop2)) {
-    return loop2;
-  } else if (loop2->IsIn(*loop1)) {
-    return loop1;
+  for (auto e : *set1) {
+    if (set2->Find(e) == set2->end()) {
+      return false;
+    }
   }
-  HBasicBlock* block = CommonDominator::ForPair(loop1->GetHeader(), loop2->GetHeader());
-  return block->GetLoopInformation();
+  return true;
 }
 
 // Calls HGraph::OrderLoopHeaderPredecessors for each loop in the graph.
@@ -95,6 +98,21 @@
   }
 }
 
+// Performs DFS on the subgraph (specified by 'bb_set') starting from the specified block; while
+// traversing, the function removes basic blocks from 'bb_set' (instead of the traditional DFS
+// 'marking'). Thus whatever is left in 'bb_set' after the traversal is not reachable from the
+// start block.
+static void TraverseSubgraphForConnectivity(HBasicBlock* block, HBasicBlockSet* bb_set) {
+  DCHECK(bb_set->IsBitSet(block->GetBlockId()));
+  bb_set->ClearBit(block->GetBlockId());
+
+  for (HBasicBlock* succ : block->GetSuccessors()) {
+    if (bb_set->IsBitSet(succ->GetBlockId())) {
+      TraverseSubgraphForConnectivity(succ, bb_set);
+    }
+  }
+}
+
 //
 // Helpers for CloneBasicBlock.
 //
@@ -268,7 +286,6 @@
 }
 
 void SuperblockCloner::RecalculateBackEdgesInfo(ArenaBitVector* outer_loop_bb_set) {
-  // TODO: DCHECK that after the transformation the graph is connected.
   HBasicBlock* block_entry = nullptr;
 
   if (outer_loop_ == nullptr) {
@@ -424,6 +441,11 @@
       outer_loop_ = nullptr;
       break;
     }
+    if (outer_loop_ == nullptr) {
+      // We should not use the initial outer_loop_ value 'nullptr' when finding the
+      // outermost common loop.
+      outer_loop_ = loop_exit_loop_info;
+    }
     outer_loop_ = FindCommonLoop(outer_loop_, loop_exit_loop_info);
   }
 
@@ -507,6 +529,34 @@
 // Debug and logging methods.
 //
 
+// Debug function to dump the graph's basic block info.
+void DumpBB(HGraph* graph) {
+  for (HBasicBlock* bb : graph->GetBlocks()) {
+    if (bb == nullptr) {
+      continue;
+    }
+    std::cout << bb->GetBlockId();
+    std::cout << " <- ";
+    for (HBasicBlock* pred : bb->GetPredecessors()) {
+      std::cout << pred->GetBlockId() << " ";
+    }
+    std::cout << " -> ";
+    for (HBasicBlock* succ : bb->GetSuccessors()) {
+      std::cout << succ->GetBlockId() << " ";
+    }
+
+    if (bb->GetDominator()) {
+      std::cout << " dom " << bb->GetDominator()->GetBlockId();
+    }
+
+    if (bb->GetLoopInformation()) {
+      std::cout << "\tloop: " << bb->GetLoopInformation()->GetHeader()->GetBlockId();
+    }
+
+    std::cout << std::endl;
+  }
+}
+
 void SuperblockCloner::CheckInstructionInputsRemapping(HInstruction* orig_instr) {
   DCHECK(!orig_instr->IsPhi());
   HInstruction* copy_instr = GetInstrCopy(orig_instr);
@@ -542,6 +592,82 @@
   }
 }
 
+bool SuperblockCloner::CheckRemappingInfoIsValid() {
+  for (HEdge edge : *remap_orig_internal_) {
+    if (!IsEdgeValid(edge, graph_) ||
+        !IsInOrigBBSet(edge.GetFrom()) ||
+        !IsInOrigBBSet(edge.GetTo())) {
+      return false;
+    }
+  }
+
+  for (auto edge : *remap_copy_internal_) {
+    if (!IsEdgeValid(edge, graph_) ||
+        !IsInOrigBBSet(edge.GetFrom()) ||
+        !IsInOrigBBSet(edge.GetTo())) {
+      return false;
+    }
+  }
+
+  for (auto edge : *remap_incoming_) {
+    if (!IsEdgeValid(edge, graph_) ||
+        IsInOrigBBSet(edge.GetFrom()) ||
+        !IsInOrigBBSet(edge.GetTo())) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void SuperblockCloner::VerifyGraph() {
+  for (auto it : *hir_map_) {
+    HInstruction* orig_instr = it.first;
+    HInstruction* copy_instr = it.second;
+    if (!orig_instr->IsPhi() && !orig_instr->IsSuspendCheck()) {
+      DCHECK(orig_instr->GetBlock() != nullptr);
+    }
+    if (!copy_instr->IsPhi() && !copy_instr->IsSuspendCheck()) {
+      DCHECK(copy_instr->GetBlock() != nullptr);
+    }
+  }
+
+  GraphChecker checker(graph_);
+  checker.Run();
+  if (!checker.IsValid()) {
+    for (const std::string& error : checker.GetErrors()) {
+      std::cout << error << std::endl;
+    }
+    LOG(FATAL) << "GraphChecker failed: superblock cloner\n";
+  }
+}
+
+void DumpBBSet(const ArenaBitVector* set) {
+  for (uint32_t idx : set->Indexes()) {
+    std::cout << idx << "\n";
+  }
+}
+
+void SuperblockCloner::DumpInputSets() {
+  std::cout << graph_->PrettyMethod() << "\n";
+  std::cout << "orig_bb_set:\n";
+  for (uint32_t idx : orig_bb_set_.Indexes()) {
+    std::cout << idx << "\n";
+  }
+  std::cout << "remap_orig_internal:\n";
+  for (HEdge e : *remap_orig_internal_) {
+    std::cout << e << "\n";
+  }
+  std::cout << "remap_copy_internal:\n";
+  for (auto e : *remap_copy_internal_) {
+    std::cout << e << "\n";
+  }
+  std::cout << "remap_incoming:\n";
+  for (auto e : *remap_incoming_) {
+    std::cout << e << "\n";
+  }
+}
+
 //
 // Public methods.
 //
@@ -569,6 +695,7 @@
   remap_orig_internal_ = remap_orig_internal;
   remap_copy_internal_ = remap_copy_internal;
   remap_incoming_ = remap_incoming;
+  DCHECK(CheckRemappingInfoIsValid());
 }
 
 bool SuperblockCloner::IsSubgraphClonable() const {
@@ -602,6 +729,63 @@
   return true;
 }
 
+bool SuperblockCloner::IsFastCase() const {
+  // Check that loop unrolling/loop peeling is being conducted.
+  // Check that all the basic blocks belong to the same loop.
+  bool flag = false;
+  HLoopInformation* common_loop_info = nullptr;
+  for (uint32_t idx : orig_bb_set_.Indexes()) {
+    HBasicBlock* block = GetBlockById(idx);
+    HLoopInformation* block_loop_info = block->GetLoopInformation();
+    if (!flag) {
+      common_loop_info = block_loop_info;
+      flag = true;  // Remember that the common loop candidate has been initialized.
+    } else if (block_loop_info != common_loop_info) {
+      return false;
+    }
+  }
+
+  // Check that orig_bb_set_ corresponds to loop peeling/unrolling.
+  if (common_loop_info == nullptr || !orig_bb_set_.SameBitsSet(&common_loop_info->GetBlocks())) {
+    return false;
+  }
+
+  bool peeling_or_unrolling = false;
+  HEdgeSet remap_orig_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_copy_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_incoming(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  // Check whether remapping info corresponds to loop unrolling.
+  CollectRemappingInfoForPeelUnroll(/* to_unroll */ true,
+                                    common_loop_info,
+                                    &remap_orig_internal,
+                                    &remap_copy_internal,
+                                    &remap_incoming);
+
+  peeling_or_unrolling |= EdgeHashSetsEqual(&remap_orig_internal, remap_orig_internal_) &&
+                          EdgeHashSetsEqual(&remap_copy_internal, remap_copy_internal_) &&
+                          EdgeHashSetsEqual(&remap_incoming, remap_incoming_);
+
+  remap_orig_internal.Clear();
+  remap_copy_internal.Clear();
+  remap_incoming.Clear();
+
+  // Check whether remapping info corresponds to loop peeling.
+  CollectRemappingInfoForPeelUnroll(/* to_unroll */ false,
+                                    common_loop_info,
+                                    &remap_orig_internal,
+                                    &remap_copy_internal,
+                                    &remap_incoming);
+
+  peeling_or_unrolling |= EdgeHashSetsEqual(&remap_orig_internal, remap_orig_internal_) &&
+                          EdgeHashSetsEqual(&remap_copy_internal, remap_copy_internal_) &&
+                          EdgeHashSetsEqual(&remap_incoming, remap_incoming_);
+
+  return peeling_or_unrolling;
+}
+
 void SuperblockCloner::Run() {
   DCHECK(bb_map_ != nullptr);
   DCHECK(hir_map_ != nullptr);
@@ -609,6 +793,11 @@
          remap_copy_internal_ != nullptr &&
          remap_incoming_ != nullptr);
   DCHECK(IsSubgraphClonable());
+  DCHECK(IsFastCase());
+
+  if (kSuperblockClonerLogging) {
+    DumpInputSets();
+  }
 
   // Find an area in the graph for which control flow information should be adjusted.
   FindAndSetLocalAreaForAdjustments();
@@ -618,6 +807,19 @@
   // Connect the blocks together/remap successors and fix phis which are directly affected by the
   // remapping.
   RemapEdgesSuccessors();
+
+  // Check that the subgraph is connected.
+  if (kIsDebugBuild) {
+    HBasicBlockSet work_set(arena_, orig_bb_set_.GetSizeOf(), true, kArenaAllocSuperblockCloner);
+
+    // Add original and copy blocks of the subgraph to the work set.
+    for (auto iter : *bb_map_) {
+      work_set.SetBit(iter.first->GetBlockId());   // Original block.
+      work_set.SetBit(iter.second->GetBlockId());  // Copy block.
+    }
+    CHECK(IsSubgraphConnected(&work_set, graph_));
+  }
+
   // Recalculate dominance and backedge information which is required by the next stage.
   AdjustControlFlowInfo();
   // Fix data flow of the graph.
@@ -650,6 +852,10 @@
       }
     }
   }
+
+  if (kSuperblockClonerVerify) {
+    VerifyGraph();
+  }
 }
 
 HBasicBlock* SuperblockCloner::CloneBasicBlock(const HBasicBlock* orig_block) {
@@ -701,4 +907,125 @@
   }
 }
 
+//
+// Stand-alone methods.
+//
+
+void CollectRemappingInfoForPeelUnroll(bool to_unroll,
+                                       HLoopInformation* loop_info,
+                                       HEdgeSet* remap_orig_internal,
+                                       HEdgeSet* remap_copy_internal,
+                                       HEdgeSet* remap_incoming) {
+  DCHECK(loop_info != nullptr);
+  HBasicBlock* loop_header = loop_info->GetHeader();
+  // Set up the remap_orig_internal and remap_copy_internal edge sets from the back edges.
+  for (HBasicBlock* back_edge_block : loop_info->GetBackEdges()) {
+    HEdge e = HEdge(back_edge_block, loop_header);
+    if (to_unroll) {
+      remap_orig_internal->Insert(e);
+      remap_copy_internal->Insert(e);
+    } else {
+      if (kPeelUnrollPreserveHeader) {
+        remap_copy_internal->Insert(e);
+      } else {
+        remap_orig_internal->Insert(e);
+      }
+    }
+  }
+
+  // Set up remap_incoming edges set.
+  if (to_unroll != kPeelUnrollPreserveHeader) {
+    remap_incoming->Insert(HEdge(loop_info->GetPreHeader(), loop_header));
+  }
+}
+
+bool IsSubgraphConnected(SuperblockCloner::HBasicBlockSet* work_set, HGraph* graph) {
+  ArenaVector<HBasicBlock*> entry_blocks(
+      graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  // Find subgraph entry blocks.
+  for (uint32_t orig_block_id : work_set->Indexes()) {
+    HBasicBlock* block = graph->GetBlocks()[orig_block_id];
+    for (HBasicBlock* pred : block->GetPredecessors()) {
+      if (!work_set->IsBitSet(pred->GetBlockId())) {
+        entry_blocks.push_back(block);
+        break;
+      }
+    }
+  }
+
+  for (HBasicBlock* entry_block : entry_blocks) {
+    if (work_set->IsBitSet(entry_block->GetBlockId())) {
+      TraverseSubgraphForConnectivity(entry_block, work_set);
+    }
+  }
+
+  // Return whether all the blocks were visited, i.e. no unreachable blocks remain.
+  return work_set->NumSetBits() == 0;
+}
+
+HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2) {
+  if (loop1 == nullptr || loop2 == nullptr) {
+    return nullptr;
+  }
+
+  if (loop1->IsIn(*loop2)) {
+    return loop2;
+  }
+
+  HLoopInformation* current = loop1;
+  while (current != nullptr && !loop2->IsIn(*current)) {
+    current = current->GetPreHeader()->GetLoopInformation();
+  }
+
+  return current;
+}
+
+bool PeelUnrollHelper::IsLoopClonable(HLoopInformation* loop_info) {
+  PeelUnrollHelper helper(loop_info, nullptr, nullptr);
+  return helper.IsLoopClonable();
+}
+
+HBasicBlock* PeelUnrollHelper::DoPeelUnrollImpl(bool to_unroll) {
+  // For now do peeling only for natural loops.
+  DCHECK(!loop_info_->IsIrreducible());
+
+  HBasicBlock* loop_header = loop_info_->GetHeader();
+  HGraph* graph = loop_header->GetGraph();
+  ArenaAllocator allocator(graph->GetAllocator()->GetArenaPool());
+
+  HEdgeSet remap_orig_internal(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_copy_internal(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_incoming(graph->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  CollectRemappingInfoForPeelUnroll(to_unroll,
+                                    loop_info_,
+                                    &remap_orig_internal,
+                                    &remap_copy_internal,
+                                    &remap_incoming);
+
+  cloner_.SetSuccessorRemappingInfo(&remap_orig_internal, &remap_copy_internal, &remap_incoming);
+  cloner_.Run();
+  cloner_.CleanUp();
+
+  return kPeelUnrollPreserveHeader ? loop_header : cloner_.GetBlockCopy(loop_header);
+}
+
+PeelUnrollSimpleHelper::PeelUnrollSimpleHelper(HLoopInformation* info)
+  : bb_map_(std::less<HBasicBlock*>(),
+            info->GetHeader()->GetGraph()->GetAllocator()->Adapter(kArenaAllocSuperblockCloner)),
+    hir_map_(std::less<HInstruction*>(),
+             info->GetHeader()->GetGraph()->GetAllocator()->Adapter(kArenaAllocSuperblockCloner)),
+    helper_(info, &bb_map_, &hir_map_) {}
+
 }  // namespace art
+
+namespace std {
+
+ostream& operator<<(ostream& os, const art::HEdge& e) {
+  e.Dump(os);
+  return os;
+}
+
+}  // namespace std
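
Note: the connectivity check added above runs a DFS that clears work-set bits instead of marking visited nodes, so whatever remains in the set afterwards is exactly the unreachable blocks. The same idea on a plain adjacency-list graph (standalone sketch, not ART code):

    #include <cassert>
    #include <set>
    #include <vector>

    // Clear 'work_set' entries reachable from 'block'; whatever remains afterwards
    // was not reachable from any start block.
    static void Traverse(int block,
                         const std::vector<std::vector<int>>& succ,
                         std::set<int>* work_set) {
      work_set->erase(block);
      for (int s : succ[block]) {
        if (work_set->count(s) != 0) {
          Traverse(s, succ, work_set);
        }
      }
    }

    int main() {
      // 0 -> 1 -> 2; block 3 is disconnected.
      std::vector<std::vector<int>> succ = {{1}, {2}, {}, {}};
      std::set<int> work_set = {1, 2, 3};

      // Entry blocks: members of the set with a predecessor outside it (here: 1).
      Traverse(1, succ, &work_set);

      // Only the unreachable block remains in the set.
      assert(work_set.size() == 1 && work_set.count(3) == 1);
      return 0;
    }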
diff --git a/compiler/optimizing/superblock_cloner.h b/compiler/optimizing/superblock_cloner.h
index 23de692..19c9dd4 100644
--- a/compiler/optimizing/superblock_cloner.h
+++ b/compiler/optimizing/superblock_cloner.h
@@ -152,6 +152,15 @@
   // TODO: Start from small range of graph patterns then extend it.
   bool IsSubgraphClonable() const;
 
+  // Returns whether the selected subgraph satisfies the criteria for fast data flow resolution,
+  // i.e. when an iterative DF algorithm is not required and dominators/instruction inputs can
+  // be trivially adjusted.
+  //
+  // TODO: formally describe the criteria.
+  //
+  // Loop peeling and unrolling satisfy the criteria.
+  bool IsFastCase() const;
+
   // Runs the copy algorithm according to the description.
   void Run();
 
@@ -202,11 +211,17 @@
     return IsInOrigBBSet(block->GetBlockId());
   }
 
+  // Returns the area (the outermost loop) in the graph for which control flow (back edges,
+  // loops, dominators) needs to be adjusted.
+  HLoopInformation* GetRegionToBeAdjusted() const {
+    return outer_loop_;
+  }
+
  private:
   // Fills the 'exits' vector with the subgraph exits.
   void SearchForSubgraphExits(ArenaVector<HBasicBlock*>* exits);
 
-  // Finds and records information about the area in the graph for which control-flow (back edges,
+  // Finds and records information about the area in the graph for which control flow (back edges,
   // loops, dominators) needs to be adjusted.
   void FindAndSetLocalAreaForAdjustments();
 
@@ -217,7 +232,7 @@
   // phis' nor instructions' inputs values are resolved.
   void RemapEdgesSuccessors();
 
-  // Adjusts control-flow (back edges, loops, dominators) for the local area defined by
+  // Adjusts control flow (back edges, loops, dominators) for the local area defined by
   // FindAndSetLocalAreaForAdjustments.
   void AdjustControlFlowInfo();
 
@@ -272,6 +287,9 @@
   // Debug and logging methods.
   //
   void CheckInstructionInputsRemapping(HInstruction* orig_instr);
+  bool CheckRemappingInfoIsValid();
+  void VerifyGraph();
+  void DumpInputSets();
 
   HBasicBlock* GetBlockById(uint32_t block_id) const {
     DCHECK(block_id < graph_->GetBlocks().size());
@@ -295,15 +313,94 @@
   HBasicBlockMap* bb_map_;
   // Correspondence map for instructions: (original HInstruction, copy HInstruction).
   HInstructionMap* hir_map_;
-  // Area in the graph for which control-flow (back edges, loops, dominators) needs to be adjusted.
+  // Area in the graph for which control flow (back edges, loops, dominators) needs to be adjusted.
   HLoopInformation* outer_loop_;
   HBasicBlockSet outer_loop_bb_set_;
 
   ART_FRIEND_TEST(SuperblockClonerTest, AdjustControlFlowInfo);
+  ART_FRIEND_TEST(SuperblockClonerTest, IsGraphConnected);
 
   DISALLOW_COPY_AND_ASSIGN(SuperblockCloner);
 };
 
+// Helper class to perform loop peeling/unrolling.
+//
+// This helper should be used when a correspondence map between original and copied
+// basic blocks/instructions is needed.
+class PeelUnrollHelper : public ValueObject {
+ public:
+  explicit PeelUnrollHelper(HLoopInformation* info,
+                            SuperblockCloner::HBasicBlockMap* bb_map,
+                            SuperblockCloner::HInstructionMap* hir_map) :
+      loop_info_(info),
+      cloner_(info->GetHeader()->GetGraph(), &info->GetBlocks(), bb_map, hir_map) {
+    // For now do peeling/unrolling only for natural loops.
+    DCHECK(!info->IsIrreducible());
+  }
+
+  // Returns whether the loop can be peeled/unrolled (static function).
+  static bool IsLoopClonable(HLoopInformation* loop_info);
+
+  // Returns whether the loop can be peeled/unrolled.
+  bool IsLoopClonable() const { return cloner_.IsSubgraphClonable(); }
+
+  HBasicBlock* DoPeeling() { return DoPeelUnrollImpl(/* to_unroll */ false); }
+  HBasicBlock* DoUnrolling() { return DoPeelUnrollImpl(/* to_unroll */ true); }
+  HLoopInformation* GetRegionToBeAdjusted() const { return cloner_.GetRegionToBeAdjusted(); }
+
+ protected:
+  // Applies loop peeling/unrolling for the loop specified by 'loop_info'.
+  //
+  // Depending on 'to_unroll', either unrolls the loop by 2 or peels one iteration from it.
+  HBasicBlock* DoPeelUnrollImpl(bool to_unroll);
+
+ private:
+  HLoopInformation* loop_info_;
+  SuperblockCloner cloner_;
+
+  DISALLOW_COPY_AND_ASSIGN(PeelUnrollHelper);
+};
+
+// Helper class to perform loop peeling/unrolling.
+//
+// This helper should be used when there is no need to get correspondence information between
+// original and copied basic blocks/instructions.
+class PeelUnrollSimpleHelper : public ValueObject {
+ public:
+  explicit PeelUnrollSimpleHelper(HLoopInformation* info);
+  bool IsLoopClonable() const { return helper_.IsLoopClonable(); }
+  HBasicBlock* DoPeeling() { return helper_.DoPeeling(); }
+  HBasicBlock* DoUnrolling() { return helper_.DoUnrolling(); }
+  HLoopInformation* GetRegionToBeAdjusted() const { return helper_.GetRegionToBeAdjusted(); }
+
+ private:
+  SuperblockCloner::HBasicBlockMap bb_map_;
+  SuperblockCloner::HInstructionMap hir_map_;
+  PeelUnrollHelper helper_;
+
+  DISALLOW_COPY_AND_ASSIGN(PeelUnrollSimpleHelper);
+};
+
+// Collects edge remapping info for loop peeling/unrolling for the loop specified by loop info.
+void CollectRemappingInfoForPeelUnroll(bool to_unroll,
+                                       HLoopInformation* loop_info,
+                                       SuperblockCloner::HEdgeSet* remap_orig_internal,
+                                       SuperblockCloner::HEdgeSet* remap_copy_internal,
+                                       SuperblockCloner::HEdgeSet* remap_incoming);
+
+// Returns whether all blocks from 'work_set' are reachable from the rest of the graph.
+//
+// More precisely, returns whether there is a set 'outer_entries' of basic blocks such that:
+// - each block from 'outer_entries' is not from 'work_set';
+// - each block from 'work_set' is reachable from at least one block from 'outer_entries'.
+//
+// After the function returns, 'work_set' contains only those blocks from the original
+// 'work_set' which are unreachable from the rest of the graph.
+bool IsSubgraphConnected(SuperblockCloner::HBasicBlockSet* work_set, HGraph* graph);
+
+// Returns the closest common ancestor of loop1 and loop2 in the loop tree, or nullptr if it is
+// the whole graph.
+HLoopInformation* FindCommonLoop(HLoopInformation* loop1, HLoopInformation* loop2);
 }  // namespace art
 
 namespace std {
@@ -312,11 +409,12 @@
 struct hash<art::HEdge> {
   size_t operator()(art::HEdge const& x) const noexcept  {
     // Use Cantor pairing function as the hash function.
-    uint32_t a = x.GetFrom();
-    uint32_t b = x.GetTo();
+    size_t a = x.GetFrom();
+    size_t b = x.GetTo();
     return (a + b) * (a + b + 1) / 2 + b;
   }
 };
+ostream& operator<<(ostream& os, const art::HEdge& e);
 
 }  // namespace std
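
Note: the hash above is the Cantor pairing function pi(a, b) = (a + b)(a + b + 1)/2 + b, which is injective on pairs of naturals (modulo size_t overflow), so distinct edges hash to distinct values on small graphs. A quick standalone check (illustrative, not ART code):

    #include <cassert>
    #include <cstddef>
    #include <set>

    static size_t CantorPair(size_t a, size_t b) {
      return (a + b) * (a + b + 1) / 2 + b;
    }

    int main() {
      // pi(2, 3) = 5 * 6 / 2 + 3 = 18.
      assert(CantorPair(2, 3) == 18);

      // Injective on a small range: all pairs map to distinct values.
      std::set<size_t> seen;
      for (size_t a = 0; a < 50; ++a) {
        for (size_t b = 0; b < 50; ++b) {
          assert(seen.insert(CantorPair(a, b)).second);
        }
      }
      return 0;
    }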
 
diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc
index f1b7bff..df2e517 100644
--- a/compiler/optimizing/superblock_cloner_test.cc
+++ b/compiler/optimizing/superblock_cloner_test.cc
@@ -25,54 +25,67 @@
 
 using HBasicBlockMap = SuperblockCloner::HBasicBlockMap;
 using HInstructionMap = SuperblockCloner::HInstructionMap;
+using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
+using HEdgeSet = SuperblockCloner::HEdgeSet;
 
 // This class provides methods and helpers for testing various cloning and copying routines:
 // individual instruction cloning and cloning of the more coarse-grain structures.
 class SuperblockClonerTest : public OptimizingUnitTest {
  public:
-  SuperblockClonerTest()
-      : graph_(CreateGraph()), entry_block_(nullptr), exit_block_(nullptr), parameter_(nullptr) {}
+  SuperblockClonerTest() : graph_(CreateGraph()),
+                           entry_block_(nullptr),
+                           return_block_(nullptr),
+                           exit_block_(nullptr),
+                           parameter_(nullptr) {}
 
-  void CreateBasicLoopControlFlow(/* out */ HBasicBlock** header_p,
-                                  /* out */ HBasicBlock** body_p) {
+  void InitGraph() {
     entry_block_ = new (GetAllocator()) HBasicBlock(graph_);
     graph_->AddBlock(entry_block_);
     graph_->SetEntryBlock(entry_block_);
 
-    HBasicBlock* loop_preheader = new (GetAllocator()) HBasicBlock(graph_);
-    HBasicBlock* loop_header = new (GetAllocator()) HBasicBlock(graph_);
-    HBasicBlock* loop_body = new (GetAllocator()) HBasicBlock(graph_);
-    HBasicBlock* loop_exit = new (GetAllocator()) HBasicBlock(graph_);
-
-    graph_->AddBlock(loop_preheader);
-    graph_->AddBlock(loop_header);
-    graph_->AddBlock(loop_body);
-    graph_->AddBlock(loop_exit);
+    return_block_ = new (GetAllocator()) HBasicBlock(graph_);
+    graph_->AddBlock(return_block_);
 
     exit_block_ = new (GetAllocator()) HBasicBlock(graph_);
     graph_->AddBlock(exit_block_);
     graph_->SetExitBlock(exit_block_);
 
-    entry_block_->AddSuccessor(loop_preheader);
-    loop_preheader->AddSuccessor(loop_header);
-    // Loop exit first to have a proper exit condition/target for HIf.
-    loop_header->AddSuccessor(loop_exit);
-    loop_header->AddSuccessor(loop_body);
-    loop_body->AddSuccessor(loop_header);
-    loop_exit->AddSuccessor(exit_block_);
-
-    *header_p = loop_header;
-    *body_p = loop_body;
+    entry_block_->AddSuccessor(return_block_);
+    return_block_->AddSuccessor(exit_block_);
 
     parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
                                                       dex::TypeIndex(0),
                                                       0,
                                                       DataType::Type::kInt32);
     entry_block_->AddInstruction(parameter_);
-    loop_exit->AddInstruction(new (GetAllocator()) HReturnVoid());
+    return_block_->AddInstruction(new (GetAllocator()) HReturnVoid());
     exit_block_->AddInstruction(new (GetAllocator()) HExit());
   }
 
+  void CreateBasicLoopControlFlow(HBasicBlock* position,
+                                  HBasicBlock* successor,
+                                  /* out */ HBasicBlock** header_p,
+                                  /* out */ HBasicBlock** body_p) {
+    HBasicBlock* loop_preheader = new (GetAllocator()) HBasicBlock(graph_);
+    HBasicBlock* loop_header = new (GetAllocator()) HBasicBlock(graph_);
+    HBasicBlock* loop_body = new (GetAllocator()) HBasicBlock(graph_);
+
+    graph_->AddBlock(loop_preheader);
+    graph_->AddBlock(loop_header);
+    graph_->AddBlock(loop_body);
+
+    position->ReplaceSuccessor(successor, loop_preheader);
+
+    loop_preheader->AddSuccessor(loop_header);
+    // Loop exit first to have a proper exit condition/target for HIf.
+    loop_header->AddSuccessor(successor);
+    loop_header->AddSuccessor(loop_body);
+    loop_body->AddSuccessor(loop_header);
+
+    *header_p = loop_header;
+    *body_p = loop_body;
+  }
+
   void CreateBasicLoopDataFlow(HBasicBlock* loop_header, HBasicBlock* loop_body) {
     uint32_t dex_pc = 0;
 
@@ -84,11 +97,12 @@
     // Header block.
     HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
     HInstruction* suspend_check = new (GetAllocator()) HSuspendCheck();
+    HInstruction* loop_check = new (GetAllocator()) HGreaterThanOrEqual(phi, const_128);
 
     loop_header->AddPhi(phi);
     loop_header->AddInstruction(suspend_check);
-    loop_header->AddInstruction(new (GetAllocator()) HGreaterThanOrEqual(phi, const_128));
-    loop_header->AddInstruction(new (GetAllocator()) HIf(parameter_));
+    loop_header->AddInstruction(loop_check);
+    loop_header->AddInstruction(new (GetAllocator()) HIf(loop_check));
 
     // Loop body block.
     HInstruction* null_check = new (GetAllocator()) HNullCheck(parameter_, dex_pc);
@@ -97,8 +111,8 @@
     HInstruction* array_get =
         new (GetAllocator()) HArrayGet(null_check, bounds_check, DataType::Type::kInt32, dex_pc);
     HInstruction* add =  new (GetAllocator()) HAdd(DataType::Type::kInt32, array_get, const_1);
-    HInstruction* array_set =
-        new (GetAllocator()) HArraySet(null_check, bounds_check, add, DataType::Type::kInt32, dex_pc);
+    HInstruction* array_set = new (GetAllocator()) HArraySet(
+        null_check, bounds_check, add, DataType::Type::kInt32, dex_pc);
     HInstruction* induction_inc = new (GetAllocator()) HAdd(DataType::Type::kInt32, phi, const_1);
 
     loop_body->AddInstruction(null_check);
@@ -153,6 +167,7 @@
   HGraph* graph_;
 
   HBasicBlock* entry_block_;
+  HBasicBlock* return_block_;
   HBasicBlock* exit_block_;
 
   HInstruction* parameter_;
@@ -162,10 +177,11 @@
   HBasicBlock* header = nullptr;
   HBasicBlock* loop_body = nullptr;
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
-  ASSERT_TRUE(CheckGraph());
+  EXPECT_TRUE(CheckGraph());
 
   HSuspendCheck* old_suspend_check = header->GetLoopInformation()->GetSuspendCheck();
   CloneAndReplaceInstructionVisitor visitor(graph_);
@@ -193,7 +209,8 @@
   HBasicBlock* loop_body = nullptr;
   ArenaAllocator* arena = graph_->GetAllocator();
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
   ASSERT_TRUE(CheckGraph());
@@ -272,7 +289,8 @@
   HBasicBlock* loop_body = nullptr;
   ArenaAllocator* arena = graph_->GetAllocator();
 
-  CreateBasicLoopControlFlow(&header, &loop_body);
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
   CreateBasicLoopDataFlow(header, loop_body);
   graph_->BuildDominatorTree();
   ASSERT_TRUE(CheckGraph());
@@ -303,4 +321,487 @@
   EXPECT_TRUE(loop_info->IsBackEdge(*loop_body));
 }
 
+// Tests the IsSubgraphConnected function for the negative case.
+TEST_F(SuperblockClonerTest, IsGraphConnected) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+  ArenaAllocator* arena = graph_->GetAllocator();
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* unreachable_block = new (GetAllocator()) HBasicBlock(graph_);
+  graph_->AddBlock(unreachable_block);
+
+  HBasicBlockSet bb_set(
+      arena, graph_->GetBlocks().size(), false, kArenaAllocSuperblockCloner);
+  bb_set.SetBit(header->GetBlockId());
+  bb_set.SetBit(loop_body->GetBlockId());
+  bb_set.SetBit(unreachable_block->GetBlockId());
+
+  EXPECT_FALSE(IsSubgraphConnected(&bb_set, graph_));
+  EXPECT_EQ(bb_set.NumSetBits(), 1u);
+  EXPECT_TRUE(bb_set.IsBitSet(unreachable_block->GetBlockId()));
+}
+
+// Tests SuperblockCloner for loop peeling case.
+//
+// Control flow of the example (ignoring critical edge splitting).
+//
+//       Before                    After
+//
+//         |B|                      |B|
+//          |                        |
+//          v                        v
+//         |1|                      |1|
+//          |                        |
+//          v                        v
+//         |2|<-\              (6) |2A|
+//         / \  /                   / \
+//        v   v/                   /   v
+//       |4|  |3|                 /   |3A| (7)
+//        |                      /     /
+//        v                     |     v
+//       |E|                     \   |2|<-\
+//                                \ / \   /
+//                                 v   v /
+//                                |4|  |3|
+//                                 |
+//                                 v
+//                                |E|
+TEST_F(SuperblockClonerTest, LoopPeeling) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(
+      std::less<HInstruction*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+  PeelUnrollHelper helper(loop_info, &bb_map, &hir_map);
+  EXPECT_TRUE(helper.IsLoopClonable());
+  HBasicBlock* new_header = helper.DoPeeling();
+  HLoopInformation* new_loop_info = new_header->GetLoopInformation();
+
+  EXPECT_TRUE(CheckGraph());
+
+  // Check loop body successors.
+  EXPECT_EQ(loop_body->GetSingleSuccessor(), header);
+  EXPECT_EQ(bb_map.Get(loop_body)->GetSingleSuccessor(), header);
+
+  // Check loop structure.
+  EXPECT_EQ(header, new_header);
+  EXPECT_EQ(new_loop_info->GetHeader(), header);
+  EXPECT_EQ(new_loop_info->GetBackEdges().size(), 1u);
+  EXPECT_EQ(new_loop_info->GetBackEdges()[0], loop_body);
+}
+
+// Tests SuperblockCloner for loop unrolling case.
+//
+// Control flow of the example (ignoring critical edge splitting).
+//
+//       Before                    After
+//
+//         |B|                      |B|
+//          |                        |
+//          v                        v
+//         |1|                      |1|
+//          |                        |
+//          v                        v
+//         |2|<-\               (6) |2A|<-\
+//         / \  /                   / \    \
+//        v   v/                   /   v    \
+//       |4|  |3|                 /(7)|3A|   |
+//        |                      /     /    /
+//        v                     |     v    /
+//       |E|                     \   |2|  /
+//                                \ / \  /
+//                                 v   v/
+//                                |4| |3|
+//                                 |
+//                                 v
+//                                |E|
+TEST_F(SuperblockClonerTest, LoopUnrolling) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(
+      std::less<HInstruction*>(), graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+  PeelUnrollHelper helper(loop_info, &bb_map, &hir_map);
+  EXPECT_TRUE(helper.IsLoopClonable());
+  HBasicBlock* new_header = helper.DoUnrolling();
+
+  EXPECT_TRUE(CheckGraph());
+
+  // Check loop body successors.
+  EXPECT_EQ(loop_body->GetSingleSuccessor(), bb_map.Get(header));
+  EXPECT_EQ(bb_map.Get(loop_body)->GetSingleSuccessor(), header);
+
+  // Check loop structure.
+  EXPECT_EQ(header, new_header);
+  EXPECT_EQ(loop_info, new_header->GetLoopInformation());
+  EXPECT_EQ(loop_info->GetHeader(), new_header);
+  EXPECT_EQ(loop_info->GetBackEdges().size(), 1u);
+  EXPECT_EQ(loop_info->GetBackEdges()[0], bb_map.Get(loop_body));
+}
+
+// Checks that loop peeling works correctly for a loop with multiple back edges. Tests that after
+// the transformation the loop has a single preheader.
+TEST_F(SuperblockClonerTest, LoopPeelingMultipleBackEdges) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+
+  // Transform a basic loop to have multiple back edges.
+  HBasicBlock* latch = header->GetSuccessors()[1];
+  HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_);
+  HBasicBlock* temp1 = new (GetAllocator()) HBasicBlock(graph_);
+  graph_->AddBlock(if_block);
+  graph_->AddBlock(temp1);
+  header->ReplaceSuccessor(latch, if_block);
+  if_block->AddSuccessor(latch);
+  if_block->AddSuccessor(temp1);
+  temp1->AddSuccessor(header);
+
+  if_block->AddInstruction(new (GetAllocator()) HIf(parameter_));
+
+  HInstructionIterator it(header->GetPhis());
+  DCHECK(!it.Done());
+  HPhi* loop_phi = it.Current()->AsPhi();
+  HInstruction* temp_add = new (GetAllocator()) HAdd(DataType::Type::kInt32,
+                                                     loop_phi,
+                                                     graph_->GetIntConstant(2));
+  temp1->AddInstruction(temp_add);
+  temp1->AddInstruction(new (GetAllocator()) HGoto());
+  loop_phi->AddInput(temp_add);
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+  PeelUnrollSimpleHelper helper(loop_info);
+  HBasicBlock* new_header = helper.DoPeeling();
+  EXPECT_EQ(header, new_header);
+
+  EXPECT_TRUE(CheckGraph());
+  EXPECT_EQ(header->GetPredecessors().size(), 3u);
+}
+
+static void CheckLoopStructureForLoopPeelingNested(HBasicBlock* loop1_header,
+                                                   HBasicBlock* loop2_header,
+                                                   HBasicBlock* loop3_header) {
+  EXPECT_EQ(loop1_header->GetLoopInformation()->GetHeader(), loop1_header);
+  EXPECT_EQ(loop2_header->GetLoopInformation()->GetHeader(), loop2_header);
+  EXPECT_EQ(loop3_header->GetLoopInformation()->GetHeader(), loop3_header);
+  EXPECT_EQ(loop1_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation(), nullptr);
+  EXPECT_EQ(loop2_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation(), nullptr);
+  EXPECT_EQ(loop3_header->GetLoopInformation()->GetPreHeader()->GetLoopInformation()->GetHeader(),
+            loop2_header);
+}
+
+TEST_F(SuperblockClonerTest, LoopPeelingNested) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops
+  //   Headers:  1    2 3
+  //             [ ], [ [ ] ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+
+  CreateBasicLoopControlFlow(header, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop2_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HLoopInformation* loop2_info_before = loop2_header->GetLoopInformation();
+  HLoopInformation* loop3_info_before = loop3_header->GetLoopInformation();
+
+  // Check nested loops structure.
+  CheckLoopStructureForLoopPeelingNested(loop1_header, loop2_header, loop3_header);
+  PeelUnrollSimpleHelper helper(loop1_header->GetLoopInformation());
+  helper.DoPeeling();
+  // Check that nested loops structure has not changed after the transformation.
+  CheckLoopStructureForLoopPeelingNested(loop1_header, loop2_header, loop3_header);
+
+  // Test that the loop info is preserved.
+  EXPECT_EQ(loop2_info_before, loop2_header->GetLoopInformation());
+  EXPECT_EQ(loop3_info_before, loop3_header->GetLoopInformation());
+
+  EXPECT_EQ(loop3_info_before->GetPreHeader()->GetLoopInformation(), loop2_info_before);
+  EXPECT_EQ(loop2_info_before->GetPreHeader()->GetLoopInformation(), nullptr);
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), nullptr);
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+// Checks that the loop population is correctly propagated after an inner loop is peeled.
+TEST_F(SuperblockClonerTest, OuterLoopPopulationAfterInnerPeeled) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops
+  //   Headers:  1 2 3        4
+  //             [ [ [ ] ] ], [ ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop2_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+
+  CreateBasicLoopControlFlow(loop1_header, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop4_header = header;
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  PeelUnrollSimpleHelper helper(loop3_header->GetLoopInformation());
+  helper.DoPeeling();
+  HLoopInformation* loop1 = loop1_header->GetLoopInformation();
+  HLoopInformation* loop2 = loop2_header->GetLoopInformation();
+  HLoopInformation* loop3 = loop3_header->GetLoopInformation();
+  HLoopInformation* loop4 = loop4_header->GetLoopInformation();
+
+  EXPECT_TRUE(loop1->Contains(*loop2_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header->GetLoopInformation()->GetPreHeader()));
+
+  // Check that loop4 info has not been touched after local run of AnalyzeLoops.
+  EXPECT_EQ(loop4, loop4_header->GetLoopInformation());
+
+  EXPECT_TRUE(loop1->IsIn(*loop1));
+  EXPECT_TRUE(loop2->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop2));
+  EXPECT_TRUE(!loop4->IsIn(*loop1));
+
+  EXPECT_EQ(loop4->GetPreHeader()->GetLoopInformation(), nullptr);
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), loop2);
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+// Checks the case when an inner loop has an exit not to its immediate outer loop but to some
+// other loop in the hierarchy. Loop population information must be valid after loop peeling.
+TEST_F(SuperblockClonerTest, NestedCaseExitToOutermost) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops then peel loop3.
+  //   Headers:  1 2 3
+  //             [ [ [ ] ] ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+  HBasicBlock* loop_body1 = loop_body;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+  HBasicBlock* loop_body3 = loop_body;
+
+  // Change loop3: insert an exit which leads to loop1.
+  HBasicBlock* loop3_extra_if_block = new (GetAllocator()) HBasicBlock(graph_);
+  graph_->AddBlock(loop3_extra_if_block);
+  loop3_extra_if_block->AddInstruction(new (GetAllocator()) HIf(parameter_));
+
+  loop3_header->ReplaceSuccessor(loop_body3, loop3_extra_if_block);
+  loop3_extra_if_block->AddSuccessor(loop_body1);  // Long exit.
+  loop3_extra_if_block->AddSuccessor(loop_body3);
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HBasicBlock* loop3_long_exit = loop3_extra_if_block->GetSuccessors()[0];
+  EXPECT_TRUE(loop1_header->GetLoopInformation()->Contains(*loop3_long_exit));
+
+  PeelUnrollSimpleHelper helper(loop3_header->GetLoopInformation());
+  helper.DoPeeling();
+
+  HLoopInformation* loop1 = loop1_header->GetLoopInformation();
+  // Check that after the transformation the local area for CF adjustments has been chosen
+  // correctly and loop population has been updated.
+  loop3_long_exit = loop3_extra_if_block->GetSuccessors()[0];
+  EXPECT_TRUE(loop1->Contains(*loop3_long_exit));
+
+  EXPECT_EQ(helper.GetRegionToBeAdjusted(), loop1);
+
+  EXPECT_TRUE(loop1->Contains(*loop3_header));
+  EXPECT_TRUE(loop1->Contains(*loop3_header->GetLoopInformation()->GetPreHeader()));
+
+  EXPECT_TRUE(CheckGraph());
+}
+
+TEST_F(SuperblockClonerTest, FastCaseCheck) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+  ArenaAllocator* arena = graph_->GetAllocator();
+
+  InitGraph();
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  graph_->BuildDominatorTree();
+
+  HLoopInformation* loop_info = header->GetLoopInformation();
+
+  ArenaBitVector orig_bb_set(
+      arena, graph_->GetBlocks().size(), false, kArenaAllocSuperblockCloner);
+  orig_bb_set.Union(&loop_info->GetBlocks());
+
+  HEdgeSet remap_orig_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_copy_internal(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+  HEdgeSet remap_incoming(graph_->GetAllocator()->Adapter(kArenaAllocSuperblockCloner));
+
+  CollectRemappingInfoForPeelUnroll(true,
+                                    loop_info,
+                                    &remap_orig_internal,
+                                    &remap_copy_internal,
+                                    &remap_incoming);
+
+  // Insert some extra nodes and edges.
+  HBasicBlock* preheader = loop_info->GetPreHeader();
+  orig_bb_set.SetBit(preheader->GetBlockId());
+
+  // Adjust incoming edges.
+  remap_incoming.Clear();
+  remap_incoming.Insert(HEdge(preheader->GetSinglePredecessor(), preheader));
+
+  HBasicBlockMap bb_map(std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  HInstructionMap hir_map(std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+
+  SuperblockCloner cloner(graph_,
+                          &orig_bb_set,
+                          &bb_map,
+                          &hir_map);
+  cloner.SetSuccessorRemappingInfo(&remap_orig_internal, &remap_copy_internal, &remap_incoming);
+
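+  // The block set now includes the preheader and the incoming edge was remapped, so the
+  // region is no longer a simple peel/unroll shape; the cloner should reject the fast case.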
+  EXPECT_FALSE(cloner.IsFastCase());
+}
+
+// Helper for FindCommonLoop which also checks that FindCommonLoop is symmetric.
+static HLoopInformation* FindCommonLoopCheck(HLoopInformation* loop1, HLoopInformation* loop2) {
+  HLoopInformation* common_loop12 = FindCommonLoop(loop1, loop2);
+  HLoopInformation* common_loop21 = FindCommonLoop(loop2, loop1);
+  EXPECT_EQ(common_loop21, common_loop12);
+  return common_loop12;
+}
+
+// Tests FindCommonLoop function on a loop nest.
+TEST_F(SuperblockClonerTest, FindCommonLoop) {
+  HBasicBlock* header = nullptr;
+  HBasicBlock* loop_body = nullptr;
+
+  InitGraph();
+
+  // Create the following nested structure of loops:
+  //   Headers:  1 2 3      4      5
+  //             [ [ [ ] ], [ ] ], [ ]
+  CreateBasicLoopControlFlow(entry_block_, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop1_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop2_header = header;
+
+  CreateBasicLoopControlFlow(header, header->GetSuccessors()[1], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop3_header = header;
+
+  CreateBasicLoopControlFlow(loop2_header, loop2_header->GetSuccessors()[0], &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop4_header = header;
+
+  CreateBasicLoopControlFlow(loop1_header, return_block_, &header, &loop_body);
+  CreateBasicLoopDataFlow(header, loop_body);
+  HBasicBlock* loop5_header = header;
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  HLoopInformation* loop1 = loop1_header->GetLoopInformation();
+  HLoopInformation* loop2 = loop2_header->GetLoopInformation();
+  HLoopInformation* loop3 = loop3_header->GetLoopInformation();
+  HLoopInformation* loop4 = loop4_header->GetLoopInformation();
+  HLoopInformation* loop5 = loop5_header->GetLoopInformation();
+
+  EXPECT_TRUE(loop1->IsIn(*loop1));
+  EXPECT_TRUE(loop2->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop1));
+  EXPECT_TRUE(loop3->IsIn(*loop2));
+  EXPECT_TRUE(loop4->IsIn(*loop1));
+
+  EXPECT_FALSE(loop5->IsIn(*loop1));
+  EXPECT_FALSE(loop4->IsIn(*loop2));
+  EXPECT_FALSE(loop4->IsIn(*loop3));
+
+  EXPECT_EQ(loop1->GetPreHeader()->GetLoopInformation(), nullptr);
+  EXPECT_EQ(loop4->GetPreHeader()->GetLoopInformation(), loop1);
+
+  EXPECT_EQ(FindCommonLoopCheck(nullptr, nullptr), nullptr);
+  EXPECT_EQ(FindCommonLoopCheck(loop2, nullptr), nullptr);
+
+  EXPECT_EQ(FindCommonLoopCheck(loop1, loop1), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop1, loop2), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop1, loop3), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop1, loop4), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop1, loop5), nullptr);
+
+  EXPECT_EQ(FindCommonLoopCheck(loop2, loop3), loop2);
+  EXPECT_EQ(FindCommonLoopCheck(loop2, loop4), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop2, loop5), nullptr);
+
+  EXPECT_EQ(FindCommonLoopCheck(loop3, loop4), loop1);
+  EXPECT_EQ(FindCommonLoopCheck(loop3, loop5), nullptr);
+
+  EXPECT_EQ(FindCommonLoopCheck(loop4, loop5), nullptr);
+
+  EXPECT_EQ(FindCommonLoopCheck(loop5, loop5), loop5);
+}
+
 }  // namespace art
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 2218ef9..b2ad490 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -2793,6 +2793,26 @@
   DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
+void MipsAssembler::PcntB(VectorRegister wd, VectorRegister ws) {
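+  // Note: MSA 2R encoding, derived from the emit call below; 0xc1 is the pcnt operation,
+  // the second argument selects the data format (0x0 = byte elements) and 0x1e is the
+  // MSA 2R minor opcode.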
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntH(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntW(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntD(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
 void MipsAssembler::ReplicateFPToVectorRegister(VectorRegister dst,
                                                 FRegister src,
                                                 bool is_double) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 7de8e2e..c6ce62b 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -756,6 +756,11 @@
   void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
 
+  void PcntB(VectorRegister wd, VectorRegister ws);
+  void PcntH(VectorRegister wd, VectorRegister ws);
+  void PcntW(VectorRegister wd, VectorRegister ws);
+  void PcntD(VectorRegister wd, VectorRegister ws);
+
   // Helper for replicating floating point value in all destination elements.
   void ReplicateFPToVectorRegister(VectorRegister dst, FRegister src, bool is_double);
 
diff --git a/compiler/utils/mips/assembler_mips32r6_test.cc b/compiler/utils/mips/assembler_mips32r6_test.cc
index 937ee25..691c33f 100644
--- a/compiler/utils/mips/assembler_mips32r6_test.cc
+++ b/compiler/utils/mips/assembler_mips32r6_test.cc
@@ -2277,6 +2277,22 @@
   DriverStr(RepeatVR(&mips::MipsAssembler::FillW, "fill.w ${reg1}, ${reg2}"), "fill.w");
 }
 
+TEST_F(AssemblerMIPS32r6Test, PcntB) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntH) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntW) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntD) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
 TEST_F(AssemblerMIPS32r6Test, LdiB) {
   DriverStr(RepeatVIb(&mips::MipsAssembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
 }
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index e1b0e75..5a817fa 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -2279,6 +2279,26 @@
   EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15);
 }
 
+void Mips64Assembler::PcntB(VectorRegister wd, VectorRegister ws) {
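+  // Same MSA 2R encoding as in the 32-bit assembler: operation 0xc1 (pcnt), data format
+  // in the second argument, minor opcode 0x1e.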
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntH(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntW(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntD(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e);
+}
+
 void Mips64Assembler::ReplicateFPToVectorRegister(VectorRegister dst,
                                                   FpuRegister src,
                                                   bool is_double) {
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 7a61f39..542dbaf 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -863,6 +863,11 @@
   void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
 
+  void PcntB(VectorRegister wd, VectorRegister ws);
+  void PcntH(VectorRegister wd, VectorRegister ws);
+  void PcntW(VectorRegister wd, VectorRegister ws);
+  void PcntD(VectorRegister wd, VectorRegister ws);
+
   // Helper for replicating floating point value in all destination elements.
   void ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double);
 
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index b0e1d91..fb5f12b 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -3529,6 +3529,22 @@
   DriverStr(RepeatVR(&mips64::Mips64Assembler::FillD, "fill.d ${reg1}, ${reg2}"), "fill.d");
 }
 
+TEST_F(AssemblerMIPS64Test, PcntB) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntH) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntW) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntD) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
 TEST_F(AssemblerMIPS64Test, LdiB) {
   DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
 }
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index ea160c8..42c2541 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -913,6 +913,78 @@
 }
 
 
+void X86Assembler::paddusb(XmmRegister dst, XmmRegister src) {
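+  // SSE2 packed saturating arithmetic: the 0x66 prefix selects the XMM form and the
+  // second opcode byte picks the operation (e.g. 0xDC = paddusb, 0xED = paddsw).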
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDC);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::paddsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xEC);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::paddusw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xDD);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::paddsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xED);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::psubusb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xD8);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::psubsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xE8);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::psubusw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xD9);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::psubsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0xE9);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
 void X86Assembler::cvtsi2ss(XmmRegister dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index a085677..22eaedc 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -449,6 +449,15 @@
   void paddq(XmmRegister dst, XmmRegister src);
   void psubq(XmmRegister dst, XmmRegister src);
 
+  void paddusb(XmmRegister dst, XmmRegister src);
+  void paddsb(XmmRegister dst, XmmRegister src);
+  void paddusw(XmmRegister dst, XmmRegister src);
+  void paddsw(XmmRegister dst, XmmRegister src);
+  void psubusb(XmmRegister dst, XmmRegister src);
+  void psubsb(XmmRegister dst, XmmRegister src);
+  void psubusw(XmmRegister dst, XmmRegister src);
+  void psubsw(XmmRegister dst, XmmRegister src);
+
   void cvtsi2ss(XmmRegister dst, Register src);
   void cvtsi2sd(XmmRegister dst, Register src);
 
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 2fd1b27..8f72db7 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -600,6 +600,38 @@
   DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86Test, PAddUSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
+}
+
+TEST_F(AssemblerX86Test, PAddSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddsb, "paddsb %{reg2}, %{reg1}"), "paddsb");
+}
+
+TEST_F(AssemblerX86Test, PAddUSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddusw, "paddusw %{reg2}, %{reg1}"), "paddusw");
+}
+
+TEST_F(AssemblerX86Test, PAddSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::paddsw, "paddsw %{reg2}, %{reg1}"), "paddsw");
+}
+
+TEST_F(AssemblerX86Test, PSubUSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubusb, "psubusb %{reg2}, %{reg1}"), "psubusb");
+}
+
+TEST_F(AssemblerX86Test, PSubSB) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubsb, "psubsb %{reg2}, %{reg1}"), "psubsb");
+}
+
+TEST_F(AssemblerX86Test, PSubUSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubusw, "psubusw %{reg2}, %{reg1}"), "psubusw");
+}
+
+TEST_F(AssemblerX86Test, PSubSW) {
+  DriverStr(RepeatFF(&x86::X86Assembler::psubsw, "psubsw %{reg2}, %{reg1}"), "psubsw");
+}
+
 TEST_F(AssemblerX86Test, XorPD) {
   DriverStr(RepeatFF(&x86::X86Assembler::xorpd, "xorpd %{reg2}, %{reg1}"), "xorpd");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index ff5a357..c6e16e7 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1011,6 +1011,86 @@
 }
 
 
+void X86_64Assembler::paddusb(XmmRegister dst, XmmRegister src) {
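+  // Same SSE2 opcodes as on x86; EmitOptionalRex32() adds a REX prefix when either
+  // register is xmm8-xmm15.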
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xDC);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::paddsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xEC);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::paddusw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xDD);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::paddsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xED);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::psubusb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xD8);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::psubsb(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xE8);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::psubusw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xD9);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
+void X86_64Assembler::psubsw(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xE9);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
+}
+
+
 void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) {
   cvtsi2ss(dst, src, false);
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 7a5fdb5..ab761fb 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -485,6 +485,15 @@
   void paddq(XmmRegister dst, XmmRegister src);
   void psubq(XmmRegister dst, XmmRegister src);
 
+  void paddusb(XmmRegister dst, XmmRegister src);
+  void paddsb(XmmRegister dst, XmmRegister src);
+  void paddusw(XmmRegister dst, XmmRegister src);
+  void paddsw(XmmRegister dst, XmmRegister src);
+  void psubusb(XmmRegister dst, XmmRegister src);
+  void psubsb(XmmRegister dst, XmmRegister src);
+  void psubusw(XmmRegister dst, XmmRegister src);
+  void psubsw(XmmRegister dst, XmmRegister src);
+
   void cvtsi2ss(XmmRegister dst, CpuRegister src);  // Note: this is the r/m32 version.
   void cvtsi2ss(XmmRegister dst, CpuRegister src, bool is64bit);
   void cvtsi2ss(XmmRegister dst, const Address& src, bool is64bit);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 6b1e53c..104e215 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1282,6 +1282,38 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86_64Test, Paddusb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
+}
+
+TEST_F(AssemblerX86_64Test, Paddsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddsb, "paddsb %{reg2}, %{reg1}"), "paddsb");
+}
+
+TEST_F(AssemblerX86_64Test, Paddusw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusw, "paddusw %{reg2}, %{reg1}"), "paddusw");
+}
+
+TEST_F(AssemblerX86_64Test, Paddsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddsw, "paddsw %{reg2}, %{reg1}"), "paddsw");
+}
+
+TEST_F(AssemblerX86_64Test, Psubusb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubusb, "psubusb %{reg2}, %{reg1}"), "psubusb");
+}
+
+TEST_F(AssemblerX86_64Test, Psubsb) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubsb, "psubsb %{reg2}, %{reg1}"), "psubsb");
+}
+
+TEST_F(AssemblerX86_64Test, Psubusw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubusw, "psubusw %{reg2}, %{reg1}"), "psubusw");
+}
+
+TEST_F(AssemblerX86_64Test, Psubsw) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubsw, "psubsw %{reg2}, %{reg1}"), "psubsw");
+}
+
 TEST_F(AssemblerX86_64Test, Cvtsi2ss) {
   DriverStr(RepeatFr(&x86_64::X86_64Assembler::cvtsi2ss, "cvtsi2ss %{reg2}, %{reg1}"), "cvtsi2ss");
 }
diff --git a/dex2oat/Android.bp b/dex2oat/Android.bp
index b67898d..b158231 100644
--- a/dex2oat/Android.bp
+++ b/dex2oat/Android.bp
@@ -139,7 +139,14 @@
             "-Wno-frame-larger-than=",
             "-DART_PGO_INSTRUMENTATION",
         ],
-    }
+    },
+    target: {
+        android: {
+            lto: {
+                 thin: true,
+            },
+        },
+    },
 }
 
 art_cc_binary {
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 73afbad..6eeec4e 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -2069,11 +2069,9 @@
         std::unique_ptr<linker::OatWriter>& oat_writer = oat_writers_[i];
 
         oat_writer->PrepareLayout(&patcher);
-
-        size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
-        size_t text_size = oat_writer->GetOatSize() - rodata_size;
-        elf_writer->PrepareDynamicSection(rodata_size,
-                                          text_size,
+        elf_writer->PrepareDynamicSection(oat_writer->GetOatHeader().GetExecutableOffset(),
+                                          oat_writer->GetCodeSize(),
+                                          oat_writer->GetDataBimgRelRoSize(),
                                           oat_writer->GetBssSize(),
                                           oat_writer->GetBssMethodsOffset(),
                                           oat_writer->GetBssRootsOffset(),
@@ -2123,6 +2121,16 @@
         }
         elf_writer->EndText(text);
 
+        if (oat_writer->GetDataBimgRelRoSize() != 0u) {
+          linker::OutputStream* data_bimg_rel_ro = elf_writer->StartDataBimgRelRo();
+          if (!oat_writer->WriteDataBimgRelRo(data_bimg_rel_ro)) {
+            LOG(ERROR) << "Failed to write .data.bimg.rel.ro section to the ELF file "
+                << oat_file->GetPath();
+            return false;
+          }
+          elf_writer->EndDataBimgRelRo(data_bimg_rel_ro);
+        }
+
         if (!oat_writer->WriteHeader(elf_writer->GetStream(),
                                      image_file_location_oat_checksum_,
                                      image_file_location_oat_data_begin_,
diff --git a/dex2oat/dex2oat_image_test.cc b/dex2oat/dex2oat_image_test.cc
index 49b84bb..d895282 100644
--- a/dex2oat/dex2oat_image_test.cc
+++ b/dex2oat/dex2oat_image_test.cc
@@ -129,12 +129,15 @@
     std::string art_file = scratch.GetFilename() + ".art";
     std::string oat_file = scratch.GetFilename() + ".oat";
     std::string vdex_file = scratch.GetFilename() + ".vdex";
-    ret.art_size = OS::GetFileSizeBytes(art_file.c_str());
-    ret.oat_size = OS::GetFileSizeBytes(oat_file.c_str());
-    ret.vdex_size = OS::GetFileSizeBytes(vdex_file.c_str());
-    CHECK_GT(ret.art_size, 0u) << art_file;
-    CHECK_GT(ret.oat_size, 0u) << oat_file;
-    CHECK_GT(ret.vdex_size, 0u) << vdex_file;
+    int64_t art_size = OS::GetFileSizeBytes(art_file.c_str());
+    int64_t oat_size = OS::GetFileSizeBytes(oat_file.c_str());
+    int64_t vdex_size = OS::GetFileSizeBytes(vdex_file.c_str());
+    CHECK_GT(art_size, 0u) << art_file;
+    CHECK_GT(oat_size, 0u) << oat_file;
+    CHECK_GT(vdex_size, 0u) << vdex_file;
+    ret.art_size = art_size;
+    ret.oat_size = oat_size;
+    ret.vdex_size = vdex_size;
     scratch.Close();
     // Clear image files since we compile the image multiple times and don't want to leave any
     // artifacts behind.
diff --git a/dex2oat/linker/elf_writer.h b/dex2oat/linker/elf_writer.h
index bcf2cd7..cd8cf4c 100644
--- a/dex2oat/linker/elf_writer.h
+++ b/dex2oat/linker/elf_writer.h
@@ -63,6 +63,7 @@
   // This method must be called before calling GetLoadedSize().
   virtual void PrepareDynamicSection(size_t rodata_size,
                                      size_t text_size,
+                                     size_t data_bimg_rel_ro_size,
                                      size_t bss_size,
                                      size_t bss_methods_offset,
                                      size_t bss_roots_offset,
@@ -72,6 +73,8 @@
   virtual void EndRoData(OutputStream* rodata) = 0;
   virtual OutputStream* StartText() = 0;
   virtual void EndText(OutputStream* text) = 0;
+  virtual OutputStream* StartDataBimgRelRo() = 0;
+  virtual void EndDataBimgRelRo(OutputStream* data_bimg_rel_ro) = 0;
   virtual void WriteDynamicSection() = 0;
   virtual void WriteDebugInfo(const debug::DebugInfo& debug_info) = 0;
   virtual bool End() = 0;
diff --git a/dex2oat/linker/elf_writer_quick.cc b/dex2oat/linker/elf_writer_quick.cc
index 07b02f1..4ab2012 100644
--- a/dex2oat/linker/elf_writer_quick.cc
+++ b/dex2oat/linker/elf_writer_quick.cc
@@ -105,6 +105,7 @@
   void Start() OVERRIDE;
   void PrepareDynamicSection(size_t rodata_size,
                              size_t text_size,
+                             size_t data_bimg_rel_ro_size,
                              size_t bss_size,
                              size_t bss_methods_offset,
                              size_t bss_roots_offset,
@@ -114,6 +115,8 @@
   void EndRoData(OutputStream* rodata) OVERRIDE;
   OutputStream* StartText() OVERRIDE;
   void EndText(OutputStream* text) OVERRIDE;
+  OutputStream* StartDataBimgRelRo() OVERRIDE;
+  void EndDataBimgRelRo(OutputStream* data_bimg_rel_ro) OVERRIDE;
   void WriteDynamicSection() OVERRIDE;
   void WriteDebugInfo(const debug::DebugInfo& debug_info) OVERRIDE;
   bool End() OVERRIDE;
@@ -131,6 +134,7 @@
   File* const elf_file_;
   size_t rodata_size_;
   size_t text_size_;
+  size_t data_bimg_rel_ro_size_;
   size_t bss_size_;
   size_t dex_section_size_;
   std::unique_ptr<BufferedOutputStream> output_stream_;
@@ -171,6 +175,7 @@
       elf_file_(elf_file),
       rodata_size_(0u),
       text_size_(0u),
+      data_bimg_rel_ro_size_(0u),
       bss_size_(0u),
       dex_section_size_(0u),
       output_stream_(
@@ -192,6 +197,7 @@
 template <typename ElfTypes>
 void ElfWriterQuick<ElfTypes>::PrepareDynamicSection(size_t rodata_size,
                                                      size_t text_size,
+                                                     size_t data_bimg_rel_ro_size,
                                                      size_t bss_size,
                                                      size_t bss_methods_offset,
                                                      size_t bss_roots_offset,
@@ -200,6 +206,8 @@
   rodata_size_ = rodata_size;
   DCHECK_EQ(text_size_, 0u);
   text_size_ = text_size;
+  DCHECK_EQ(data_bimg_rel_ro_size_, 0u);
+  data_bimg_rel_ro_size_ = data_bimg_rel_ro_size;
   DCHECK_EQ(bss_size_, 0u);
   bss_size_ = bss_size;
   DCHECK_EQ(dex_section_size_, 0u);
@@ -207,6 +215,7 @@
   builder_->PrepareDynamicSection(elf_file_->GetPath(),
                                   rodata_size_,
                                   text_size_,
+                                  data_bimg_rel_ro_size_,
                                   bss_size_,
                                   bss_methods_offset,
                                   bss_roots_offset,
@@ -240,6 +249,19 @@
 }
 
 template <typename ElfTypes>
+OutputStream* ElfWriterQuick<ElfTypes>::StartDataBimgRelRo() {
+  auto* data_bimg_rel_ro = builder_->GetDataBimgRelRo();
+  data_bimg_rel_ro->Start();
+  return data_bimg_rel_ro;
+}
+
+template <typename ElfTypes>
+void ElfWriterQuick<ElfTypes>::EndDataBimgRelRo(OutputStream* data_bimg_rel_ro) {
+  CHECK_EQ(builder_->GetDataBimgRelRo(), data_bimg_rel_ro);
+  builder_->GetDataBimgRelRo()->End();
+}
+
+template <typename ElfTypes>
 void ElfWriterQuick<ElfTypes>::WriteDynamicSection() {
   if (builder_->GetIsa() == InstructionSet::kMips ||
       builder_->GetIsa() == InstructionSet::kMips64) {
diff --git a/dex2oat/linker/image_test.h b/dex2oat/linker/image_test.h
index 319c5fb..7449191 100644
--- a/dex2oat/linker/image_test.h
+++ b/dex2oat/linker/image_test.h
@@ -313,10 +313,9 @@
         oat_writer->WriteChecksumsAndVdexHeader(vdex_out.get());
 
         oat_writer->PrepareLayout(&patcher);
-        size_t rodata_size = oat_writer->GetOatHeader().GetExecutableOffset();
-        size_t text_size = oat_writer->GetOatSize() - rodata_size;
-        elf_writer->PrepareDynamicSection(rodata_size,
-                                          text_size,
+        elf_writer->PrepareDynamicSection(oat_writer->GetOatHeader().GetExecutableOffset(),
+                                          oat_writer->GetCodeSize(),
+                                          oat_writer->GetDataBimgRelRoSize(),
                                           oat_writer->GetBssSize(),
                                           oat_writer->GetBssMethodsOffset(),
                                           oat_writer->GetBssRootsOffset(),
@@ -336,6 +335,13 @@
         ASSERT_TRUE(text_ok);
         elf_writer->EndText(text);
 
+        if (oat_writer->GetDataBimgRelRoSize() != 0u) {
+          OutputStream* data_bimg_rel_ro = elf_writer->StartDataBimgRelRo();
+          bool data_bimg_rel_ro_ok = oat_writer->WriteDataBimgRelRo(data_bimg_rel_ro);
+          ASSERT_TRUE(data_bimg_rel_ro_ok);
+          elf_writer->EndDataBimgRelRo(data_bimg_rel_ro);
+        }
+
         bool header_ok = oat_writer->WriteHeader(elf_writer->GetStream(), 0u, 0u, 0u);
         ASSERT_TRUE(header_ok);
 
diff --git a/dex2oat/linker/oat_writer.cc b/dex2oat/linker/oat_writer.cc
index c72beea..a7d1ee0 100644
--- a/dex2oat/linker/oat_writer.cc
+++ b/dex2oat/linker/oat_writer.cc
@@ -375,11 +375,15 @@
     vdex_dex_shared_data_offset_(0u),
     vdex_verifier_deps_offset_(0u),
     vdex_quickening_info_offset_(0u),
+    code_size_(0u),
     oat_size_(0u),
+    data_bimg_rel_ro_start_(0u),
+    data_bimg_rel_ro_size_(0u),
     bss_start_(0u),
     bss_size_(0u),
     bss_methods_offset_(0u),
     bss_roots_offset_(0u),
+    data_bimg_rel_ro_entries_(),
     bss_method_entry_references_(),
     bss_method_entries_(),
     bss_type_entries_(),
@@ -409,6 +413,8 @@
     size_method_header_(0),
     size_code_(0),
     size_code_alignment_(0),
+    size_data_bimg_rel_ro_(0),
+    size_data_bimg_rel_ro_alignment_(0),
     size_relative_call_thunks_(0),
     size_misc_thunks_(0),
     size_vmap_table_(0),
@@ -737,8 +743,13 @@
   {
     TimingLogger::ScopedTiming split("InitOatCodeDexFiles", timings_);
     offset = InitOatCodeDexFiles(offset);
+    code_size_ = offset - GetOatHeader().GetExecutableOffset();
   }
-  oat_size_ = offset;
+  {
+    TimingLogger::ScopedTiming split("InitDataBimgRelRoLayout", timings_);
+    offset = InitDataBimgRelRoLayout(offset);
+  }
+  oat_size_ = offset;  // .bss does not count towards oat_size_.
   bss_start_ = (bss_size_ != 0u) ? RoundUp(oat_size_, kPageSize) : 0u;
 
   CHECK_EQ(dex_files_->size(), oat_dex_files_.size());
@@ -845,7 +856,10 @@
         MethodReference(dex_file_, it.GetMemberIndex()));
     if (HasCompiledCode(compiled_method)) {
       for (const LinkerPatch& patch : compiled_method->GetPatches()) {
-        if (patch.GetType() == LinkerPatch::Type::kMethodBssEntry) {
+        if (patch.GetType() == LinkerPatch::Type::kDataBimgRelRo) {
+          writer_->data_bimg_rel_ro_entries_.Overwrite(patch.BootImageOffset(),
+                                                       /* placeholder */ 0u);
+        } else if (patch.GetType() == LinkerPatch::Type::kMethodBssEntry) {
           MethodReference target_method = patch.TargetMethod();
           AddBssReference(target_method,
                           target_method.dex_file->NumMethodIds(),
@@ -1776,6 +1790,16 @@
         for (const LinkerPatch& patch : compiled_method->GetPatches()) {
           uint32_t literal_offset = patch.LiteralOffset();
           switch (patch.GetType()) {
+            case LinkerPatch::Type::kDataBimgRelRo: {
+              uint32_t target_offset =
+                  writer_->data_bimg_rel_ro_start_ +
+                  writer_->data_bimg_rel_ro_entries_.Get(patch.BootImageOffset());
+              writer_->relative_patcher_->PatchPcRelativeReference(&patched_code_,
+                                                                   patch,
+                                                                   offset_ + literal_offset,
+                                                                   target_offset);
+              break;
+            }
             case LinkerPatch::Type::kMethodBssEntry: {
               uint32_t target_offset =
                   writer_->bss_start_ + writer_->bss_method_entries_.Get(patch.TargetMethod());
@@ -2510,6 +2534,25 @@
   return offset;
 }
 
+size_t OatWriter::InitDataBimgRelRoLayout(size_t offset) {
+  DCHECK_EQ(data_bimg_rel_ro_size_, 0u);
+  if (data_bimg_rel_ro_entries_.empty()) {
+    // Nothing to put to the .data.bimg.rel.ro section.
+    return offset;
+  }
+
+  data_bimg_rel_ro_start_ = RoundUp(offset, kPageSize);
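+  // Page-align the section start so that the runtime can mprotect() the relocation
+  // data independently of the preceding executable code.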
+
+  for (auto& entry : data_bimg_rel_ro_entries_) {
+    size_t& entry_offset = entry.second;
+    entry_offset = data_bimg_rel_ro_size_;
+    data_bimg_rel_ro_size_ += sizeof(uint32_t);
+  }
+
+  offset = data_bimg_rel_ro_start_ + data_bimg_rel_ro_size_;
+  return offset;
+}
+
 void OatWriter::InitBssLayout(InstructionSet instruction_set) {
   {
     InitBssLayoutMethodVisitor visitor(this);
@@ -2905,6 +2948,49 @@
     return false;
   }
 
+  if (data_bimg_rel_ro_size_ != 0u) {
+    write_state_ = WriteState::kWriteDataBimgRelRo;
+  } else {
+    if (!CheckOatSize(out, file_offset, relative_offset)) {
+      return false;
+    }
+    write_state_ = WriteState::kWriteHeader;
+  }
+  return true;
+}
+
+bool OatWriter::WriteDataBimgRelRo(OutputStream* out) {
+  CHECK(write_state_ == WriteState::kWriteDataBimgRelRo);
+
+  // Wrap out to update checksum with each write.
+  ChecksumUpdatingOutputStream checksum_updating_out(out, oat_header_.get());
+  out = &checksum_updating_out;
+
+  const size_t file_offset = oat_data_offset_;
+  size_t relative_offset = data_bimg_rel_ro_start_;
+
+  // Record the padding before the .data.bimg.rel.ro section.
+  // Do not write anything; this zero-filled part was skipped (Seek()) when starting the section.
+  size_t code_end = GetOatHeader().GetExecutableOffset() + code_size_;
+  DCHECK_EQ(RoundUp(code_end, kPageSize), relative_offset);
+  size_t padding_size = relative_offset - code_end;
+  DCHECK_EQ(size_data_bimg_rel_ro_alignment_, 0u);
+  size_data_bimg_rel_ro_alignment_ = padding_size;
+
+  relative_offset = WriteDataBimgRelRo(out, file_offset, relative_offset);
+  if (relative_offset == 0) {
+    LOG(ERROR) << "Failed to write boot image relocations to " << out->GetLocation();
+    return false;
+  }
+
+  if (!CheckOatSize(out, file_offset, relative_offset)) {
+    return false;
+  }
+  write_state_ = WriteState::kWriteHeader;
+  return true;
+}
+
+bool OatWriter::CheckOatSize(OutputStream* out, size_t file_offset, size_t relative_offset) {
   const off_t oat_end_file_offset = out->Seek(0, kSeekCurrent);
   if (oat_end_file_offset == static_cast<off_t>(-1)) {
     LOG(ERROR) << "Failed to get oat end file offset in " << out->GetLocation();
@@ -2939,6 +3025,8 @@
     DO_STAT(size_method_header_);
     DO_STAT(size_code_);
     DO_STAT(size_code_alignment_);
+    DO_STAT(size_data_bimg_rel_ro_);
+    DO_STAT(size_data_bimg_rel_ro_alignment_);
     DO_STAT(size_relative_call_thunks_);
     DO_STAT(size_misc_thunks_);
     DO_STAT(size_vmap_table_);
@@ -3316,6 +3404,32 @@
   return relative_offset;
 }
 
+size_t OatWriter::WriteDataBimgRelRo(OutputStream* out,
+                                     size_t file_offset,
+                                     size_t relative_offset) {
+  if (data_bimg_rel_ro_entries_.empty()) {
+    return relative_offset;
+  }
+
+  // Write the entire .data.bimg.rel.ro with a single WriteFully().
+  std::vector<uint32_t> data;
+  data.reserve(data_bimg_rel_ro_entries_.size());
+  for (const auto& entry : data_bimg_rel_ro_entries_) {
+    uint32_t boot_image_offset = entry.first;
+    data.push_back(boot_image_offset);
+  }
+  DCHECK_EQ(data.size(), data_bimg_rel_ro_entries_.size());
+  DCHECK_OFFSET();
+  if (!out->WriteFully(data.data(), data.size() * sizeof(data[0]))) {
+    PLOG(ERROR) << "Failed to write .data.bimg.rel.ro in " << out->GetLocation();
+    return 0u;
+  }
+  DCHECK_EQ(size_data_bimg_rel_ro_, 0u);
+  size_data_bimg_rel_ro_ = data.size() * sizeof(data[0]);
+  relative_offset += size_data_bimg_rel_ro_;
+  return relative_offset;
+}
+
 bool OatWriter::RecordOatDataOffset(OutputStream* out) {
   // Get the elf file offset of the oat file.
   const off_t raw_file_offset = out->Seek(0, kSeekCurrent);
diff --git a/dex2oat/linker/oat_writer.h b/dex2oat/linker/oat_writer.h
index 0cb0ef2..db249c0 100644
--- a/dex2oat/linker/oat_writer.h
+++ b/dex2oat/linker/oat_writer.h
@@ -137,6 +137,7 @@
   //   - PrepareLayout(),
   //   - WriteRodata(),
   //   - WriteCode(),
+  //   - WriteDataBimgRelRo() iff GetDataBimgRelRoSize() != 0,
   //   - WriteHeader().
 
   // Add dex file source(s) from a file, either a plain dex file or
@@ -197,6 +198,10 @@
   bool WriteRodata(OutputStream* out);
   // Write the code to the .text section.
   bool WriteCode(OutputStream* out);
+  // Write the boot image relocation data to the .data.bimg.rel.ro section.
+  bool WriteDataBimgRelRo(OutputStream* out);
+  // Check the size of the written oat file.
+  bool CheckOatSize(OutputStream* out, size_t file_offset, size_t relative_offset);
   // Write the oat header. This finalizes the oat file.
   bool WriteHeader(OutputStream* out,
                    uint32_t image_file_location_oat_checksum,
@@ -218,10 +223,18 @@
     return *oat_header_;
   }
 
+  size_t GetCodeSize() const {
+    return code_size_;
+  }
+
   size_t GetOatSize() const {
     return oat_size_;
   }
 
+  size_t GetDataBimgRelRoSize() const {
+    return data_bimg_rel_ro_size_;
+  }
+
   size_t GetBssSize() const {
     return bss_size_;
   }
@@ -323,6 +336,7 @@
   size_t InitOatDexFiles(size_t offset);
   size_t InitOatCode(size_t offset);
   size_t InitOatCodeDexFiles(size_t offset);
+  size_t InitDataBimgRelRoLayout(size_t offset);
   void InitBssLayout(InstructionSet instruction_set);
 
   size_t WriteClassOffsets(OutputStream* out, size_t file_offset, size_t relative_offset);
@@ -332,6 +346,7 @@
   size_t WriteOatDexFiles(OutputStream* out, size_t file_offset, size_t relative_offset);
   size_t WriteCode(OutputStream* out, size_t file_offset, size_t relative_offset);
   size_t WriteCodeDexFiles(OutputStream* out, size_t file_offset, size_t relative_offset);
+  size_t WriteDataBimgRelRo(OutputStream* out, size_t file_offset, size_t relative_offset);
 
   bool RecordOatDataOffset(OutputStream* out);
   bool WriteTypeLookupTables(OutputStream* oat_rodata,
@@ -360,6 +375,7 @@
     kPrepareLayout,
     kWriteRoData,
     kWriteText,
+    kWriteDataBimgRelRo,
     kWriteHeader,
     kDone
   };
@@ -401,9 +417,18 @@
   // Offset of section holding quickening info inside Vdex.
   size_t vdex_quickening_info_offset_;
 
+  // Size of the .text section.
+  size_t code_size_;
+
   // Size required for Oat data structures.
   size_t oat_size_;
 
+  // The start of the required .data.bimg.rel.ro section.
+  size_t data_bimg_rel_ro_start_;
+
+  // The size of the required .data.bimg.rel.ro section holding the boot image relocations.
+  size_t data_bimg_rel_ro_size_;
+
   // The start of the required .bss section.
   size_t bss_start_;
 
@@ -416,6 +441,10 @@
   // The offset of the GC roots in .bss section.
   size_t bss_roots_offset_;
 
+  // Map for allocating .data.bimg.rel.ro entries. Indexed by the boot image offset of the
+  // relocation. The value is the assigned offset within the .data.bimg.rel.ro section.
+  SafeMap<uint32_t, size_t> data_bimg_rel_ro_entries_;
+
   // Map for recording references to ArtMethod entries in .bss.
   SafeMap<const DexFile*, BitVector> bss_method_entry_references_;
 
@@ -484,6 +513,8 @@
   uint32_t size_method_header_;
   uint32_t size_code_;
   uint32_t size_code_alignment_;
+  uint32_t size_data_bimg_rel_ro_;
+  uint32_t size_data_bimg_rel_ro_alignment_;
   uint32_t size_relative_call_thunks_;
   uint32_t size_misc_thunks_;
   uint32_t size_vmap_table_;
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index 00b9abe..f713ed6 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -216,10 +216,9 @@
                                     instruction_set_features_.get());
     oat_writer.Initialize(compiler_driver_.get(), nullptr, dex_files);
     oat_writer.PrepareLayout(&patcher);
-    size_t rodata_size = oat_writer.GetOatHeader().GetExecutableOffset();
-    size_t text_size = oat_writer.GetOatSize() - rodata_size;
-    elf_writer->PrepareDynamicSection(rodata_size,
-                                      text_size,
+    elf_writer->PrepareDynamicSection(oat_writer.GetOatHeader().GetExecutableOffset(),
+                                      oat_writer.GetCodeSize(),
+                                      oat_writer.GetDataBimgRelRoSize(),
                                       oat_writer.GetBssSize(),
                                       oat_writer.GetBssMethodsOffset(),
                                       oat_writer.GetBssRootsOffset(),
@@ -248,6 +247,14 @@
     }
     elf_writer->EndText(text);
 
+    if (oat_writer.GetDataBimgRelRoSize() != 0u) {
+      OutputStream* data_bimg_rel_ro = elf_writer->StartDataBimgRelRo();
+      if (!oat_writer.WriteDataBimgRelRo(data_bimg_rel_ro)) {
+        return false;
+      }
+      elf_writer->EndDataBimgRelRo(data_bimg_rel_ro);
+    }
+
     if (!oat_writer.WriteHeader(elf_writer->GetStream(), 42U, 4096U, 0)) {
       return false;
     }
diff --git a/dexlayout/Android.bp b/dexlayout/Android.bp
index bea61d0..24be25b 100644
--- a/dexlayout/Android.bp
+++ b/dexlayout/Android.bp
@@ -44,7 +44,14 @@
         instrumentation: true,
         profile_file: "art/dex2oat.profdata",
         benchmarks: ["dex2oat"],
-    }
+    },
+    target: {
+        android: {
+            lto: {
+                 thin: true,
+            },
+        },
+    },
 }
 
 art_cc_library {
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index b5f5d6f..eaf11be 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -487,6 +487,7 @@
   { kMsaMask | (0xf << 22), kMsa | (0x3 << 22) | 0x19, "copy_u", "yX" },
   { kMsaMask | (0xf << 22), kMsa | (0x4 << 22) | 0x19, "insert", "YD" },
   { kMsaMask | (0xff << 18), kMsa | (0xc0 << 18) | 0x1e, "fill", "vkD" },
+  { kMsaMask | (0xff << 18), kMsa | (0xc1 << 18) | 0x1e, "pcnt", "vkm" },
   { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x7, "ldi", "kx" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x8 << 2), "ld", "kw" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x9 << 2), "st", "kw" },
diff --git a/libartbase/base/os_linux.cc b/libartbase/base/os_linux.cc
index 6b5a604..cb228bd 100644
--- a/libartbase/base/os_linux.cc
+++ b/libartbase/base/os_linux.cc
@@ -89,9 +89,11 @@
 int64_t OS::GetFileSizeBytes(const char* name) {
   struct stat st;
   if (stat(name, &st) == 0) {
-    return -1;  // TODO: Deal with symlinks?
+    return st.st_size;  // TODO: Deal with symlinks? According to the documentation,
+                        // the st_size for a symlink is "the length of the pathname
+                        // it contains, without a terminating null byte."
   } else {
-    return st.st_size;
+    return -1;
   }
 }
 
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 8069408..c8903db 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -172,6 +172,7 @@
     builder_->PrepareDynamicSection(elf_file->GetPath(),
                                     rodata_size,
                                     text_size,
+                                    oat_file_->DataBimgRelRoSize(),
                                     oat_file_->BssSize(),
                                     oat_file_->BssMethodsOffset(),
                                     oat_file_->BssRootsOffset(),
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 09fc2c2..375b050 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -613,56 +613,18 @@
 
 
 .macro INVOKE_STUB_CREATE_FRAME
+SAVE_SIZE=6*8   // x4, x5, x19, x20, FP, LR saved.
+    SAVE_TWO_REGS_INCREASE_FRAME x4, x5, SAVE_SIZE
+    SAVE_TWO_REGS x19, x20, 16
+    SAVE_TWO_REGS xFP, xLR, 32
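+    // Frame layout: x4/x5 at [sp], x19/x20 at [sp, #16], xFP/xLR at [sp, #32]; 48 bytes total.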
 
-SAVE_SIZE=15*8   // x4, x5, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, SP, LR, FP saved.
-SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
+    mov xFP, sp                            // Use xFP for frame pointer, as it's callee-saved.
+    .cfi_def_cfa_register xFP
 
+    add x10, x2, #(__SIZEOF_POINTER__ + 0xf) // Reserve space for ArtMethod*, arguments and
+    and x10, x10, # ~0xf                   // round up for 16-byte stack alignment.
+    sub sp, sp, x10                        // Adjust SP for ArtMethod*, args and alignment padding.
 
-    mov x9, sp                             // Save stack pointer.
-    .cfi_register sp,x9
-
-    add x10, x2, # SAVE_SIZE_AND_METHOD    // calculate size of frame.
-    sub x10, sp, x10                       // Calculate SP position - saves + ArtMethod* + args
-    and x10, x10, # ~0xf                   // Enforce 16 byte stack alignment.
-    mov sp, x10                            // Set new SP.
-
-    sub x10, x9, #SAVE_SIZE                // Calculate new FP (later). Done here as we must move SP
-    .cfi_def_cfa_register x10              // before this.
-    .cfi_adjust_cfa_offset SAVE_SIZE
-
-    str x28, [x10, #112]
-    .cfi_rel_offset x28, 112
-
-    stp x26, x27, [x10, #96]
-    .cfi_rel_offset x26, 96
-    .cfi_rel_offset x27, 104
-
-    stp x24, x25, [x10, #80]
-    .cfi_rel_offset x24, 80
-    .cfi_rel_offset x25, 88
-
-    stp x22, x23, [x10, #64]
-    .cfi_rel_offset x22, 64
-    .cfi_rel_offset x23, 72
-
-    stp x20, x21, [x10, #48]
-    .cfi_rel_offset x20, 48
-    .cfi_rel_offset x21, 56
-
-    stp x9, x19, [x10, #32]                // Save old stack pointer and x19.
-    .cfi_rel_offset sp, 32
-    .cfi_rel_offset x19, 40
-
-    stp x4, x5, [x10, #16]                 // Save result and shorty addresses.
-    .cfi_rel_offset x4, 16
-    .cfi_rel_offset x5, 24
-
-    stp xFP, xLR, [x10]                    // Store LR & FP.
-    .cfi_rel_offset x29, 0
-    .cfi_rel_offset x30, 8
-
-    mov xFP, x10                           // Use xFP now, as it's callee-saved.
-    .cfi_def_cfa_register x29
     mov xSELF, x3                          // Move thread pointer into SELF register.
 
     // Copy arguments into stack frame.
@@ -677,12 +639,10 @@
     // Copy parameters into the stack. Use numeric label as this is a macro and Clang's assembler
     // does not have unique-id variables.
 1:
-    cmp w2, #0
-    beq 2f
+    cbz w2, 2f
     sub w2, w2, #4      // Need 65536 bytes of range.
     ldr w10, [x1, x2]
     str w10, [x9, x2]
-
     b 1b
 
 2:
@@ -699,29 +659,14 @@
     // Branch to method.
     blr x9
 
-    // Restore return value address and shorty address.
-    ldp x4, x5, [xFP, #16]
-    .cfi_restore x4
-    .cfi_restore x5
+    // Pop the ArtMethod* (null), arguments and alignment padding from the stack.
+    mov sp, xFP
+    .cfi_def_cfa_register sp
 
-    ldr x28, [xFP, #112]
-    .cfi_restore x28
-
-    ldp x26, x27, [xFP, #96]
-    .cfi_restore x26
-    .cfi_restore x27
-
-    ldp x24, x25, [xFP, #80]
-    .cfi_restore x24
-    .cfi_restore x25
-
-    ldp x22, x23, [xFP, #64]
-    .cfi_restore x22
-    .cfi_restore x23
-
-    ldp x20, x21, [xFP, #48]
-    .cfi_restore x20
-    .cfi_restore x21
+    // Restore saved registers, including the result address (x4) and shorty address (x5).
+    RESTORE_TWO_REGS x19, x20, 16
+    RESTORE_TWO_REGS xFP, xLR, 32
+    RESTORE_TWO_REGS_DECREASE_FRAME x4, x5, SAVE_SIZE
 
     // Store result (w0/x0/s0/d0) appropriately, depending on resultType.
     ldrb w10, [x5]
@@ -731,33 +676,28 @@
 
     // Don't set anything for a void type.
     cmp w10, #'V'
-    beq 3f
+    beq 1f
 
     // Is it a double?
     cmp w10, #'D'
-    bne 1f
-    str d0, [x4]
-    b 3f
+    beq 2f
 
-1:  // Is it a float?
+    // Is it a float?
     cmp w10, #'F'
-    bne 2f
-    str s0, [x4]
-    b 3f
+    beq 3f
 
-2:  // Just store x0. Doesn't matter if it is 64 or 32 bits.
+    // Just store x0. Doesn't matter if it is 64 or 32 bits.
     str x0, [x4]
 
-3:  // Finish up.
-    ldp x2, x19, [xFP, #32]   // Restore stack pointer and x19.
-    .cfi_restore x19
-    mov sp, x2
-    .cfi_restore sp
+1:  // Finish up.
+    ret
 
-    ldp xFP, xLR, [xFP]    // Restore old frame pointer and link register.
-    .cfi_restore x29
-    .cfi_restore x30
+2:  // Store double.
+    str d0, [x4]
+    ret
 
+3:  // Store float.
+    str s0, [x4]
     ret
 
 .endm
@@ -1056,7 +996,7 @@
 
 /*  extern"C" void art_quick_osr_stub(void** stack,                x0
  *                                    size_t stack_size_in_bytes,  x1
- *                                    const uin8_t* native_pc,     x2
+ *                                    const uint8_t* native_pc,    x2
  *                                    JValue *result,              x3
  *                                    char   *shorty,              x4
  *                                    Thread *self)                x5
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 1d72875..7a1c3de 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -3361,9 +3361,10 @@
   CHECK_EQ(dex_cache_location, dex_file_suffix);
   const OatFile* oat_file =
       (dex_file.GetOatDexFile() != nullptr) ? dex_file.GetOatDexFile()->GetOatFile() : nullptr;
-  // Clean up pass to remove null dex caches. Also check if we need to initialize OatFile .bss.
-  // Null dex caches can occur due to class unloading and we are lazily removing null entries.
-  bool initialize_oat_file_bss = (oat_file != nullptr);
+  // Clean up pass to remove null dex caches; null dex caches can occur due to class unloading
+  // and we are lazily removing null entries. Also check if we need to initialize OatFile data
+  // (.data.bimg.rel.ro and .bss sections) needed for code execution.
+  bool initialize_oat_file_data = (oat_file != nullptr) && oat_file->IsExecutable();
   JavaVMExt* const vm = self->GetJniEnv()->GetVm();
   for (auto it = dex_caches_.begin(); it != dex_caches_.end(); ) {
     DexCacheData data = *it;
@@ -3371,15 +3372,36 @@
       vm->DeleteWeakGlobalRef(self, data.weak_root);
       it = dex_caches_.erase(it);
     } else {
-      if (initialize_oat_file_bss &&
+      if (initialize_oat_file_data &&
           it->dex_file->GetOatDexFile() != nullptr &&
           it->dex_file->GetOatDexFile()->GetOatFile() == oat_file) {
-        initialize_oat_file_bss = false;  // Already initialized.
+        initialize_oat_file_data = false;  // Already initialized.
       }
       ++it;
     }
   }
-  if (initialize_oat_file_bss) {
+  if (initialize_oat_file_data) {
+    // Initialize the .data.bimg.rel.ro section.
+    if (!oat_file->GetBootImageRelocations().empty()) {
+      uint8_t* reloc_begin = const_cast<uint8_t*>(oat_file->DataBimgRelRoBegin());
+      CheckedCall(mprotect,
+                  "un-protect boot image relocations",
+                  reloc_begin,
+                  oat_file->DataBimgRelRoSize(),
+                  PROT_READ | PROT_WRITE);
+      uint32_t boot_image_begin = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(
+          Runtime::Current()->GetHeap()->GetBootImageSpaces().front()->Begin()));
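+      // The section stores 32-bit offsets from the boot image start; adding the
+      // runtime's boot image base turns each entry into an absolute address.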
+      for (const uint32_t& relocation : oat_file->GetBootImageRelocations()) {
+        const_cast<uint32_t&>(relocation) += boot_image_begin;
+      }
+      CheckedCall(mprotect,
+                  "protect boot image relocations",
+                  reloc_begin,
+                  oat_file->DataBimgRelRoSize(),
+                  PROT_READ);
+    }
+
+    // Initialize the .bss section.
     // TODO: Pre-initialize from boot/app image?
     ArtMethod* resolution_method = Runtime::Current()->GetResolutionMethod();
     for (ArtMethod*& entry : oat_file->GetBssMethods()) {
diff --git a/runtime/oat.h b/runtime/oat.h
index 292c9d6..0fa1d4b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  // Last oat version changed reason: Math.pow() intrinsic.
-  static constexpr uint8_t kOatVersion[] = { '1', '3', '8', '\0' };
+  // Last oat version changed reason: Retrieve ArtMethod* from .data.bimg.rel.ro .
+  static constexpr uint8_t kOatVersion[] = { '1', '3', '9', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index b0e1de2..66347e9 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -343,6 +343,19 @@
   // Readjust to be non-inclusive upper bound.
   end_ += sizeof(uint32_t);
 
+  data_bimg_rel_ro_begin_ = FindDynamicSymbolAddress("oatdatabimgrelro", &symbol_error_msg);
+  if (data_bimg_rel_ro_begin_ != nullptr) {
+    data_bimg_rel_ro_end_ =
+        FindDynamicSymbolAddress("oatdatabimgrelrolastword", &symbol_error_msg);
+    if (data_bimg_rel_ro_end_ == nullptr) {
+      *error_msg =
+          StringPrintf("Failed to find oatdatabimgrelrolastword symbol in '%s'", file_path.c_str());
+      return false;
+    }
+    // Readjust to be non-inclusive upper bound.
+    data_bimg_rel_ro_end_ += sizeof(uint32_t);
+  }
+
   bss_begin_ = const_cast<uint8_t*>(FindDynamicSymbolAddress("oatbss", &symbol_error_msg));
   if (bss_begin_ == nullptr) {
     // No .bss section.
@@ -536,6 +549,17 @@
   }
   const uint8_t* oat = Begin() + oat_dex_files_offset;  // Jump to the OatDexFile records.
 
+  if (!IsAligned<sizeof(uint32_t)>(data_bimg_rel_ro_begin_) ||
+      !IsAligned<sizeof(uint32_t)>(data_bimg_rel_ro_end_) ||
+      data_bimg_rel_ro_begin_ > data_bimg_rel_ro_end_) {
+    *error_msg = StringPrintf("In oat file '%s' found unaligned or unordered databimgrelro "
+                                  "symbol(s): begin = %p, end = %p",
+                              GetLocation().c_str(),
+                              data_bimg_rel_ro_begin_,
+                              data_bimg_rel_ro_end_);
+    return false;
+  }
+
   DCHECK_GE(static_cast<size_t>(pointer_size), alignof(GcRoot<mirror::Object>));
   if (!IsAligned<kPageSize>(bss_begin_) ||
       !IsAlignedParam(bss_methods_, static_cast<size_t>(pointer_size)) ||
@@ -849,8 +873,29 @@
     }
   }
 
+  Runtime* runtime = Runtime::Current();
+
+  if (DataBimgRelRoBegin() != nullptr) {
+    // Make .data.bimg.rel.ro read-only. ClassLinker shall make it writable for relocation.
+    uint8_t* reloc_begin = const_cast<uint8_t*>(DataBimgRelRoBegin());
+    CheckedCall(mprotect, "protect relocations", reloc_begin, DataBimgRelRoSize(), PROT_READ);
+    if (UNLIKELY(runtime == nullptr)) {
+      // This must be oatdump without boot image.
+    } else if (!IsExecutable()) {
+      // Do not check whether we have a boot image if the oat file is not executable.
+    } else if (UNLIKELY(runtime->GetHeap()->GetBootImageSpaces().empty())) {
+      *error_msg = StringPrintf("Cannot load oat file '%s' with .data.bimg.rel.ro as executable "
+                                    "without boot image.",
+                                GetLocation().c_str());
+      return false;
+    } else {
+      // ClassLinker shall perform the relocation when we register a dex file from
+      // this oat file. We do not do the relocation here to avoid dirtying the pages
+      // if the code is never actually executed.
+    }
+  }
+
   if (boot_image_tables != nullptr) {
-    Runtime* runtime = Runtime::Current();
     if (UNLIKELY(runtime == nullptr)) {
       // This must be oatdump without boot image. Make sure the .bss is inaccessible.
       CheckedCall(mprotect, "protect bss", const_cast<uint8_t*>(BssBegin()), BssSize(), PROT_NONE);
@@ -1513,6 +1558,8 @@
       vdex_(nullptr),
       begin_(nullptr),
       end_(nullptr),
+      data_bimg_rel_ro_begin_(nullptr),
+      data_bimg_rel_ro_end_(nullptr),
       bss_begin_(nullptr),
       bss_end_(nullptr),
       bss_methods_(nullptr),
@@ -1542,22 +1589,6 @@
   return end_;
 }
 
-const uint8_t* OatFile::BssBegin() const {
-  return bss_begin_;
-}
-
-const uint8_t* OatFile::BssEnd() const {
-  return bss_end_;
-}
-
-const uint8_t* OatFile::VdexBegin() const {
-  return vdex_begin_;
-}
-
-const uint8_t* OatFile::VdexEnd() const {
-  return vdex_end_;
-}
-
 const uint8_t* OatFile::DexBegin() const {
   return vdex_->Begin();
 }
@@ -1566,6 +1597,16 @@
   return vdex_->End();
 }
 
+ArrayRef<const uint32_t> OatFile::GetBootImageRelocations() const {
+  if (data_bimg_rel_ro_begin_ != nullptr) {
+    const uint32_t* relocations = reinterpret_cast<const uint32_t*>(data_bimg_rel_ro_begin_);
+    const uint32_t* relocations_end = reinterpret_cast<const uint32_t*>(data_bimg_rel_ro_end_);
+    return ArrayRef<const uint32_t>(relocations, relocations_end - relocations);
+  } else {
+    return ArrayRef<const uint32_t>();
+  }
+}
+
 ArrayRef<ArtMethod*> OatFile::GetBssMethods() const {
   if (bss_methods_ != nullptr) {
     ArtMethod** methods = reinterpret_cast<ArtMethod**>(bss_methods_);
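
At run time, a BootImageRelRo method load emitted by the compiler is a PC-relative
32-bit load from this section; once the section is relocated, the loaded word is the
ArtMethod's address. A conceptual C++ rendering (not the emitted assembly):

    #include <cstdint>

    class ArtMethod;  // opaque here

    ArtMethod* LoadMethodFromRelRo(const uint32_t* entry) {
      // Valid only after ClassLinker has relocated the section; 32 bits suffice
      // because the boot image is mapped in the low 4 GiB address range.
      return reinterpret_cast<ArtMethod*>(static_cast<uintptr_t>(*entry));
    }
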
diff --git a/runtime/oat_file.h b/runtime/oat_file.h
index 3c2cd00..24868dd 100644
--- a/runtime/oat_file.h
+++ b/runtime/oat_file.h
@@ -275,6 +275,10 @@
     return p >= Begin() && p < End();
   }
 
+  size_t DataBimgRelRoSize() const {
+    return DataBimgRelRoEnd() - DataBimgRelRoBegin();
+  }
+
   size_t BssSize() const {
     return BssEnd() - BssBegin();
   }
@@ -300,15 +304,19 @@
   const uint8_t* Begin() const;
   const uint8_t* End() const;
 
-  const uint8_t* BssBegin() const;
-  const uint8_t* BssEnd() const;
+  const uint8_t* DataBimgRelRoBegin() const { return data_bimg_rel_ro_begin_; }
+  const uint8_t* DataBimgRelRoEnd() const { return data_bimg_rel_ro_end_; }
 
-  const uint8_t* VdexBegin() const;
-  const uint8_t* VdexEnd() const;
+  const uint8_t* BssBegin() const { return bss_begin_; }
+  const uint8_t* BssEnd() const { return bss_end_; }
+
+  const uint8_t* VdexBegin() const { return vdex_begin_; }
+  const uint8_t* VdexEnd() const { return vdex_end_; }
 
   const uint8_t* DexBegin() const;
   const uint8_t* DexEnd() const;
 
+  ArrayRef<const uint32_t> GetBootImageRelocations() const;
   ArrayRef<ArtMethod*> GetBssMethods() const;
   ArrayRef<GcRoot<mirror::Object>> GetBssGcRoots() const;
 
@@ -355,6 +363,12 @@
   // Pointer to end of oat region for bounds checking.
   const uint8_t* end_;
 
+  // Pointer to the .data.bimg.rel.ro section, if present, otherwise null.
+  const uint8_t* data_bimg_rel_ro_begin_;
+
+  // Pointer to the end of the .data.bimg.rel.ro section, if present, otherwise null.
+  const uint8_t* data_bimg_rel_ro_end_;
+
   // Pointer to the .bss section, if present, otherwise null.
   uint8_t* bss_begin_;
 
diff --git a/test/552-checker-sharpening/src/Main.java b/test/552-checker-sharpening/src/Main.java
index 3173afd..121e8f2 100644
--- a/test/552-checker-sharpening/src/Main.java
+++ b/test/552-checker-sharpening/src/Main.java
@@ -195,6 +195,32 @@
     return Other.class;
   }
 
+  /// CHECK-START-{ARM,ARM64,MIPS,MIPS64,X86,X86_64}: java.lang.String Main.$noinline$toHexString(int) builder (after)
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:RuntimeCall
+
+  /// CHECK-START-{ARM,ARM64,MIPS,MIPS64,X86,X86_64}: java.lang.String Main.$noinline$toHexString(int) sharpening (after)
+  // Note: load kind depends on PIC/non-PIC
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:{{BootImageRelRo|DirectAddress}}
+  public static String $noinline$toHexString(int value) {
+    return Integer.toString(value, 16);
+  }
+
+  /// CHECK-START-{ARM,ARM64,MIPS,MIPS64,X86,X86_64}: java.lang.String Main.$noinline$toHexStringIndirect(int) builder (after)
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:RuntimeCall
+
+  /// CHECK-START-{ARM,ARM64,MIPS,MIPS64,X86,X86_64}: java.lang.String Main.$noinline$toHexStringIndirect(int) sharpening (after)
+  /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
+
+  /// CHECK-START-X86: java.lang.String Main.$noinline$toHexStringIndirect(int) pc_relative_fixups_x86 (before)
+  /// CHECK-NOT:            X86ComputeBaseMethodAddress
+
+  /// CHECK-START-X86: java.lang.String Main.$noinline$toHexStringIndirect(int) pc_relative_fixups_x86 (after)
+  /// CHECK-DAG:            X86ComputeBaseMethodAddress
+  /// CHECK-DAG:            InvokeStaticOrDirect method_load_kind:BssEntry
+  public static String $noinline$toHexStringIndirect(int value) {
+    return $noinline$toHexString(value);
+  }
+
   public static void main(String[] args) {
     assertIntEquals(1, testSimple(1));
     assertIntEquals(1, testDiamond(false, 1));
@@ -208,6 +234,8 @@
     assertStringEquals("non-boot-image-string", $noinline$getNonBootImageString());
     assertClassEquals(String.class, $noinline$getStringClass());
     assertClassEquals(Other.class, $noinline$getOtherClass());
+    assertStringEquals("12345678", $noinline$toHexString(0x12345678));
+    assertStringEquals("76543210", $noinline$toHexStringIndirect(0x76543210));
   }
 }
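
The checker lines above encode the sharpening rule being tested: a callee resolving
into the boot image (Integer.toString) gets a BootImageRelRo load under PIC or a
DirectAddress otherwise, while a callee outside the boot image (the app's own
$noinline$toHexString) falls back to a BssEntry load. A simplified sketch of that
decision (names are illustrative, not the compiler's actual API):

    enum class MethodLoadKind { kBootImageRelRo, kDirectAddress, kBssEntry };

    MethodLoadKind ChooseLoadKind(bool callee_in_boot_image, bool compiling_pic) {
      if (callee_in_boot_image) {
        return compiling_pic ? MethodLoadKind::kBootImageRelRo
                             : MethodLoadKind::kDirectAddress;
      }
      return MethodLoadKind::kBssEntry;  // resolved lazily via the oat file's .bss
    }
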
 
diff --git a/tools/external_oj_libjdwp_art_failures.txt b/tools/external_oj_libjdwp_art_failures.txt
index 6c2206f..9b6ff98 100644
--- a/tools/external_oj_libjdwp_art_failures.txt
+++ b/tools/external_oj_libjdwp_art_failures.txt
@@ -13,12 +13,6 @@
   name: "org.apache.harmony.jpda.tests.jdwp.ThreadReference.ThreadGroup002Test#testThreadGroup002"
 },
 {
-  description: "Test fails due to modifiers not including ACC_SUPER",
-  result: EXEC_FAILED,
-  bug: 66906055,
-  name: "org.apache.harmony.jpda.tests.jdwp.ReferenceType.ModifiersTest#testModifiers001"
-},
-{
   description: "Test fails due to static values not being set correctly.",
   result: EXEC_FAILED,
   bug: 66905894,