Diffstat (limited to 'compiler')
45 files changed, 1196 insertions, 434 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp index 11521e68d0..e1d382f6f4 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -161,6 +161,7 @@ art_cc_defaults { "utils/x86/assembler_x86.cc", "utils/x86/jni_macro_assembler_x86.cc", "utils/x86/managed_register_x86.cc", + "optimizing/instruction_simplifier_x86.cc", ], }, x86_64: { @@ -346,6 +347,7 @@ art_cc_test { "optimizing/parallel_move_test.cc", "optimizing/pretty_printer_test.cc", "optimizing/reference_type_propagation_test.cc", + "optimizing/select_generator_test.cc", "optimizing/side_effects_test.cc", "optimizing/ssa_liveness_analysis_test.cc", "optimizing/ssa_test.cc", diff --git a/compiler/debug/elf_debug_info_writer.h b/compiler/debug/elf_debug_info_writer.h index f2a942f34a..bda7108c74 100644 --- a/compiler/debug/elf_debug_info_writer.h +++ b/compiler/debug/elf_debug_info_writer.h @@ -208,8 +208,7 @@ class ElfCompilationUnitWriter { std::vector<DexRegisterMap> dex_reg_maps; if (accessor.HasCodeItem() && mi->code_info != nullptr) { code_info.reset(new CodeInfo(mi->code_info)); - for (size_t s = 0; s < code_info->GetNumberOfStackMaps(); ++s) { - const StackMap stack_map = code_info->GetStackMapAt(s); + for (StackMap stack_map : code_info->GetStackMaps()) { dex_reg_maps.push_back(code_info->GetDexRegisterMapOf(stack_map)); } } diff --git a/compiler/debug/elf_debug_line_writer.h b/compiler/debug/elf_debug_line_writer.h index a7adab5506..3d78943cd0 100644 --- a/compiler/debug/elf_debug_line_writer.h +++ b/compiler/debug/elf_debug_line_writer.h @@ -101,9 +101,7 @@ class ElfDebugLineWriter { // Use stack maps to create mapping table from pc to dex. const CodeInfo code_info(mi->code_info); pc2dex_map.reserve(code_info.GetNumberOfStackMaps()); - for (uint32_t s = 0; s < code_info.GetNumberOfStackMaps(); s++) { - StackMap stack_map = code_info.GetStackMapAt(s); - DCHECK(stack_map.IsValid()); + for (StackMap stack_map : code_info.GetStackMaps()) { const uint32_t pc = stack_map.GetNativePcOffset(isa); const int32_t dex = stack_map.GetDexPc(); pc2dex_map.push_back({pc, dex}); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 26c9e9fa2b..d1c83ce625 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -5493,36 +5493,13 @@ void LocationsBuilderARM64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(LocationFrom(kArtMethodRegister)); - } else { - locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Location temp = instruction->GetLocations()->GetTemp(0); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); - __ Ldr(XRegisterFrom(temp), MemOperand(tr, QUICK_ENTRY_POINT(pNewEmptyString))); - __ Ldr(lr, MemOperand(XRegisterFrom(temp), code_offset.Int32Value())); - - { - // Ensure the pc position is recorded immediately after the `blr` instruction. - ExactAssemblyScope eas(GetVIXLAssembler(), - kInstructionSize, - CodeBufferCheckScope::kExactSize); - __ blr(lr); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__); } diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 9e1ef4002e..deab239362 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -5479,34 +5479,14 @@ void InstructionCodeGeneratorARMVIXL::VisitUShr(HUShr* ushr) { void LocationsBuilderARMVIXL::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); - if (instruction->IsStringAlloc()) { - locations->AddTemp(LocationFrom(kMethodRegister)); - } else { - InvokeRuntimeCallingConventionARMVIXL calling_convention; - locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); - } + InvokeRuntimeCallingConventionARMVIXL calling_convention; + locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(LocationFrom(r0)); } void InstructionCodeGeneratorARMVIXL::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. - vixl32::Register temp = RegisterFrom(instruction->GetLocations()->GetTemp(0)); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize); - GetAssembler()->LoadFromOffset(kLoadWord, temp, tr, QUICK_ENTRY_POINT(pNewEmptyString)); - GetAssembler()->LoadFromOffset(kLoadWord, lr, temp, code_offset.Int32Value()); - // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used. 
- ExactAssemblyScope aas(GetVIXLAssembler(), - vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - __ blx(lr); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ 11); } diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index f0ef30ee37..c7295e4db1 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -8701,30 +8701,13 @@ void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes care - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); - __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); - __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value()); - __ Jalr(T9); - __ NopIfNoReordering(); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); } void LocationsBuilderMIPS::VisitNot(HNot* instruction) { diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 6e72727f59..ffde45e95e 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -6632,31 +6632,13 @@ void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes care - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- GpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); - MemberOffset code_offset = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); - __ LoadFromOffset(kLoadDoubleword, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); - __ LoadFromOffset(kLoadDoubleword, T9, temp, code_offset.Int32Value()); - __ Jalr(T9); - __ Nop(); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); } void LocationsBuilderMIPS64::VisitNot(HNot* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 086ae07a06..58808769e2 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } -void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? 
- LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 4d31ab68d1..4795e86933 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD type"; + } } -void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? 
- LOG(FATAL) << "No SIMD for " << instruction->GetId(); + +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index d189476a48..1c0d283ef6 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -4496,29 +4496,14 @@ void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize); - __ fs()->movl(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString))); - __ call(Address(temp, code_offset.Int32Value())); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - DCHECK(!codegen_->IsLeafMethod()); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + DCHECK(!codegen_->IsLeafMethod()); } void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index bea3da070a..3073be6ca7 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -4343,29 +4343,14 @@ void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); } void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. - CpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<CpuRegister>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize); - __ gs()->movq(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString), /* no_rip */ true)); - __ call(Address(temp, code_offset.SizeValue())); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - DCHECK(!codegen_->IsLeafMethod()); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + DCHECK(!codegen_->IsLeafMethod()); } void LocationsBuilderX86_64::VisitNewArray(HNewArray* instruction) { diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index 7d918c47ca..ba160e55f8 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -1075,6 +1075,10 @@ HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, u if (load_class->NeedsAccessCheck() || klass->IsFinalizable() || !klass->IsInstantiable()) { entrypoint = kQuickAllocObjectWithChecks; } + // We will always be able to resolve the string class since it is in the BCP. 
+ if (!klass.IsNull() && klass->IsStringClass()) { + entrypoint = kQuickAllocStringObject; + } // Consider classes we haven't resolved as potentially finalizable. bool finalizable = (klass == nullptr) || klass->IsFinalizable(); @@ -1308,29 +1312,25 @@ bool HInstructionBuilder::HandleStringInit(HInvoke* invoke, HInstruction* arg_this = LoadLocal(orig_this_reg, DataType::Type::kReference); // Replacing the NewInstance might render it redundant. Keep a list of these - // to be visited once it is clear whether it is has remaining uses. + // to be visited once it is clear whether it has remaining uses. if (arg_this->IsNewInstance()) { ssa_builder_->AddUninitializedString(arg_this->AsNewInstance()); + // Walk over all vregs and replace any occurrence of `arg_this` with `invoke`. + for (size_t vreg = 0, e = current_locals_->size(); vreg < e; ++vreg) { + if ((*current_locals_)[vreg] == arg_this) { + (*current_locals_)[vreg] = invoke; + } + } } else { - // The only reason a HPhi can flow in a String.<init> is when there is an - // irreducible loop, which will create HPhi for all dex registers at loop entry. DCHECK(arg_this->IsPhi()); - // TODO(b/109666561): Re-enable. - // DCHECK(graph_->HasIrreducibleLoops()); - // Don't bother compiling a method in that situation. While we could look at all - // phis related to the HNewInstance, it's not worth the trouble. - MaybeRecordStat(compilation_stats_, - MethodCompilationStat::kNotCompiledIrreducibleAndStringInit); - return false; + // We can get a phi as input of a String.<init> if there is a loop between the + // allocation and the String.<init> call. As we don't know which other phis might alias + // with `arg_this`, we keep a record of these phis and will analyze their inputs and + // uses once the inputs and users are populated (in ssa_builder.cc). + // Note: we only do this for phis, as it is a somewhat more expensive operation than + // what we're doing above when the input is the `HNewInstance`. + ssa_builder_->AddUninitializedStringPhi(arg_this->AsPhi(), invoke); } - - // Walk over all vregs and replace any occurrence of `arg_this` with `invoke`. - for (size_t vreg = 0, e = current_locals_->size(); vreg < e; ++vreg) { - if ((*current_locals_)[vreg] == arg_this) { - (*current_locals_)[vreg] = invoke; - } - } - return true; } diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc new file mode 100644 index 0000000000..b3f67d6e84 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.cc @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "instruction_simplifier_x86.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "mirror/array-inl.h" +#include "code_generator.h" + + +namespace art { + +namespace x86 { + +class InstructionSimplifierX86Visitor : public HGraphVisitor { + public: + InstructionSimplifierX86Visitor(HGraph* graph, + CodeGeneratorX86 *codegen, + OptimizingCompilerStats* stats) + : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {} + + private: + void RecordSimplification() { + MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch); + } + + bool HasCpuFeatureFlag() { + return (codegen_->GetInstructionSetFeatures().HasAVX2()); + } + + /** + * This simplifier uses a special-purpose BB visitor. + * (1) No need to visit Phi nodes. + * (2) Since statements can be removed in a "forward" fashion, + * the visitor should test if each statement is still there. + */ + void VisitBasicBlock(HBasicBlock* block) OVERRIDE { + // TODO: fragile iteration, provide more robust iterators? + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (instruction->IsInBlock()) { + instruction->Accept(this); + } + } + } + + bool TryGenerateVecMultiplyAccumulate(HVecMul* mul); + void VisitVecMul(HVecMul* instruction) OVERRIDE; + + CodeGeneratorX86* codegen_; + OptimizingCompilerStats* stats_; +}; + +/* generic expressions for FMA +a = (b * c) + a +a = (b * c) – a +*/ +bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) { + if (!(mul->GetPackedType() == DataType::Type::kFloat32 || + mul->GetPackedType() == DataType::Type::kFloat64)) { + return false; + } + ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); + if (mul->HasOnlyOneNonEnvironmentUse()) { + HInstruction* use = mul->GetUses().front().GetUser(); + if (use->IsVecAdd() || use->IsVecSub()) { + // Replace code looking like + // VECMUL tmp, x, y + // VECADD dst, acc, tmp or VECADD dst, tmp, acc + // or + // VECSUB dst, tmp, acc + // with + // VECMULACC dst, acc, x, y + + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HVecBinaryOperation* binop = use->AsVecBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + DCHECK_NE(binop_left, binop_right); + if (use->IsVecSub()) { + if (binop_left == mul) { + accumulator = binop_right; + } + } else { + // VecAdd + if (binop_right == mul) { + accumulator = binop_left; + } else { + DCHECK_EQ(binop_left, mul); + accumulator = binop_right; + } + } + HInstruction::InstructionKind kind = + use->IsVecAdd() ? 
HInstruction::kAdd : HInstruction::kSub; + + if (accumulator != nullptr) { + HVecMultiplyAccumulate* mulacc = + new (allocator) HVecMultiplyAccumulate(allocator, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + binop->GetPackedType(), + binop->GetVectorLength(), + binop->GetDexPc()); + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; + } + } + } + return false; +} + +void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) { + if (HasCpuFeatureFlag()) { + if (TryGenerateVecMultiplyAccumulate(instruction)) { + RecordSimplification(); + } + } +} + +bool InstructionSimplifierX86::Run() { + InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_); + visitor.VisitReversePostOrder(); + return true; +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h new file mode 100644 index 0000000000..1fb199f728 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ +#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ + +#include "nodes.h" +#include "optimization.h" +#include "code_generator_x86.h" + +namespace art { +namespace x86 { + +class InstructionSimplifierX86 : public HOptimization { + public: + InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kInstructionSimplifierX86PassName, stats), + codegen_(down_cast<CodeGeneratorX86*>(codegen)) {} + + static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86"; + + bool Run() OVERRIDE; + + private: + CodeGeneratorX86* codegen_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc index a2124455e2..efb23e7d3e 100644 --- a/compiler/optimizing/loop_analysis.cc +++ b/compiler/optimizing/loop_analysis.cc @@ -17,19 +17,34 @@ #include "loop_analysis.h" #include "base/bit_vector-inl.h" +#include "induction_var_range.h" namespace art { void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info, - LoopAnalysisInfo* analysis_results) { + LoopAnalysisInfo* analysis_results, + int64_t trip_count) { + analysis_results->trip_count_ = trip_count; + for (HBlocksInLoopIterator block_it(*loop_info); !block_it.Done(); block_it.Advance()) { HBasicBlock* block = block_it.Current(); + // Check whether one of the successor is loop exit. 
for (HBasicBlock* successor : block->GetSuccessors()) { if (!loop_info->Contains(*successor)) { analysis_results->exits_num_++; + + // We track number of invariant loop exits which correspond to HIf instruction and + // can be eliminated by loop peeling; other control flow instruction are ignored and will + // not cause loop peeling to happen as they either cannot be inside a loop, or by + // definition cannot be loop exits (unconditional instructions), or are not beneficial for + // the optimization. + HIf* hif = block->GetLastInstruction()->AsIf(); + if (hif != nullptr && !loop_info->Contains(*hif->InputAt(0)->GetBlock())) { + analysis_results->invariant_exits_num_++; + } } } @@ -48,20 +63,13 @@ void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info, } } -bool LoopAnalysis::HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info) { - HGraph* graph = loop_info->GetHeader()->GetGraph(); - for (uint32_t block_id : loop_info->GetBlocks().Indexes()) { - HBasicBlock* block = graph->GetBlocks()[block_id]; - DCHECK(block != nullptr); - if (block->EndsWithIf()) { - HIf* hif = block->GetLastInstruction()->AsIf(); - HInstruction* input = hif->InputAt(0); - if (IsLoopExit(loop_info, hif) && !loop_info->Contains(*input->GetBlock())) { - return true; - } - } +int64_t LoopAnalysis::GetLoopTripCount(HLoopInformation* loop_info, + const InductionVarRange* induction_range) { + int64_t trip_count; + if (!induction_range->HasKnownTripCount(loop_info, &trip_count)) { + trip_count = LoopAnalysisInfo::kUnknownTripCount; } - return false; + return trip_count; } // Default implementation of loop helper; used for all targets unless a custom implementation @@ -77,18 +85,22 @@ class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled. static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6; - bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE { - return loop_analysis_info->HasLongTypeInstructions() || - IsLoopTooBig(loop_analysis_info, + bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE { + return analysis_info->HasLongTypeInstructions() || + IsLoopTooBig(analysis_info, kScalarHeuristicMaxBodySizeInstr, kScalarHeuristicMaxBodySizeBlocks); } - uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED, - uint64_t trip_count) const OVERRIDE { + uint32_t GetScalarUnrollingFactor(const LoopAnalysisInfo* analysis_info) const OVERRIDE { + int64_t trip_count = analysis_info->GetTripCount(); + // Unroll only loops with known trip count. + if (trip_count == LoopAnalysisInfo::kUnknownTripCount) { + return LoopAnalysisInfo::kNoUnrollingFactor; + } uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor; if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } return desired_unrolling_factor; @@ -136,12 +148,12 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper { // TODO: Unroll loops with unknown trip count. DCHECK_NE(vector_length, 0u); if (trip_count < (2 * vector_length + max_peel)) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } // Don't unroll for large loop body size. 
uint32_t instruction_count = block->GetInstructions().CountSize(); if (instruction_count >= kArm64SimdHeuristicMaxBodySizeInstr) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } // Find a beneficial unroll factor with the following restrictions: // - At least one iteration of the transformed loop should be executed. diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h index 7f321b73c8..bcb7b70494 100644 --- a/compiler/optimizing/loop_analysis.h +++ b/compiler/optimizing/loop_analysis.h @@ -21,26 +21,33 @@ namespace art { +class InductionVarRange; class LoopAnalysis; -// No loop unrolling factor (just one copy of the loop-body). -static constexpr uint32_t kNoUnrollingFactor = 1; - // Class to hold cached information on properties of the loop. class LoopAnalysisInfo : public ValueObject { public: + // No loop unrolling factor (just one copy of the loop-body). + static constexpr uint32_t kNoUnrollingFactor = 1; + // Used for unknown and non-constant trip counts (see InductionVarRange::HasKnownTripCount). + static constexpr int64_t kUnknownTripCount = -1; + explicit LoopAnalysisInfo(HLoopInformation* loop_info) - : bb_num_(0), + : trip_count_(kUnknownTripCount), + bb_num_(0), instr_num_(0), exits_num_(0), + invariant_exits_num_(0), has_instructions_preventing_scalar_peeling_(false), has_instructions_preventing_scalar_unrolling_(false), has_long_type_instructions_(false), loop_info_(loop_info) {} + int64_t GetTripCount() const { return trip_count_; } size_t GetNumberOfBasicBlocks() const { return bb_num_; } size_t GetNumberOfInstructions() const { return instr_num_; } size_t GetNumberOfExits() const { return exits_num_; } + size_t GetNumberOfInvariantExits() const { return invariant_exits_num_; } bool HasInstructionsPreventingScalarPeeling() const { return has_instructions_preventing_scalar_peeling_; @@ -50,19 +57,27 @@ class LoopAnalysisInfo : public ValueObject { return has_instructions_preventing_scalar_unrolling_; } + bool HasInstructionsPreventingScalarOpts() const { + return HasInstructionsPreventingScalarPeeling() || HasInstructionsPreventingScalarUnrolling(); + } + bool HasLongTypeInstructions() const { return has_long_type_instructions_; } - const HLoopInformation* GetLoopInfo() const { return loop_info_; } + HLoopInformation* GetLoopInfo() const { return loop_info_; } private: + // Trip count of the loop if known, kUnknownTripCount otherwise. + int64_t trip_count_; // Number of basic blocks in the loop body. size_t bb_num_; // Number of instructions in the loop body. size_t instr_num_; // Number of loop's exits. size_t exits_num_; + // Number of "if" loop exits (with HIf instruction) whose condition is loop-invariant. + size_t invariant_exits_num_; // Whether the loop has instructions which make scalar loop peeling non-beneficial. bool has_instructions_preventing_scalar_peeling_; // Whether the loop has instructions which make scalar loop unrolling non-beneficial. @@ -72,7 +87,7 @@ class LoopAnalysisInfo : public ValueObject { bool has_long_type_instructions_; // Corresponding HLoopInformation. - const HLoopInformation* loop_info_; + HLoopInformation* loop_info_; friend class LoopAnalysis; }; @@ -84,20 +99,12 @@ class LoopAnalysis : public ValueObject { // Calculates loops basic properties like body size, exits number, etc. and fills // 'analysis_results' with this information. 
static void CalculateLoopBasicProperties(HLoopInformation* loop_info, - LoopAnalysisInfo* analysis_results); + LoopAnalysisInfo* analysis_results, + int64_t trip_count); - // Returns whether the loop has at least one loop invariant exit. - static bool HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info); - - // Returns whether HIf's true or false successor is outside the specified loop. - // - // Prerequisite: HIf must be in the specified loop. - static bool IsLoopExit(HLoopInformation* loop_info, const HIf* hif) { - DCHECK(loop_info->Contains(*hif->GetBlock())); - HBasicBlock* true_succ = hif->IfTrueSuccessor(); - HBasicBlock* false_succ = hif->IfFalseSuccessor(); - return (!loop_info->Contains(*true_succ) || !loop_info->Contains(*false_succ)); - } + // Returns the trip count of the loop if it is known and kUnknownTripCount otherwise. + static int64_t GetLoopTripCount(HLoopInformation* loop_info, + const InductionVarRange* induction_range); private: // Returns whether an instruction makes scalar loop peeling/unrolling non-beneficial. @@ -143,9 +150,9 @@ class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { // Returns optimal scalar unrolling factor for the loop. // // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper. - virtual uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED, - uint64_t trip_count ATTRIBUTE_UNUSED) const { - return kNoUnrollingFactor; + virtual uint32_t GetScalarUnrollingFactor( + const LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const { + return LoopAnalysisInfo::kNoUnrollingFactor; } // Returns whether scalar loop peeling is enabled, @@ -160,7 +167,7 @@ class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { int64_t trip_count ATTRIBUTE_UNUSED, uint32_t max_peel ATTRIBUTE_UNUSED, uint32_t vector_length ATTRIBUTE_UNUSED) const { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } }; diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 72aa25302e..440cd3351e 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -744,64 +744,74 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { } bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { - return TryOptimizeInnerLoopFinite(node) || - TryPeelingForLoopInvariantExitsElimination(node) || - TryUnrollingForBranchPenaltyReduction(node); + return TryOptimizeInnerLoopFinite(node) || TryPeelingAndUnrolling(node); } // -// Loop unrolling: generic part methods. +// Scalar loop peeling and unrolling: generic part methods. // -bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) { - // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests) - // as InstructionSet is needed. - if (compiler_options_ == nullptr) { +bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, + bool generate_code) { + if (analysis_info->GetNumberOfExits() > 1) { return false; } - HLoopInformation* loop_info = node->loop_info; - int64_t trip_count = 0; - // Only unroll loops with a known tripcount. 
- if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) { + uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(analysis_info); + if (unrolling_factor == LoopAnalysisInfo::kNoUnrollingFactor) { return false; } - uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count); - if (unrolling_factor == kNoUnrollingFactor) { - return false; - } + if (generate_code) { + // TODO: support other unrolling factors. + DCHECK_EQ(unrolling_factor, 2u); - LoopAnalysisInfo loop_analysis_info(loop_info); - LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); + // Perform unrolling. + HLoopInformation* loop_info = analysis_info->GetLoopInfo(); + PeelUnrollSimpleHelper helper(loop_info); + helper.DoUnrolling(); - // Check "IsLoopClonable" last as it can be time-consuming. - if (loop_analysis_info.HasInstructionsPreventingScalarUnrolling() || - arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || - (loop_analysis_info.GetNumberOfExits() > 1) || - !PeelUnrollHelper::IsLoopClonable(loop_info)) { - return false; + // Remove the redundant loop check after unrolling. + HIf* copy_hif = + helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf(); + int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0; + copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); } + return true; +} - // TODO: support other unrolling factors. - DCHECK_EQ(unrolling_factor, 2u); +bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, + bool generate_code) { + HLoopInformation* loop_info = analysis_info->GetLoopInfo(); + if (!arch_loop_helper_->IsLoopPeelingEnabled()) { + return false; + } - // Perform unrolling. - PeelUnrollSimpleHelper helper(loop_info); - helper.DoUnrolling(); + if (analysis_info->GetNumberOfInvariantExits() == 0) { + return false; + } - // Remove the redundant loop check after unrolling. - HIf* copy_hif = - helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf(); - int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0; - copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); + if (generate_code) { + // Perform peeling. + PeelUnrollSimpleHelper helper(loop_info); + helper.DoPeeling(); + + // Statically evaluate loop check after peeling for loop invariant condition. + const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap(); + for (auto entry : *hir_map) { + HInstruction* copy = entry.second; + if (copy->IsIf()) { + TryToEvaluateIfCondition(copy->AsIf(), graph_); + } + } + } return true; } -bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) { +bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) { // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests) // as InstructionSet is needed. if (compiler_options_ == nullptr) { @@ -809,35 +819,27 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* nod } HLoopInformation* loop_info = node->loop_info; - // Check 'IsLoopClonable' the last as it might be time-consuming. 
- if (!arch_loop_helper_->IsLoopPeelingEnabled()) { + int64_t trip_count = LoopAnalysis::GetLoopTripCount(loop_info, &induction_range_); + LoopAnalysisInfo analysis_info(loop_info); + LoopAnalysis::CalculateLoopBasicProperties(loop_info, &analysis_info, trip_count); + + if (analysis_info.HasInstructionsPreventingScalarOpts() || + arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&analysis_info)) { return false; } - LoopAnalysisInfo loop_analysis_info(loop_info); - LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); - - // Check "IsLoopClonable" last as it can be time-consuming. - if (loop_analysis_info.HasInstructionsPreventingScalarPeeling() || - arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || - !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) || - !PeelUnrollHelper::IsLoopClonable(loop_info)) { + if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) && + !TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) { return false; } - // Perform peeling. - PeelUnrollSimpleHelper helper(loop_info); - helper.DoPeeling(); - - const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap(); - for (auto entry : *hir_map) { - HInstruction* copy = entry.second; - if (copy->IsIf()) { - TryToEvaluateIfCondition(copy->AsIf(), graph_); - } + // Run 'IsLoopClonable' the last as it might be time-consuming. + if (!PeelUnrollHelper::IsLoopClonable(loop_info)) { + return false; } - return true; + return TryPeelingForLoopInvariantExitsElimination(&analysis_info) || + TryUnrollingForBranchPenaltyReduction(&analysis_info); } // @@ -1076,7 +1078,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, vector_index_, ptc, graph_->GetConstant(induc_type, 1), - kNoUnrollingFactor); + LoopAnalysisInfo::kNoUnrollingFactor); } // Generate vector loop, possibly further unrolled: @@ -1103,7 +1105,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, vector_index_, stc, graph_->GetConstant(induc_type, 1), - kNoUnrollingFactor); + LoopAnalysisInfo::kNoUnrollingFactor); } // Link reductions to their final uses. diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 9743b25259..bc4792458b 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -144,12 +144,19 @@ class HLoopOptimization : public HOptimization { bool OptimizeInnerLoop(LoopNode* node); // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling - // opportunities. Returns whether transformation happened. - bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node); + // opportunities. Returns whether transformation happened. 'generate_code' determines whether the + // optimization should be actually applied. + bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, + bool generate_code = true); // Tries to apply loop peeling for loop invariant exits elimination. Returns whether - // transformation happened. - bool TryPeelingForLoopInvariantExitsElimination(LoopNode* loop_node); + // transformation happened. 'generate_code' determines whether the optimization should be + // actually applied. + bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, + bool generate_code = true); + + // Tries to apply scalar loop peeling and unrolling. + bool TryPeelingAndUnrolling(LoopNode* node); // // Vectorization analysis and synthesis. 
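Note on the loop_optimization changes above: each scalar transformation is now split into a cheap feasibility query and the actual graph mutation. TryPeelingAndUnrolling first invokes the helpers with generate_code set to false, runs the expensive IsLoopClonable check only if one of them reports it would apply, and then calls the helpers again to generate code. A minimal standalone sketch of that dry-run idiom follows; the types and function bodies are simplified stand-ins for illustration, not ART's real LoopAnalysisInfo or HLoopOptimization API.

#include <iostream>

// Simplified stand-in for the analysis results; not the real LoopAnalysisInfo.
struct LoopProperties {
  bool has_invariant_exit;
  bool is_clonable;  // Assumed to be expensive to compute in the real pass.
};

// The same routine serves as a feasibility query (generate_code == false) and
// as the transformation itself (generate_code == true), mirroring the new
// 'generate_code' parameter on the Try* helpers.
bool TryPeeling(const LoopProperties& loop, bool generate_code = true) {
  if (!loop.has_invariant_exit) {
    return false;  // Peeling would not eliminate any exit check; not beneficial.
  }
  if (generate_code) {
    std::cout << "peeling applied" << std::endl;  // Stands in for the graph mutation.
  }
  return true;
}

bool TryPeelingAndUnrolling(const LoopProperties& loop) {
  // Dry run first: bail out before paying for the clonability check.
  if (!TryPeeling(loop, /* generate_code */ false)) {
    return false;
  }
  if (!loop.is_clonable) {  // Run the expensive check last.
    return false;
  }
  return TryPeeling(loop);  // Now actually transform.
}

int main() {
  LoopProperties loop{/* has_invariant_exit */ true, /* is_clonable */ true};
  return TryPeelingAndUnrolling(loop) ? 0 : 1;
}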
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 661f66a34c..50ce7559f5 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -1305,6 +1305,19 @@ void HInstruction::ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* } } +void HInstruction::ReplaceEnvUsesDominatedBy(HInstruction* dominator, HInstruction* replacement) { + const HUseList<HEnvironment*>& uses = GetEnvUses(); + for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) { + HEnvironment* user = it->GetUser(); + size_t index = it->GetIndex(); + // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput(). + ++it; + if (dominator->StrictlyDominates(user->GetHolder())) { + user->ReplaceInput(replacement, index); + } + } +} + void HInstruction::ReplaceInput(HInstruction* replacement, size_t index) { HUserRecord<HInstruction*> input_use = InputRecordAt(index); if (input_use.GetInstruction() == replacement) { @@ -2879,8 +2892,7 @@ void HInvoke::SetIntrinsic(Intrinsics intrinsic, } bool HNewInstance::IsStringAlloc() const { - ScopedObjectAccess soa(Thread::Current()); - return GetReferenceTypeInfo().IsStringClass(); + return GetEntrypoint() == kQuickAllocStringObject; } bool HInvoke::NeedsEnvironment() const { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 825779989c..cd8d07a17a 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -2217,6 +2217,7 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { void ReplaceWith(HInstruction* instruction); void ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); + void ReplaceEnvUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); void ReplaceInput(HInstruction* replacement, size_t index); // This is almost the same as doing `ReplaceWith()`. 
But in this helper, the diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index c5e9a8d036..b4f9993ad6 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation { SetRawInputAt(2, mul_right); } + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc index 142ddb5fbb..3c803ab627 100644 --- a/compiler/optimizing/optimization.cc +++ b/compiler/optimizing/optimization.cc @@ -28,6 +28,7 @@ #endif #ifdef ART_ENABLE_CODEGEN_x86 #include "pc_relative_fixups_x86.h" +#include "instruction_simplifier_x86.h" #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) #include "x86_memory_gen.h" @@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) { #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) case OptimizationPass::kX86MemoryOperandGeneration: return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName; + case OptimizationPass::kInstructionSimplifierX86: + return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; @@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) { #ifdef ART_ENABLE_CODEGEN_x86 X(OptimizationPass::kPcRelativeFixupsX86); X(OptimizationPass::kX86MemoryOperandGeneration); + X(OptimizationPass::kInstructionSimplifierX86); #endif LOG(FATAL) << "Cannot find optimization " << pass_name; UNREACHABLE(); @@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations( DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats); break; + case OptimizationPass::kInstructionSimplifierX86: + DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; + opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats); + break; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 88b283cebf..a9fafa0864 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -101,6 +101,7 @@ enum class OptimizationPass { #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) kX86MemoryOperandGeneration, + kInstructionSimplifierX86, #endif kNone, kLast = kNone diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 84863e4357..f4bafcbef0 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -61,6 +61,7 @@ #include "ssa_builder.h" #include "ssa_liveness_analysis.h" #include "ssa_phi_elimination.h" +#include "stack_map_stream.h" #include "utils/assembler.h" #include "verifier/verifier_compiler_binding.h" @@ -530,7 +531,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), 
OptDef(OptimizationPass::kPcRelativeFixupsX86), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -545,7 +547,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptimizationDef x86_64_optimizations[] = { OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -846,23 +849,23 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* allocator, case kAnalysisSkipped: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledSkipped); - } break; + } case kAnalysisInvalidBytecode: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledInvalidBytecode); - } break; + } case kAnalysisFailThrowCatchLoop: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledThrowCatchLoop); - } break; + } case kAnalysisFailAmbiguousArrayOp: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledAmbiguousArrayOp); - } break; + } case kAnalysisSuccess: UNREACHABLE(); } @@ -1104,14 +1107,35 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item, return compiled_method; } +static void CreateJniStackMap(ArenaStack* arena_stack, + const JniCompiledMethod& jni_compiled_method, + /* out */ ArenaVector<uint8_t>* stack_map, + /* out */ ArenaVector<uint8_t>* method_info) { + ScopedArenaAllocator allocator(arena_stack); + StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet()); + stack_map_stream.BeginMethod( + jni_compiled_method.GetFrameSize(), + jni_compiled_method.GetCoreSpillMask(), + jni_compiled_method.GetFpSpillMask(), + /* num_dex_registers */ 0); + stack_map_stream.EndMethod(); + stack_map->resize(stack_map_stream.PrepareForFillIn()); + method_info->resize(stack_map_stream.ComputeMethodInfoSize()); + stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size())); + stack_map_stream.FillInMethodInfo(MemoryRegion(method_info->data(), method_info->size())); +} + CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, uint32_t method_idx, const DexFile& dex_file, Handle<mirror::DexCache> dex_cache) const { + Runtime* runtime = Runtime::Current(); + ArenaAllocator allocator(runtime->GetArenaPool()); + ArenaStack arena_stack(runtime->GetArenaPool()); + const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions(); if (compiler_options.IsBootImage()) { ScopedObjectAccess soa(Thread::Current()); - Runtime* runtime = Runtime::Current(); ArtMethod* method = runtime->GetClassLinker()->LookupResolvedMethod( method_idx, dex_cache.Get(), /* class_loader */ nullptr); if (method != nullptr && UNLIKELY(method->IsIntrinsic())) { @@ -1126,8 +1150,6 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, access_flags, /* verified_method */ nullptr, dex_cache); - ArenaAllocator allocator(runtime->GetArenaPool()); - ArenaStack arena_stack(runtime->GetArenaPool()); CodeVectorAllocator code_allocator(&allocator); VariableSizedHandleScope handles(soa.Self()); // Go to native so that we don't block GC during compilation. 
@@ -1153,6 +1175,10 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, JniCompiledMethod jni_compiled_method = ArtQuickJniCompileMethod( compiler_options, access_flags, method_idx, dex_file); MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kCompiledNativeStub); + + ArenaVector<uint8_t> stack_map(allocator.Adapter(kArenaAllocStackMaps)); + ArenaVector<uint8_t> method_info(allocator.Adapter(kArenaAllocStackMaps)); + CreateJniStackMap(&arena_stack, jni_compiled_method, &stack_map, &method_info); return CompiledMethod::SwapAllocCompiledMethod( GetCompilerDriver(), jni_compiled_method.GetInstructionSet(), @@ -1160,8 +1186,8 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, jni_compiled_method.GetFrameSize(), jni_compiled_method.GetCoreSpillMask(), jni_compiled_method.GetFpSpillMask(), - /* method_info */ ArrayRef<const uint8_t>(), - /* vmap_table */ ArrayRef<const uint8_t>(), + ArrayRef<const uint8_t>(method_info), + ArrayRef<const uint8_t>(stack_map), jni_compiled_method.GetCfi(), /* patches */ ArrayRef<const linker::LinkerPatch>()); } @@ -1221,18 +1247,42 @@ bool OptimizingCompiler::JitCompile(Thread* self, ScopedNullHandle<mirror::ObjectArray<mirror::Object>> roots; ArenaSet<ArtMethod*, std::less<ArtMethod*>> cha_single_implementation_list( allocator.Adapter(kArenaAllocCHA)); + ArenaVector<uint8_t> stack_map(allocator.Adapter(kArenaAllocStackMaps)); + ArenaVector<uint8_t> method_info(allocator.Adapter(kArenaAllocStackMaps)); + ArenaStack arena_stack(runtime->GetJitArenaPool()); + // StackMapStream is large and it does not fit into this frame, so we need helper method. + // TODO: Try to avoid the extra memory copy that results from this. + CreateJniStackMap(&arena_stack, jni_compiled_method, &stack_map, &method_info); + uint8_t* stack_map_data = nullptr; + uint8_t* method_info_data = nullptr; + uint8_t* roots_data = nullptr; + uint32_t data_size = code_cache->ReserveData(self, + stack_map.size(), + method_info.size(), + /* number_of_roots */ 0, + method, + &stack_map_data, + &method_info_data, + &roots_data); + if (stack_map_data == nullptr || roots_data == nullptr) { + MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kJitOutOfMemoryForCommit); + return false; + } + memcpy(stack_map_data, stack_map.data(), stack_map.size()); + memcpy(method_info_data, method_info.data(), method_info.size()); + const void* code = code_cache->CommitCode( self, method, - /* stack_map_data */ nullptr, - /* method_info_data */ nullptr, - /* roots_data */ nullptr, + stack_map_data, + method_info_data, + roots_data, jni_compiled_method.GetFrameSize(), jni_compiled_method.GetCoreSpillMask(), jni_compiled_method.GetFpSpillMask(), jni_compiled_method.GetCode().data(), jni_compiled_method.GetCode().size(), - /* data_size */ 0u, + data_size, osr, roots, /* has_should_deoptimize_flag */ false, diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index f246228074..9a26f2f6c4 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -50,7 +50,6 @@ enum class MethodCompilationStat { kNotCompiledThrowCatchLoop, kNotCompiledAmbiguousArrayOp, kNotCompiledHugeMethod, - kNotCompiledIrreducibleAndStringInit, kNotCompiledLargeMethodNoBranches, kNotCompiledMalformedOpcode, kNotCompiledNoCodegen, diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h index a627f65ed4..f903f82d50 100644 --- 
a/compiler/optimizing/optimizing_unit_test.h +++ b/compiler/optimizing/optimizing_unit_test.h @@ -29,6 +29,7 @@ #include "dex/dex_instruction.h" #include "dex/standard_dex_file.h" #include "driver/dex_compilation_unit.h" +#include "graph_checker.h" #include "handle_scope-inl.h" #include "mirror/class_loader.h" #include "mirror/dex_cache.h" @@ -129,10 +130,12 @@ class OptimizingUnitTestHelper { // Create the dex file based on the fake data. Call the constructor so that we can use virtual // functions. Don't use the arena for the StandardDexFile otherwise the dex location leaks. dex_files_.emplace_back(new StandardDexFile( - std::make_unique<NonOwningDexFileContainer>(dex_data, sizeof(StandardDexFile::Header)), + dex_data, + sizeof(StandardDexFile::Header), "no_location", /*location_checksum*/ 0, - /*oat_dex_file*/ nullptr)); + /*oat_dex_file*/ nullptr, + /*container*/ nullptr)); return new (allocator) HGraph( allocator, @@ -185,6 +188,77 @@ class OptimizingUnitTestHelper { class OptimizingUnitTest : public CommonCompilerTest, public OptimizingUnitTestHelper {}; +// OptimizingUnitTest with some handy functions to ease the graph creation. +class ImprovedOptimizingUnitTest : public OptimizingUnitTest { + public: + ImprovedOptimizingUnitTest() : graph_(CreateGraph()), + entry_block_(nullptr), + return_block_(nullptr), + exit_block_(nullptr), + parameter_(nullptr) {} + + virtual ~ImprovedOptimizingUnitTest() {} + + void InitGraph() { + entry_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(entry_block_); + graph_->SetEntryBlock(entry_block_); + + return_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(return_block_); + + exit_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(exit_block_); + graph_->SetExitBlock(exit_block_); + + entry_block_->AddSuccessor(return_block_); + return_block_->AddSuccessor(exit_block_); + + parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 0, + DataType::Type::kInt32); + entry_block_->AddInstruction(parameter_); + return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); + exit_block_->AddInstruction(new (GetAllocator()) HExit()); + } + + bool CheckGraph() { + GraphChecker checker(graph_); + checker.Run(); + if (!checker.IsValid()) { + for (const std::string& error : checker.GetErrors()) { + std::cout << error << std::endl; + } + return false; + } + return true; + } + + HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, + ArenaVector<HInstruction*>* current_locals) { + HEnvironment* environment = new (GetAllocator()) HEnvironment( + (GetAllocator()), + current_locals->size(), + graph_->GetArtMethod(), + instruction->GetDexPc(), + instruction); + + environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); + instruction->SetRawEnvironment(environment); + return environment; + } + + protected: + HGraph* graph_; + + HBasicBlock* entry_block_; + HBasicBlock* return_block_; + HBasicBlock* exit_block_; + + HInstruction* parameter_; +}; + // Naive string diff data type. 
typedef std::list<std::pair<std::string, std::string>> diff_t; diff --git a/compiler/optimizing/select_generator.cc b/compiler/optimizing/select_generator.cc index 0d0f7cc748..dcc7f77fc2 100644 --- a/compiler/optimizing/select_generator.cc +++ b/compiler/optimizing/select_generator.cc @@ -45,7 +45,9 @@ static bool IsSimpleBlock(HBasicBlock* block) { HInstruction* instruction = it.Current(); if (instruction->IsControlFlow()) { return instruction->IsGoto() || instruction->IsReturn(); - } else if (instruction->CanBeMoved() && !instruction->HasSideEffects()) { + } else if (instruction->CanBeMoved() && + !instruction->HasSideEffects() && + !instruction->CanThrow()) { if (instruction->IsSelect() && instruction->AsSelect()->GetCondition()->GetBlock() == block) { // Count one HCondition and HSelect in the same block as a single instruction. @@ -119,10 +121,14 @@ bool HSelectGenerator::Run() { // TODO(dbrazdil): This puts an instruction between If and its condition. // Implement moving of conditions to first users if possible. while (!true_block->IsSingleGoto() && !true_block->IsSingleReturn()) { - true_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = true_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } while (!false_block->IsSingleGoto() && !false_block->IsSingleReturn()) { - false_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = false_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } DCHECK(true_block->IsSingleGoto() || true_block->IsSingleReturn()); DCHECK(false_block->IsSingleGoto() || false_block->IsSingleReturn()); diff --git a/compiler/optimizing/select_generator_test.cc b/compiler/optimizing/select_generator_test.cc new file mode 100644 index 0000000000..6e6549737c --- /dev/null +++ b/compiler/optimizing/select_generator_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "select_generator.h" + +#include "base/arena_allocator.h" +#include "builder.h" +#include "nodes.h" +#include "optimizing_unit_test.h" +#include "side_effects_analysis.h" + +namespace art { + +class SelectGeneratorTest : public ImprovedOptimizingUnitTest { + public: + void ConstructBasicGraphForSelect(HInstruction* instr) { + HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* then_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* else_block = new (GetAllocator()) HBasicBlock(graph_); + + graph_->AddBlock(if_block); + graph_->AddBlock(then_block); + graph_->AddBlock(else_block); + + entry_block_->ReplaceSuccessor(return_block_, if_block); + + if_block->AddSuccessor(then_block); + if_block->AddSuccessor(else_block); + then_block->AddSuccessor(return_block_); + else_block->AddSuccessor(return_block_); + + HParameterValue* bool_param = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 1, + DataType::Type::kBool); + entry_block_->AddInstruction(bool_param); + HIntConstant* const1 = graph_->GetIntConstant(1); + + if_block->AddInstruction(new (GetAllocator()) HIf(bool_param)); + + then_block->AddInstruction(instr); + then_block->AddInstruction(new (GetAllocator()) HGoto()); + + else_block->AddInstruction(new (GetAllocator()) HGoto()); + + HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32); + return_block_->AddPhi(phi); + phi->AddInput(instr); + phi->AddInput(const1); + } + + bool CheckGraphAndTrySelectGenerator() { + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + SideEffectsAnalysis side_effects(graph_); + side_effects.Run(); + return HSelectGenerator(graph_, /*handles*/ nullptr, /*stats*/ nullptr).Run(); + } +}; + +// HDivZeroCheck might throw and should not be hoisted from the conditional to an unconditional. +TEST_F(SelectGeneratorTest, testZeroCheck) { + InitGraph(); + HDivZeroCheck* instr = new (GetAllocator()) HDivZeroCheck(parameter_, 0); + ConstructBasicGraphForSelect(instr); + + ArenaVector<HInstruction*> current_locals({parameter_, graph_->GetIntConstant(1)}, + GetAllocator()->Adapter(kArenaAllocInstruction)); + ManuallyBuildEnvFor(instr, ¤t_locals); + + EXPECT_FALSE(CheckGraphAndTrySelectGenerator()); +} + +// Test that SelectGenerator succeeds with HAdd. +TEST_F(SelectGeneratorTest, testAdd) { + InitGraph(); + HAdd* instr = new (GetAllocator()) HAdd(DataType::Type::kInt32, parameter_, parameter_, 0); + ConstructBasicGraphForSelect(instr); + EXPECT_TRUE(CheckGraphAndTrySelectGenerator()); +} + +} // namespace art diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc index dd54468217..dda29a1b4b 100644 --- a/compiler/optimizing/ssa_builder.cc +++ b/compiler/optimizing/ssa_builder.cc @@ -440,6 +440,62 @@ static bool HasAliasInEnvironments(HInstruction* instruction) { return false; } +void SsaBuilder::ReplaceUninitializedStringPhis() { + ScopedArenaHashSet<HInstruction*> seen_instructions( + local_allocator_->Adapter(kArenaAllocGraphBuilder)); + ScopedArenaVector<HInstruction*> worklist(local_allocator_->Adapter(kArenaAllocGraphBuilder)); + + // Iterate over all inputs and uses of the phi, recursively, until all related instructions + // have been visited. 
+ for (const auto& pair : uninitialized_string_phis_) { + HPhi* string_phi = pair.first; + HInvoke* invoke = pair.second; + worklist.push_back(string_phi); + HNewInstance* found_instance = nullptr; + do { + HInstruction* current = worklist.back(); + worklist.pop_back(); + if (seen_instructions.find(current) != seen_instructions.end()) { + continue; + } + seen_instructions.insert(current); + if (current->IsNewInstance()) { + // If it is the first time we see the allocation, replace its uses. We don't register + // it through `RemoveRedundantUninitializedStrings`, as that method makes assumption about + // aliasing and environment uses that don't hold when the string escapes to phis. + // Note that this also means we will keep the (useless) allocation. + if (found_instance == nullptr) { + found_instance = current->AsNewInstance(); + } else { + DCHECK(found_instance == current); + } + } else if (current->IsPhi()) { + // Push all inputs to the worklist. Those should be Phis or NewInstance. + for (HInstruction* input : current->GetInputs()) { + DCHECK(input->IsPhi() || input->IsNewInstance()) << input->DebugName(); + worklist.push_back(input); + } + } else { + // The verifier prevents any other DEX uses of the uninitialized string. + DCHECK(current->IsEqual() || current->IsNotEqual()); + continue; + } + current->ReplaceUsesDominatedBy(invoke, invoke); + current->ReplaceEnvUsesDominatedBy(invoke, invoke); + // Push all users to the worklist. Now that we have replaced + // the uses dominated by the invokes, the remaining users should only + // be Phi, or Equal/NotEqual. + for (const HUseListNode<HInstruction*>& use : current->GetUses()) { + HInstruction* user = use.GetUser(); + DCHECK(user->IsPhi() || user->IsEqual() || user->IsNotEqual()) << user->DebugName(); + worklist.push_back(user); + } + } while (!worklist.empty()); + seen_instructions.clear(); + DCHECK(found_instance != nullptr); + } +} + void SsaBuilder::RemoveRedundantUninitializedStrings() { if (graph_->IsDebuggable()) { // Do not perform the optimization for consistency with the interpreter @@ -488,27 +544,32 @@ void SsaBuilder::RemoveRedundantUninitializedStrings() { GraphAnalysisResult SsaBuilder::BuildSsa() { DCHECK(!graph_->IsInSsaForm()); - // 1) Propagate types of phis. At this point, phis are typed void in the general + // Replace Phis that feed in a String.<init>, as well as their aliases, with + // the actual String allocation invocation. We do this first, as the phis stored in + // the data structure might get removed from the graph in later stages during `BuildSsa`. + ReplaceUninitializedStringPhis(); + + // Propagate types of phis. At this point, phis are typed void in the general // case, or float/double/reference if we created an equivalent phi. So we need // to propagate the types across phis to give them a correct type. If a type // conflict is detected in this stage, the phi is marked dead. RunPrimitiveTypePropagation(); - // 2) Now that the correct primitive types have been assigned, we can get rid + // Now that the correct primitive types have been assigned, we can get rid // of redundant phis. Note that we cannot do this phase before type propagation, // otherwise we could get rid of phi equivalents, whose presence is a requirement // for the type propagation phase. Note that this is to satisfy statement (a) // of the SsaBuilder (see ssa_builder.h). SsaRedundantPhiElimination(graph_).Run(); - // 3) Fix the type for null constants which are part of an equality comparison. 
+ // Fix the type for null constants which are part of an equality comparison. // We need to do this after redundant phi elimination, to ensure the only cases // that we can see are reference comparison against 0. The redundant phi // elimination ensures we do not see a phi taking two 0 constants in a HEqual // or HNotEqual. FixNullConstantType(); - // 4) Compute type of reference type instructions. The pass assumes that + // Compute type of reference type instructions. The pass assumes that // NullConstant has been fixed up. ReferenceTypePropagation(graph_, class_loader_, @@ -516,7 +577,7 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { handles_, /* is_first_run */ true).Run(); - // 5) HInstructionBuilder duplicated ArrayGet instructions with ambiguous type + // HInstructionBuilder duplicated ArrayGet instructions with ambiguous type // (int/float or long/double) and marked ArraySets with ambiguous input type. // Now that RTP computed the type of the array input, the ambiguity can be // resolved and the correct equivalents kept. @@ -524,13 +585,13 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { return kAnalysisFailAmbiguousArrayOp; } - // 6) Mark dead phis. This will mark phis which are not used by instructions + // Mark dead phis. This will mark phis which are not used by instructions // or other live phis. If compiling as debuggable code, phis will also be kept // live if they have an environment use. SsaDeadPhiElimination dead_phi_elimimation(graph_); dead_phi_elimimation.MarkDeadPhis(); - // 7) Make sure environments use the right phi equivalent: a phi marked dead + // Make sure environments use the right phi equivalent: a phi marked dead // can have a phi equivalent that is not dead. In that case we have to replace // it with the live equivalent because deoptimization and try/catch rely on // environments containing values of all live vregs at that point. Note that @@ -539,14 +600,14 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { // environments to just reference one. FixEnvironmentPhis(); - // 8) Now that the right phis are used for the environments, we can eliminate + // Now that the right phis are used for the environments, we can eliminate // phis we do not need. Regardless of the debuggable status, this phase is /// necessary for statement (b) of the SsaBuilder (see ssa_builder.h), as well // as for the code generation, which does not deal with phis of conflicting // input types. dead_phi_elimimation.EliminateDeadPhis(); - // 9) HInstructionBuidler replaced uses of NewInstances of String with the + // HInstructionBuidler replaced uses of NewInstances of String with the // results of their corresponding StringFactory calls. Unless the String // objects are used before they are initialized, they can be replaced with // NullConstant. 
Note that this optimization is valid only if unsimplified diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h index 60831a9e6a..765544508e 100644 --- a/compiler/optimizing/ssa_builder.h +++ b/compiler/optimizing/ssa_builder.h @@ -61,7 +61,8 @@ class SsaBuilder : public ValueObject { local_allocator_(local_allocator), ambiguous_agets_(local_allocator->Adapter(kArenaAllocGraphBuilder)), ambiguous_asets_(local_allocator->Adapter(kArenaAllocGraphBuilder)), - uninitialized_strings_(local_allocator->Adapter(kArenaAllocGraphBuilder)) { + uninitialized_strings_(local_allocator->Adapter(kArenaAllocGraphBuilder)), + uninitialized_string_phis_(local_allocator->Adapter(kArenaAllocGraphBuilder)) { graph_->InitializeInexactObjectRTI(handles); } @@ -96,6 +97,10 @@ class SsaBuilder : public ValueObject { } } + void AddUninitializedStringPhi(HPhi* phi, HInvoke* invoke) { + uninitialized_string_phis_.push_back(std::make_pair(phi, invoke)); + } + private: void SetLoopHeaderPhiInputs(); void FixEnvironmentPhis(); @@ -118,6 +123,7 @@ class SsaBuilder : public ValueObject { HArrayGet* GetFloatOrDoubleEquivalentOfArrayGet(HArrayGet* aget); void RemoveRedundantUninitializedStrings(); + void ReplaceUninitializedStringPhis(); HGraph* const graph_; Handle<mirror::ClassLoader> class_loader_; @@ -131,6 +137,7 @@ class SsaBuilder : public ValueObject { ScopedArenaVector<HArrayGet*> ambiguous_agets_; ScopedArenaVector<HArraySet*> ambiguous_asets_; ScopedArenaVector<HNewInstance*> uninitialized_strings_; + ScopedArenaVector<std::pair<HPhi*, HInvoke*>> uninitialized_string_phis_; DISALLOW_COPY_AND_ASSIGN(SsaBuilder); }; diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc index 5d361953ba..3e1a36dc9b 100644 --- a/compiler/optimizing/stack_map_stream.cc +++ b/compiler/optimizing/stack_map_stream.cc @@ -151,7 +151,7 @@ void StackMapStream::EndStackMapEntry() { StackMap stack_map = code_info.GetStackMapAt(stack_map_index); CHECK_EQ(stack_map.HasDexRegisterMap(), (num_dex_registers != 0)); CHECK_EQ(stack_map.HasInlineInfo(), (inlining_depth != 0)); - CHECK_EQ(code_info.GetInlineDepthOf(stack_map), inlining_depth); + CHECK_EQ(code_info.GetInlineInfosOf(stack_map).size(), inlining_depth); }); } } @@ -209,7 +209,7 @@ void StackMapStream::BeginInlineInfoEntry(ArtMethod* method, size_t depth = current_inline_infos_.size() - 1; dchecks_.emplace_back([=](const CodeInfo& code_info) { StackMap stack_map = code_info.GetStackMapAt(stack_map_index); - InlineInfo inline_info = code_info.GetInlineInfoAtDepth(stack_map, depth); + InlineInfo inline_info = code_info.GetInlineInfosOf(stack_map)[depth]; CHECK_EQ(inline_info.GetDexPc(), dex_pc); bool encode_art_method = EncodeArtMethodInInlineInfo(method); CHECK_EQ(inline_info.EncodesArtMethod(), encode_art_method); @@ -275,7 +275,6 @@ void StackMapStream::CreateDexRegisterMap() { if (kVerifyStackMaps) { size_t stack_map_index = stack_maps_.size(); - uint32_t depth = current_inline_infos_.size(); // We need to make copy of the current registers for later (when the check is run). 
auto expected_dex_registers = std::make_shared<dchecked_vector<DexRegisterLocation>>( current_dex_registers_.begin(), current_dex_registers_.end()); @@ -285,8 +284,9 @@ void StackMapStream::CreateDexRegisterMap() { for (DexRegisterLocation reg : code_info.GetDexRegisterMapOf(stack_map)) { CHECK_EQ((*expected_dex_registers)[expected_reg++], reg); } - for (uint32_t d = 0; d < depth; d++) { - for (DexRegisterLocation reg : code_info.GetDexRegisterMapAtDepth(d, stack_map)) { + for (InlineInfo inline_info : code_info.GetInlineInfosOf(stack_map)) { + DexRegisterMap map = code_info.GetInlineDexRegisterMapOf(stack_map, inline_info); + for (DexRegisterLocation reg : map) { CHECK_EQ((*expected_dex_registers)[expected_reg++], reg); } } diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc index 6241e0c25a..9ed90a4839 100644 --- a/compiler/optimizing/stack_map_test.cc +++ b/compiler/optimizing/stack_map_test.cc @@ -193,13 +193,12 @@ TEST(StackMapTest, Test2) { ASSERT_EQ(-2, location1.GetValue()); ASSERT_TRUE(stack_map.HasInlineInfo()); - InlineInfo inline_info0 = code_info.GetInlineInfoAtDepth(stack_map, 0); - InlineInfo inline_info1 = code_info.GetInlineInfoAtDepth(stack_map, 1); - ASSERT_EQ(2u, code_info.GetInlineDepthOf(stack_map)); - ASSERT_EQ(3u, inline_info0.GetDexPc()); - ASSERT_EQ(2u, inline_info1.GetDexPc()); - ASSERT_TRUE(inline_info0.EncodesArtMethod()); - ASSERT_TRUE(inline_info1.EncodesArtMethod()); + auto inline_infos = code_info.GetInlineInfosOf(stack_map); + ASSERT_EQ(2u, inline_infos.size()); + ASSERT_EQ(3u, inline_infos[0].GetDexPc()); + ASSERT_EQ(2u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); } // Second stack map. @@ -614,19 +613,18 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(0, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(4, dex_registers0[1].GetConstant()); - InlineInfo if0_0 = ci.GetInlineInfoAtDepth(sm0, 0); - InlineInfo if0_1 = ci.GetInlineInfoAtDepth(sm0, 1); - ASSERT_EQ(2u, ci.GetInlineDepthOf(sm0)); - ASSERT_EQ(2u, if0_0.GetDexPc()); - ASSERT_TRUE(if0_0.EncodesArtMethod()); - ASSERT_EQ(3u, if0_1.GetDexPc()); - ASSERT_TRUE(if0_1.EncodesArtMethod()); + auto inline_infos = ci.GetInlineInfosOf(sm0); + ASSERT_EQ(2u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(3u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(0, sm0); + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm0, inline_infos[0]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(8, dex_registers1[0].GetStackOffsetInBytes()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(1, sm0); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm0, inline_infos[1]); ASSERT_EQ(3u, dex_registers2.size()); ASSERT_EQ(16, dex_registers2[0].GetStackOffsetInBytes()); ASSERT_EQ(20, dex_registers2[1].GetConstant()); @@ -642,22 +640,20 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(56, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(0, dex_registers0[1].GetConstant()); - InlineInfo if1_0 = ci.GetInlineInfoAtDepth(sm1, 0); - InlineInfo if1_1 = ci.GetInlineInfoAtDepth(sm1, 1); - InlineInfo if1_2 = ci.GetInlineInfoAtDepth(sm1, 2); - ASSERT_EQ(3u, ci.GetInlineDepthOf(sm1)); - ASSERT_EQ(2u, if1_0.GetDexPc()); - ASSERT_TRUE(if1_0.EncodesArtMethod()); - ASSERT_EQ(3u, if1_1.GetDexPc()); - 
ASSERT_TRUE(if1_1.EncodesArtMethod()); - ASSERT_EQ(5u, if1_2.GetDexPc()); - ASSERT_TRUE(if1_2.EncodesArtMethod()); - - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(0, sm1); + auto inline_infos = ci.GetInlineInfosOf(sm1); + ASSERT_EQ(3u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(3u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); + ASSERT_EQ(5u, inline_infos[2].GetDexPc()); + ASSERT_TRUE(inline_infos[2].EncodesArtMethod()); + + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm1, inline_infos[0]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(12, dex_registers1[0].GetStackOffsetInBytes()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(1, sm1); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm1, inline_infos[1]); ASSERT_EQ(3u, dex_registers2.size()); ASSERT_EQ(80, dex_registers2[0].GetStackOffsetInBytes()); ASSERT_EQ(10, dex_registers2[1].GetConstant()); @@ -684,22 +680,20 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(56, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(0, dex_registers0[1].GetConstant()); - InlineInfo if2_0 = ci.GetInlineInfoAtDepth(sm3, 0); - InlineInfo if2_1 = ci.GetInlineInfoAtDepth(sm3, 1); - InlineInfo if2_2 = ci.GetInlineInfoAtDepth(sm3, 2); - ASSERT_EQ(3u, ci.GetInlineDepthOf(sm3)); - ASSERT_EQ(2u, if2_0.GetDexPc()); - ASSERT_TRUE(if2_0.EncodesArtMethod()); - ASSERT_EQ(5u, if2_1.GetDexPc()); - ASSERT_TRUE(if2_1.EncodesArtMethod()); - ASSERT_EQ(10u, if2_2.GetDexPc()); - ASSERT_TRUE(if2_2.EncodesArtMethod()); - - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(1, sm3); + auto inline_infos = ci.GetInlineInfosOf(sm3); + ASSERT_EQ(3u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(5u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); + ASSERT_EQ(10u, inline_infos[2].GetDexPc()); + ASSERT_TRUE(inline_infos[2].EncodesArtMethod()); + + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm3, inline_infos[1]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(2, dex_registers1[0].GetMachineRegister()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(2, sm3); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm3, inline_infos[2]); ASSERT_EQ(2u, dex_registers2.size()); ASSERT_FALSE(dex_registers2[0].IsLive()); ASSERT_EQ(3, dex_registers2[1].GetMachineRegister()); diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc index 6f3bcdac47..31114b6dcc 100644 --- a/compiler/optimizing/superblock_cloner_test.cc +++ b/compiler/optimizing/superblock_cloner_test.cc @@ -30,38 +30,8 @@ using HEdgeSet = SuperblockCloner::HEdgeSet; // This class provides methods and helpers for testing various cloning and copying routines: // individual instruction cloning and cloning of the more coarse-grain structures. 
-class SuperblockClonerTest : public OptimizingUnitTest { +class SuperblockClonerTest : public ImprovedOptimizingUnitTest { public: - SuperblockClonerTest() : graph_(CreateGraph()), - entry_block_(nullptr), - return_block_(nullptr), - exit_block_(nullptr), - parameter_(nullptr) {} - - void InitGraph() { - entry_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(entry_block_); - graph_->SetEntryBlock(entry_block_); - - return_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(return_block_); - - exit_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(exit_block_); - graph_->SetExitBlock(exit_block_); - - entry_block_->AddSuccessor(return_block_); - return_block_->AddSuccessor(exit_block_); - - parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), - dex::TypeIndex(0), - 0, - DataType::Type::kInt32); - entry_block_->AddInstruction(parameter_); - return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); - exit_block_->AddInstruction(new (GetAllocator()) HExit()); - } - void CreateBasicLoopControlFlow(HBasicBlock* position, HBasicBlock* successor, /* out */ HBasicBlock** header_p, @@ -137,40 +107,6 @@ class SuperblockClonerTest : public OptimizingUnitTest { null_check->CopyEnvironmentFrom(env); bounds_check->CopyEnvironmentFrom(env); } - - HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, - ArenaVector<HInstruction*>* current_locals) { - HEnvironment* environment = new (GetAllocator()) HEnvironment( - (GetAllocator()), - current_locals->size(), - graph_->GetArtMethod(), - instruction->GetDexPc(), - instruction); - - environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); - instruction->SetRawEnvironment(environment); - return environment; - } - - bool CheckGraph() { - GraphChecker checker(graph_); - checker.Run(); - if (!checker.IsValid()) { - for (const std::string& error : checker.GetErrors()) { - std::cout << error << std::endl; - } - return false; - } - return true; - } - - HGraph* graph_; - - HBasicBlock* entry_block_; - HBasicBlock* return_block_; - HBasicBlock* exit_block_; - - HInstruction* parameter_; }; TEST_F(SuperblockClonerTest, IndividualInstrCloner) { diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc index 2c428fac7e..c6c764e3a9 100644 --- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc +++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc @@ -120,11 +120,10 @@ void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - ArmManagedRegister reg = entry_spills.at(i).AsArm(); + for (const ManagedRegisterSpill& spill : entry_spills) { + ArmManagedRegister reg = spill.AsArm(); if (reg.IsNoRegister()) { // only increment stack offset. 
- ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { asm_.StoreToOffset(kStoreWord, AsVIXLRegister(reg), sp, offset); diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc index a5aa1c12b3..d6ce03387c 100644 --- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc +++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc @@ -719,11 +719,10 @@ void Arm64JNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills int32_t offset = frame_size + static_cast<size_t>(kArm64PointerSize); - for (size_t i = 0; i < entry_spills.size(); ++i) { - Arm64ManagedRegister reg = entry_spills.at(i).AsArm64(); + for (const ManagedRegisterSpill& spill : entry_spills) { + Arm64ManagedRegister reg = spill.AsArm64(); if (reg.IsNoRegister()) { // only increment stack offset. - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsXRegister()) { StoreToOffset(reg.AsXRegister(), SP, offset); diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc index e76e98a2a3..85e4326494 100644 --- a/compiler/utils/assembler_thumb_test_expected.cc.inc +++ b/compiler/utils/assembler_thumb_test_expected.cc.inc @@ -153,7 +153,7 @@ const char* const VixlJniHelpersResults[] = { " 21c: f8d9 8034 ldr.w r8, [r9, #52] ; 0x34\n", " 220: 4770 bx lr\n", " 222: 4660 mov r0, ip\n", - " 224: f8d9 c2d0 ldr.w ip, [r9, #720] ; 0x2d0\n", + " 224: f8d9 c2d4 ldr.w ip, [r9, #724] ; 0x2d4\n", " 228: 47e0 blx ip\n", nullptr }; diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h index 2b7b2aa7ce..db9c36cc75 100644 --- a/compiler/utils/managed_register.h +++ b/compiler/utils/managed_register.h @@ -101,11 +101,11 @@ class ManagedRegisterSpill : public ManagedRegister { ManagedRegisterSpill(const ManagedRegister& other, int32_t size) : ManagedRegister(other), size_(size), spill_offset_(-1) { } - int32_t getSpillOffset() { + int32_t getSpillOffset() const { return spill_offset_; } - int32_t getSize() { + int32_t getSize() const { return size_; } diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc index dce5b95fec..c0b6f988d4 100644 --- a/compiler/utils/mips/assembler_mips.cc +++ b/compiler/utils/mips/assembler_mips.cc @@ -4801,10 +4801,9 @@ void MipsAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - MipsManagedRegister reg = entry_spills.at(i).AsMips(); + for (const ManagedRegisterSpill& spill : entry_spills) { + MipsManagedRegister reg = spill.AsMips(); if (reg.IsNoRegister()) { - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset); diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index bb1bb82fa5..5b1c5d9e01 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -3633,9 +3633,8 @@ void Mips64Assembler::BuildFrame(size_t frame_size, // Write out entry spills. 
int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - Mips64ManagedRegister reg = entry_spills[i].AsMips64(); - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { + Mips64ManagedRegister reg = spill.AsMips64(); int32_t size = spill.getSize(); if (reg.IsNoRegister()) { // only increment stack offset. diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 86f9010ea3..c2ce03b1f2 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -525,6 +525,58 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) { EmitOperand(dst, src); } +void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + void X86Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); @@ -2898,6 +2950,99 @@ void X86Assembler::EmitLabelLink(NearLabel* label) { } +uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm ) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. 
+ vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + return vex_prefix; +} + +uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) { + uint8_t vex_prefix = 0; + // VEX Byte 2. + if (w) { + vex_prefix |= 0x80; + } + + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + Register vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} + void X86Assembler::EmitGenericShift(int reg_or_opcode, const Operand& operand, const Immediate& imm) { diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index e42c4c986a..8c9ce82687 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -397,6 +397,12 @@ class X86Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // FMA Mac Instructions + void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -834,6 +840,11 @@ class X86Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix + uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp); + void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm); void EmitGenericShift(int rm, const Operand& operand, Register shifter); diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc index 7e29c4aa26..dd99f03aa7 100644 --- a/compiler/utils/x86/jni_macro_assembler_x86.cc +++ b/compiler/utils/x86/jni_macro_assembler_x86.cc @@ -67,8 +67,7 @@ void X86JNIMacroAssembler::BuildFrame(size_t frame_size, cfi().AdjustCFAOffset(kFramePointerSize); DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), frame_size); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86().IsCpuRegister()) { int offset = frame_size + spill.getSpillOffset(); __ movl(Address(ESP, offset), spill.AsX86().AsCpuRegister()); diff --git 
a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index bd31561937..9983eaeeea 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -603,6 +603,56 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) { } +void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + + +void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -3544,6 +3594,98 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) { label->LinkTo(position); } +uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + + return vex_prefix; +} + +uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) { + // VEX Byte 2. + uint8_t vex_prefix = 0; + if (w) { + vex_prefix |= 0x80; + } + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + CpuRegister vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} void X86_64Assembler::EmitGenericShift(bool wide, int reg_or_opcode, diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index e4d72a7ba2..d5779aa786 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -436,6 +436,16 @@ class X86_64Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // Mac Instructions + // For reference look at the Instruction reference volume 2C. + // The below URL is broken down in two lines. + // https://www.intel.com/content/www/us/en/architecture-and-technology/ + // 64-ia-32-architectures-software-developer-vol-2c-manual.html + void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -921,6 +931,11 @@ class X86_64Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix. 
+ uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp); + void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm); void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter); diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc index 9486cb44c5..f6b2f9df34 100644 --- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc +++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc @@ -75,8 +75,7 @@ void X86_64JNIMacroAssembler::BuildFrame(size_t frame_size, __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister()); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86_64().IsCpuRegister()) { if (spill.getSize() == 8) { __ movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), |