24 files changed, 819 insertions, 98 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 11521e68d0..e1d382f6f4 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -161,6 +161,7 @@ art_cc_defaults {
                 "utils/x86/assembler_x86.cc",
                 "utils/x86/jni_macro_assembler_x86.cc",
                 "utils/x86/managed_register_x86.cc",
+                "optimizing/instruction_simplifier_x86.cc",
             ],
         },
         x86_64: {
@@ -346,6 +347,7 @@ art_cc_test {
         "optimizing/parallel_move_test.cc",
         "optimizing/pretty_printer_test.cc",
         "optimizing/reference_type_propagation_test.cc",
+        "optimizing/select_generator_test.cc",
         "optimizing/side_effects_test.cc",
         "optimizing/ssa_liveness_analysis_test.cc",
         "optimizing/ssa_test.cc",
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 086ae07a06..58808769e2 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in
   }
 }
 
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  // All three operands are requested in FPU registers; the output reuses the accumulator.
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      // VecMultiplyAccumulate is supported only for single and double precision
+      // floating points. Hence integral types are still not converted.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  // TODO: pmaddwd?
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));  // Result is accumulated in place.
+  XmmRegister accumulator = locations->InAt(
+      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_left = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_right = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231ps(accumulator, mul_left, mul_right);  // kAdd: fused multiply-add.
+      } else {
+        __ vfmsub231ps(accumulator, mul_left, mul_right);  // Otherwise: fused multiply-sub.
+      }
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231pd(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231pd(accumulator, mul_left, mul_right);
+      }
+      break;
+    default:
+      // VecMultiplyAccumulate is supported only for single and double precision
+      // floating points. Hence integral types are still not converted.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
 void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4d31ab68d1..4795e86933 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in
   }
 }
 
-void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* summary = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+  const DataType::Type packed_type = instr->GetPackedType();
+  if (packed_type == DataType::Type::kFloat32 || packed_type == DataType::Type::kFloat64) {
+    // The accumulator and both multiplicands are requested in FPU registers.
+    summary->SetInAt(
+        HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+    summary->SetInAt(
+        HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+    summary->SetInAt(
+        HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+    // The result overwrites the first input, which therefore has to be the accumulator.
+    DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+    summary->SetOut(Location::SameAsFirstInput());
+  } else {
+    // VecMultiplyAccumulate is supported only for single and
+    // double precision floating points. Hence integral types
+    // are still not converted.
+    LOG(FATAL) << "Unsupported SIMD type";
+  }
 }
 
-void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  // TODO: pmaddwd?
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));  // Result is accumulated in place.
+  XmmRegister accumulator = locations->InAt(
+      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_left = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_right = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231ps(accumulator, mul_left, mul_right);  // kAdd: fused multiply-add.
+      } else {
+        __ vfmsub231ps(accumulator, mul_left, mul_right);  // Otherwise: fused multiply-sub.
+      }
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231pd(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231pd(accumulator, mul_left, mul_right);
+      }
+      break;
+    default:
+      // VecMultiplyAccumulate is supported only for single and double precision
+      // floating points. Hence integral types are still not converted.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
 void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc
new file mode 100644
index 0000000000..b3f67d6e84
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.cc
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "instruction_simplifier_x86.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "mirror/array-inl.h"
+#include "code_generator.h"
+
+
+namespace art {
+
+namespace x86 {
+
+// Visitor that tries to fuse each HVecMul with its single HVecAdd/HVecSub user.
+class InstructionSimplifierX86Visitor : public HGraphVisitor {
+ public:
+  InstructionSimplifierX86Visitor(HGraph* graph,
+                                  CodeGeneratorX86* codegen,
+                                  OptimizingCompilerStats* stats)
+      : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {}
+
+ private:
+  void RecordSimplification() {
+    MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch);
+  }
+
+  // NOTE(review): AVX2 is the only gate for emitting FMA instructions; confirm it implies FMA.
+  bool HasCpuFeatureFlag() const {
+    return codegen_->GetInstructionSetFeatures().HasAVX2();
+  }
+
+  // This simplifier uses a special-purpose BB visitor.
+  // (1) No need to visit Phi nodes.
+  // (2) Since statements can be removed in a "forward" fashion,
+  //     the visitor should test if each statement is still there.
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
+    // TODO: fragile iteration, provide more robust iterators?
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      HInstruction* instruction = it.Current();
+      if (instruction->IsInBlock()) {
+        instruction->Accept(this);
+      }
+    }
+  }
+
+  bool TryGenerateVecMultiplyAccumulate(HVecMul* mul);
+  void VisitVecMul(HVecMul* instruction) OVERRIDE;
+
+  CodeGeneratorX86* codegen_;
+  OptimizingCompilerStats* stats_;
+};
+
+/* Generic expressions for FMA:
+ *   a = (b * c) + a
+ *   a = (b * c) - a
+ */
+bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) {
+  // Only single and double precision floating point multiplies are fused.
+  const DataType::Type packed_type = mul->GetPackedType();
+  if (packed_type != DataType::Type::kFloat32 && packed_type != DataType::Type::kFloat64) {
+    return false;
+  }
+  // Note that we do not want to (unconditionally) perform the merge when the
+  // multiplication has multiple uses and it can be merged in all of them.
+  // Multiple uses could happen on the same control-flow path, and we would
+  // then increase the amount of work. In the future we could try to evaluate
+  // whether all uses are on different control-flow paths (using dominance and
+  // reverse-dominance information) and only perform the merge when they are.
+  if (!mul->HasOnlyOneNonEnvironmentUse()) {
+    return false;
+  }
+  HInstruction* user = mul->GetUses().front().GetUser();
+  if (!user->IsVecAdd() && !user->IsVecSub()) {
+    return false;
+  }
+  // Replace code looking like
+  //    VECMUL tmp, x, y
+  //    VECADD dst, acc, tmp or VECADD dst, tmp, acc
+  //      or
+  //    VECSUB dst, tmp, acc
+  // with
+  //    VECMULACC dst, acc, x, y
+  HVecBinaryOperation* binop = user->AsVecBinaryOperation();
+  HInstruction* lhs = binop->GetLeft();
+  HInstruction* rhs = binop->GetRight();
+  DCHECK_NE(lhs, rhs);
+  HInstruction* accumulator = nullptr;
+  if (user->IsVecSub()) {
+    // Only `tmp - acc` is fused; for `acc - tmp` the accumulator stays null.
+    if (lhs == mul) {
+      accumulator = rhs;
+    }
+  } else if (rhs == mul) {
+    accumulator = lhs;
+  } else {
+    DCHECK_EQ(lhs, mul);
+    accumulator = rhs;
+  }
+  if (accumulator == nullptr) {
+    return false;
+  }
+  ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
+  const HInstruction::InstructionKind kind =
+      user->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+  HVecMultiplyAccumulate* mulacc =
+      new (allocator) HVecMultiplyAccumulate(allocator,
+                                             kind,
+                                             accumulator,
+                                             mul->GetLeft(),
+                                             mul->GetRight(),
+                                             binop->GetPackedType(),
+                                             binop->GetVectorLength(),
+                                             binop->GetDexPc());
+  // The fused node takes the place of the add/sub; the multiply is then unused.
+  binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+  DCHECK(!mul->HasUses());
+  mul->GetBlock()->RemoveInstruction(mul);
+  return true;
+}
+
+void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) {
+  // The fusion is attempted only when the CPU feature check passes.
+  const bool fused = HasCpuFeatureFlag() && TryGenerateVecMultiplyAccumulate(instruction);
+  if (fused) {
+    RecordSimplification();
+  }
+}
+
+bool InstructionSimplifierX86::Run() {
+  // Visit the graph in reverse post order; this pass always returns true.
+  InstructionSimplifierX86Visitor(graph_, codegen_, stats_).VisitReversePostOrder();
+  return true;
+}
+
+}  // namespace x86
+}  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h
new file mode 100644
index 0000000000..1fb199f728
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+
+#include "nodes.h"
+#include "optimization.h"
+#include "code_generator_x86.h"
+
+namespace art {
+namespace x86 {
+
+class InstructionSimplifierX86 : public HOptimization {  // Arch-specific simplifier pass.
+ public:
+  InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats)
+      : HOptimization(graph, kInstructionSimplifierX86PassName, stats),
+        codegen_(down_cast<CodeGeneratorX86*>(codegen)) {}
+
+  static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86";
+
+  bool Run() OVERRIDE;  // Defined in instruction_simplifier_x86.cc.
+
+ private:
+  CodeGeneratorX86* codegen_;  // NOTE(review): also scheduled for x86-64; is the down_cast valid?
+};
+
+}  // namespace x86
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e9a8d036..b4f9993ad6 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation {
     SetRawInputAt(2, mul_right);
   }
 
+  static constexpr int kInputAccumulatorIndex = 0;
+  static constexpr int kInputMulLeftIndex = 1;
+  static constexpr int kInputMulRightIndex = 2;
+
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc
index 142ddb5fbb..3c803ab627 100644
--- a/compiler/optimizing/optimization.cc
+++ b/compiler/optimizing/optimization.cc
@@ -28,6 +28,7 @@
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
 #include "pc_relative_fixups_x86.h"
+#include "instruction_simplifier_x86.h"
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "x86_memory_gen.h"
@@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) {
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
     case OptimizationPass::kX86MemoryOperandGeneration:
       return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName;
+    case OptimizationPass::kInstructionSimplifierX86:
+      return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName;
 #endif
     case OptimizationPass::kNone:
       LOG(FATAL) << "kNone does not represent an actual pass";
@@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) {
 #ifdef ART_ENABLE_CODEGEN_x86
   X(OptimizationPass::kPcRelativeFixupsX86);
   X(OptimizationPass::kX86MemoryOperandGeneration);
+  X(OptimizationPass::kInstructionSimplifierX86);
 #endif
   LOG(FATAL) << "Cannot find optimization " << pass_name;
   UNREACHABLE();
@@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations(
         DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
         opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats);
         break;
+      case OptimizationPass::kInstructionSimplifierX86:
+        DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
+        opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats);
+        break;
 #endif
       case OptimizationPass::kNone:
         LOG(FATAL) << "kNone does not represent an actual pass";
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index 88b283cebf..a9fafa0864 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -101,6 +101,7 @@ enum class OptimizationPass {
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
   kX86MemoryOperandGeneration,
+  kInstructionSimplifierX86,
 #endif
   kNone,
   kLast = kNone
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 84863e4357..bb33ba3564 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -530,7 +530,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph,
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
         OptDef(OptimizationPass::kPcRelativeFixupsX86),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+        OptDef(OptimizationPass::kInstructionSimplifierX86)
       };
       return RunOptimizations(graph,
                               codegen,
@@ -545,7 +546,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph,
       OptimizationDef x86_64_optimizations[] = {
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+        OptDef(OptimizationPass::kInstructionSimplifierX86)
       };
       return RunOptimizations(graph,
                               codegen,
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index a627f65ed4..e44d7b82e5 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -29,6 +29,7 @@
 #include "dex/dex_instruction.h"
 #include "dex/standard_dex_file.h"
 #include "driver/dex_compilation_unit.h"
+#include "graph_checker.h"
 #include "handle_scope-inl.h"
 #include "mirror/class_loader.h"
 #include "mirror/dex_cache.h"
@@ -185,6 +186,77 @@ class OptimizingUnitTestHelper {
 
 class OptimizingUnitTest : public CommonCompilerTest, public OptimizingUnitTestHelper {};
 
+// OptimizingUnitTest with some handy functions to ease the graph creation.
+class ImprovedOptimizingUnitTest : public OptimizingUnitTest {
+ public:
+  ImprovedOptimizingUnitTest() : graph_(CreateGraph()),
+                                 entry_block_(nullptr),
+                                 return_block_(nullptr),
+                                 exit_block_(nullptr),
+                                 parameter_(nullptr) {}
+
+  virtual ~ImprovedOptimizingUnitTest() {}
+
+  void InitGraph() {  // Builds entry -> return -> exit and adds one int32 parameter.
+    entry_block_ = new (GetAllocator()) HBasicBlock(graph_);
+    graph_->AddBlock(entry_block_);
+    graph_->SetEntryBlock(entry_block_);
+
+    return_block_ = new (GetAllocator()) HBasicBlock(graph_);
+    graph_->AddBlock(return_block_);
+
+    exit_block_ = new (GetAllocator()) HBasicBlock(graph_);
+    graph_->AddBlock(exit_block_);
+    graph_->SetExitBlock(exit_block_);
+
+    entry_block_->AddSuccessor(return_block_);
+    return_block_->AddSuccessor(exit_block_);
+
+    parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
+                                                      dex::TypeIndex(0),
+                                                      0,
+                                                      DataType::Type::kInt32);
+    entry_block_->AddInstruction(parameter_);
+    return_block_->AddInstruction(new (GetAllocator()) HReturnVoid());
+    exit_block_->AddInstruction(new (GetAllocator()) HExit());
+  }
+
+  bool CheckGraph() {  // Runs GraphChecker; prints each error and returns false if invalid.
+    GraphChecker checker(graph_);
+    checker.Run();
+    if (!checker.IsValid()) {
+      for (const std::string& error : checker.GetErrors()) {
+        std::cout << error << std::endl;
+      }
+      return false;
+    }
+    return true;
+  }
+
+  HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction,
+                                    ArenaVector<HInstruction*>* current_locals) {
+    HEnvironment* environment = new (GetAllocator()) HEnvironment(
+        (GetAllocator()),
+        current_locals->size(),
+        graph_->GetArtMethod(),
+        instruction->GetDexPc(),
+        instruction);
+
+    environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals));  // Copy the locals.
+    instruction->SetRawEnvironment(environment);  // Attach the environment to `instruction`.
+    return environment;
+  }
+
+ protected:
+  HGraph* graph_;  // Created in the constructor via CreateGraph().
+
+  HBasicBlock* entry_block_;  // The blocks and the parameter stay null until InitGraph().
+  HBasicBlock* return_block_;
+  HBasicBlock* exit_block_;
+
+  HInstruction* parameter_;
+};
+
 // Naive string diff data type.
 typedef std::list<std::pair<std::string, std::string>> diff_t;
 
diff --git a/compiler/optimizing/select_generator.cc b/compiler/optimizing/select_generator.cc
index 0d0f7cc748..dcc7f77fc2 100644
--- a/compiler/optimizing/select_generator.cc
+++ b/compiler/optimizing/select_generator.cc
@@ -45,7 +45,9 @@ static bool IsSimpleBlock(HBasicBlock* block) {
     HInstruction* instruction = it.Current();
     if (instruction->IsControlFlow()) {
       return instruction->IsGoto() || instruction->IsReturn();
-    } else if (instruction->CanBeMoved() && !instruction->HasSideEffects()) {
+    } else if (instruction->CanBeMoved() &&
+               !instruction->HasSideEffects() &&
+               !instruction->CanThrow()) {
       if (instruction->IsSelect() &&
           instruction->AsSelect()->GetCondition()->GetBlock() == block) {
         // Count one HCondition and HSelect in the same block as a single instruction.
@@ -119,10 +121,14 @@ bool HSelectGenerator::Run() {
     // TODO(dbrazdil): This puts an instruction between If and its condition.
     //                 Implement moving of conditions to first users if possible.
     while (!true_block->IsSingleGoto() && !true_block->IsSingleReturn()) {
-      true_block->GetFirstInstruction()->MoveBefore(if_instruction);
+      HInstruction* instr = true_block->GetFirstInstruction();
+      DCHECK(!instr->CanThrow());
+      instr->MoveBefore(if_instruction);
     }
     while (!false_block->IsSingleGoto() && !false_block->IsSingleReturn()) {
-      false_block->GetFirstInstruction()->MoveBefore(if_instruction);
+      HInstruction* instr = false_block->GetFirstInstruction();
+      DCHECK(!instr->CanThrow());
+      instr->MoveBefore(if_instruction);
     }
     DCHECK(true_block->IsSingleGoto() || true_block->IsSingleReturn());
     DCHECK(false_block->IsSingleGoto() || false_block->IsSingleReturn());
diff --git a/compiler/optimizing/select_generator_test.cc b/compiler/optimizing/select_generator_test.cc
new file mode 100644
index 0000000000..6e6549737c
--- /dev/null
+++ b/compiler/optimizing/select_generator_test.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "select_generator.h"
+
+#include "base/arena_allocator.h"
+#include "builder.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+#include "side_effects_analysis.h"
+
+namespace art {
+
+class SelectGeneratorTest : public ImprovedOptimizingUnitTest {
+ public:
+  void ConstructBasicGraphForSelect(HInstruction* instr) {  // Builds an if/then/else diamond.
+    HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_);
+    HBasicBlock* then_block = new (GetAllocator()) HBasicBlock(graph_);
+    HBasicBlock* else_block = new (GetAllocator()) HBasicBlock(graph_);
+
+    graph_->AddBlock(if_block);
+    graph_->AddBlock(then_block);
+    graph_->AddBlock(else_block);
+
+    entry_block_->ReplaceSuccessor(return_block_, if_block);  // entry now leads to if_block.
+
+    if_block->AddSuccessor(then_block);
+    if_block->AddSuccessor(else_block);
+    then_block->AddSuccessor(return_block_);
+    else_block->AddSuccessor(return_block_);
+
+    HParameterValue* bool_param = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
+                                                                       dex::TypeIndex(0),
+                                                                       1,
+                                                                       DataType::Type::kBool);
+    entry_block_->AddInstruction(bool_param);
+    HIntConstant* const1 =  graph_->GetIntConstant(1);
+
+    if_block->AddInstruction(new (GetAllocator()) HIf(bool_param));
+
+    then_block->AddInstruction(instr);  // `instr` is the only non-control instruction here.
+    then_block->AddInstruction(new (GetAllocator()) HGoto());
+
+    else_block->AddInstruction(new (GetAllocator()) HGoto());
+
+    HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32);
+    return_block_->AddPhi(phi);  // The phi merges `instr` with the constant 1.
+    phi->AddInput(instr);
+    phi->AddInput(const1);
+  }
+
+  bool CheckGraphAndTrySelectGenerator() {  // Returns the result of HSelectGenerator::Run().
+    graph_->BuildDominatorTree();
+    EXPECT_TRUE(CheckGraph());
+
+    SideEffectsAnalysis side_effects(graph_);
+    side_effects.Run();
+    return HSelectGenerator(graph_, /*handles*/ nullptr, /*stats*/ nullptr).Run();
+  }
+};
+
+// HDivZeroCheck might throw and should not be hoisted from the conditional to an unconditional.
+TEST_F(SelectGeneratorTest, testZeroCheck) {
+  InitGraph();
+  HDivZeroCheck* instr = new (GetAllocator()) HDivZeroCheck(parameter_, 0);
+  ConstructBasicGraphForSelect(instr);  // `instr` ends up in the then-branch.
+
+  ArenaVector<HInstruction*> current_locals({parameter_, graph_->GetIntConstant(1)},
+                                            GetAllocator()->Adapter(kArenaAllocInstruction));
+  ManuallyBuildEnvFor(instr, &current_locals);  // Gives the HDivZeroCheck an environment.
+
+  EXPECT_FALSE(CheckGraphAndTrySelectGenerator());  // HSelectGenerator::Run() must be false.
+}
+
+// Test that SelectGenerator succeeds with HAdd.
+TEST_F(SelectGeneratorTest, testAdd) {
+  InitGraph();
+  HAdd* instr = new (GetAllocator()) HAdd(DataType::Type::kInt32, parameter_, parameter_, 0);
+  ConstructBasicGraphForSelect(instr);  // `instr` ends up in the then-branch.
+  EXPECT_TRUE(CheckGraphAndTrySelectGenerator());  // HSelectGenerator::Run() must be true.
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc
index 6f3bcdac47..31114b6dcc 100644
--- a/compiler/optimizing/superblock_cloner_test.cc
+++ b/compiler/optimizing/superblock_cloner_test.cc
@@ -30,38 +30,8 @@ using HEdgeSet = SuperblockCloner::HEdgeSet;
 
 // This class provides methods and helpers for testing various cloning and copying routines:
 // individual instruction cloning and cloning of the more coarse-grain structures.
-class SuperblockClonerTest : public OptimizingUnitTest {
+class SuperblockClonerTest : public ImprovedOptimizingUnitTest {
  public:
-  SuperblockClonerTest() : graph_(CreateGraph()),
-                           entry_block_(nullptr),
-                           return_block_(nullptr),
-                           exit_block_(nullptr),
-                           parameter_(nullptr) {}
-
-  void InitGraph() {
-    entry_block_ = new (GetAllocator()) HBasicBlock(graph_);
-    graph_->AddBlock(entry_block_);
-    graph_->SetEntryBlock(entry_block_);
-
-    return_block_ = new (GetAllocator()) HBasicBlock(graph_);
-    graph_->AddBlock(return_block_);
-
-    exit_block_ = new (GetAllocator()) HBasicBlock(graph_);
-    graph_->AddBlock(exit_block_);
-    graph_->SetExitBlock(exit_block_);
-
-    entry_block_->AddSuccessor(return_block_);
-    return_block_->AddSuccessor(exit_block_);
-
-    parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(),
-                                                      dex::TypeIndex(0),
-                                                      0,
-                                                      DataType::Type::kInt32);
-    entry_block_->AddInstruction(parameter_);
-    return_block_->AddInstruction(new (GetAllocator()) HReturnVoid());
-    exit_block_->AddInstruction(new (GetAllocator()) HExit());
-  }
-
   void CreateBasicLoopControlFlow(HBasicBlock* position,
                                   HBasicBlock* successor,
                                   /* out */ HBasicBlock** header_p,
@@ -137,40 +107,6 @@ class SuperblockClonerTest : public OptimizingUnitTest {
     null_check->CopyEnvironmentFrom(env);
     bounds_check->CopyEnvironmentFrom(env);
   }
-
-  HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction,
-                                    ArenaVector<HInstruction*>* current_locals) {
-    HEnvironment* environment = new (GetAllocator()) HEnvironment(
-        (GetAllocator()),
-        current_locals->size(),
-        graph_->GetArtMethod(),
-        instruction->GetDexPc(),
-        instruction);
-
-    environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals));
-    instruction->SetRawEnvironment(environment);
-    return environment;
-  }
-
-  bool CheckGraph() {
-    GraphChecker checker(graph_);
-    checker.Run();
-    if (!checker.IsValid()) {
-      for (const std::string& error : checker.GetErrors()) {
-        std::cout << error << std::endl;
-      }
-      return false;
-    }
-    return true;
-  }
-
-  HGraph* graph_;
-
-  HBasicBlock* entry_block_;
-  HBasicBlock* return_block_;
-  HBasicBlock* exit_block_;
-
-  HInstruction* parameter_;
 };
 
 TEST_F(SuperblockClonerTest, IndividualInstrCloner) {
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2c428fac7e..c6c764e3a9 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -120,11 +120,10 @@ void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size,
 
   // Write out entry spills.
   int32_t offset = frame_size + kFramePointerSize;
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    ArmManagedRegister reg = entry_spills.at(i).AsArm();
+  for (const ManagedRegisterSpill& spill : entry_spills) {
+    ArmManagedRegister reg = spill.AsArm();
     if (reg.IsNoRegister()) {
       // only increment stack offset.
-      ManagedRegisterSpill spill = entry_spills.at(i);
       offset += spill.getSize();
     } else if (reg.IsCoreRegister()) {
       asm_.StoreToOffset(kStoreWord, AsVIXLRegister(reg), sp, offset);
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a5aa1c12b3..d6ce03387c 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -719,11 +719,10 @@ void Arm64JNIMacroAssembler::BuildFrame(size_t frame_size,
 
   // Write out entry spills
   int32_t offset = frame_size + static_cast<size_t>(kArm64PointerSize);
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Arm64ManagedRegister reg = entry_spills.at(i).AsArm64();
+  for (const ManagedRegisterSpill& spill : entry_spills) {
+    Arm64ManagedRegister reg = spill.AsArm64();
     if (reg.IsNoRegister()) {
       // only increment stack offset.
-      ManagedRegisterSpill spill = entry_spills.at(i);
       offset += spill.getSize();
     } else if (reg.IsXRegister()) {
       StoreToOffset(reg.AsXRegister(), SP, offset);
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index 2b7b2aa7ce..db9c36cc75 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -101,11 +101,11 @@ class ManagedRegisterSpill : public ManagedRegister {
   ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
       : ManagedRegister(other), size_(size), spill_offset_(-1) { }
 
-  int32_t getSpillOffset() {
+  int32_t getSpillOffset() const {
     return spill_offset_;
   }
 
-  int32_t getSize() {
+  int32_t getSize() const {
     return size_;
   }
 
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index dce5b95fec..c0b6f988d4 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -4801,10 +4801,9 @@ void MipsAssembler::BuildFrame(size_t frame_size,
 
   // Write out entry spills.
   int32_t offset = frame_size + kFramePointerSize;
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    MipsManagedRegister reg = entry_spills.at(i).AsMips();
+  for (const ManagedRegisterSpill& spill : entry_spills) {
+    MipsManagedRegister reg = spill.AsMips();
     if (reg.IsNoRegister()) {
-      ManagedRegisterSpill spill = entry_spills.at(i);
       offset += spill.getSize();
     } else if (reg.IsCoreRegister()) {
       StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset);
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index bb1bb82fa5..5b1c5d9e01 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -3633,9 +3633,8 @@ void Mips64Assembler::BuildFrame(size_t frame_size,
 
   // Write out entry spills.
   int32_t offset = frame_size + kFramePointerSize;
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Mips64ManagedRegister reg = entry_spills[i].AsMips64();
-    ManagedRegisterSpill spill = entry_spills.at(i);
+  for (const ManagedRegisterSpill& spill : entry_spills) {
+    Mips64ManagedRegister reg = spill.AsMips64();
     int32_t size = spill.getSize();
     if (reg.IsNoRegister()) {
       // only increment stack offset.
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 86f9010ea3..c2ce03b1f2 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -525,6 +525,58 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) {
   EmitOperand(dst, src);
 }
 
+void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W0 B8, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86ManagedRegister::FromXmmRegister(mul_left);
+  // Three-byte VEX prefix; the R, X and B arguments are all false.
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(/* r */ false, /* x */ false, /* b */ false, /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ false, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W0 BA, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86ManagedRegister::FromXmmRegister(mul_left);
+  // Three-byte VEX prefix; the R, X and B arguments are all false.
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(/* r */ false, /* x */ false, /* b */ false, /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ false, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W1 B8, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86ManagedRegister::FromXmmRegister(mul_left);
+  // Three-byte VEX prefix; the R, X and B arguments are all false.
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(/* r */ false, /* x */ false, /* b */ false, /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ true, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W1 BA, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86ManagedRegister::FromXmmRegister(mul_left);
+  // Three-byte VEX prefix; the R, X and B arguments are all false.
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(/* r */ false, /* x */ false, /* b */ false, /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ true, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc, mul_right);
+}
+
 
 void X86Assembler::addps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -2898,6 +2950,99 @@ void X86Assembler::EmitLabelLink(NearLabel* label) {
 }
 
 
+uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) {
+  // Returns the first VEX prefix byte without writing it to the buffer:
+  // 0xC5 when is_two_byte is set, 0xC4 otherwise.
+  // OR-ing either value into the 0xC0 seed leaves it unchanged, since both
+  // already contain those bits.
+  uint8_t vex_zero = 0xC0;
+  vex_zero |= is_two_byte ? 0xC5 : 0xC4;
+  return vex_zero;
+}
+
+uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+  // Computes and returns VEX byte 1; the byte is not written to the buffer.
+  //
+  // Layout produced here (R, X and B are stored inverted):
+  //   0x80 - VEX.R, set when `r` is false.
+  //   0x40 - VEX.X, set when `x` is false.
+  //   0x20 - VEX.B, set when `b` is false.
+  //   low bits - VEX.mmmmm, copied from `mmmmm` (only 1, 2 or 3 are accepted).
+  uint8_t vex_prefix = 0;
+  vex_prefix |= r ? 0x00 : 0x80;
+  vex_prefix |= x ? 0x00 : 0x40;
+  vex_prefix |= b ? 0x00 : 0x20;
+
+  // VEX.mmmmm names the implied leading opcode bytes. Each accepted value
+  // is OR-ed in verbatim, so the three cases share one body.
+  switch (mmmmm) {
+    case 1:
+      // Implied leading 0F opcode byte.
+    case 2:
+      // Implied leading 0F 38 opcode bytes.
+    case 3:
+      // Implied leading 0F 3A opcode bytes.
+      vex_prefix |= static_cast<uint8_t>(mmmmm);
+      break;
+    default:
+      // Only the three values above are supported; anything else is
+      // a fatal error.
+      LOG(FATAL) << "unknown opcode bytes";
+  }
+
+  return vex_prefix;
+}
+
+uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) {
+  // Computes and returns VEX byte 2; the byte is not written to the buffer.
+  //
+  //   w       - VEX.W; sets bit 7 (0x80) when true.
+  //   l       - vector length in bits; 256 sets VEX.L (0x04), any other
+  //             value leaves it clear.
+  //   operand - register encoded, ones-complemented, in VEX.vvvv (bits 6..3).
+  //   pp      - implied SIMD prefix: 0 = none, 1 = 66, 2 = F3, 3 = F2.
+  //
+  // An unrecognised `pp` is a fatal error.
+  uint8_t vex_prefix = 0;
+
+  // VEX.W.
+  if (w) {
+    vex_prefix |= 0x80;
+  }
+
+  // VEX.vvvv holds the register number in inverted form. Register number 0
+  // is the starting point so that an operand that is neither an XMM nor a
+  // CPU register is encoded as 1111b, the value the Intel SDM requires when
+  // vvvv is unused, instead of 0000b, which would name register 15.
+  int reg_number = 0;
+  if (operand.IsXmmRegister()) {
+    reg_number = static_cast<int>(operand.AsXmmRegister());
+  } else if (operand.IsCpuRegister()) {
+    reg_number = static_cast<int>(operand.AsCpuRegister());
+  }
+  const uint8_t inverted_reg = static_cast<uint8_t>(15 - reg_number);
+  vex_prefix |= ((inverted_reg & 0x0F) << 3);
+
+  // VEX.L.
+  if (l == 256) {
+    vex_prefix |= 0x04;
+  }
+
+  // VEX.pp: every accepted value is stored verbatim in the low two bits.
+  switch (pp) {
+    case 0:  // SIMD prefix - none.
+    case 1:  // SIMD prefix - 66.
+    case 2:  // SIMD prefix - F3.
+    case 3:  // SIMD prefix - F2.
+      vex_prefix |= static_cast<uint8_t>(pp);
+      break;
+    default:
+      LOG(FATAL) << "unknown SIMD Prefix";
+  }
+
+  return vex_prefix;
+}
+
 void X86Assembler::EmitGenericShift(int reg_or_opcode,
                                     const Operand& operand,
                                     const Immediate& imm) {
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e42c4c986a..8c9ce82687 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -397,6 +397,12 @@ class X86Assembler FINAL : public Assembler {
   void divss(XmmRegister dst, XmmRegister src);
   void divss(XmmRegister dst, const Address& src);
 
+  // FMA Mac Instructions
+  void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void addps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void subps(XmmRegister dst, XmmRegister src);
   void mulps(XmmRegister dst, XmmRegister src);
@@ -834,6 +840,11 @@ class X86Assembler FINAL : public Assembler {
   void EmitLabelLink(Label* label);
   void EmitLabelLink(NearLabel* label);
 
+  // Helpers that compute the individual VEX prefix bytes; callers write them with EmitUint8().
+  uint8_t EmitVexByteZero(bool is_two_byte);
+  uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+  uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp);
+
   void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm);
   void EmitGenericShift(int rm, const Operand& operand, Register shifter);
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 7e29c4aa26..dd99f03aa7 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -67,8 +67,7 @@ void X86JNIMacroAssembler::BuildFrame(size_t frame_size,
   cfi().AdjustCFAOffset(kFramePointerSize);
   DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), frame_size);
 
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    ManagedRegisterSpill spill = entry_spills.at(i);
+  for (const ManagedRegisterSpill& spill : entry_spills) {
     if (spill.AsX86().IsCpuRegister()) {
       int offset = frame_size + spill.getSpillOffset();
       __ movl(Address(ESP, offset), spill.AsX86().AsCpuRegister());
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index bd31561937..9983eaeeea 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -603,6 +603,56 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) {
 }
 
 
+void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W0 B8, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister());
+  // Three-byte VEX prefix; R and B come from acc.NeedsRex() and mul_right.NeedsRex().
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(acc.NeedsRex(), /* x */ false, mul_right.NeedsRex(), /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ false, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+
+void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W0 BA, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister());
+  // Three-byte VEX prefix; R and B come from acc.NeedsRex() and mul_right.NeedsRex().
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(acc.NeedsRex(), /* x */ false, mul_right.NeedsRex(), /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ false, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W1 B8, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister());
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(acc.NeedsRex(), /* x */ false, mul_right.NeedsRex(), /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ true, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xB8);
+  EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+  // Encoding emitted below: VEX.128.66.0F38.W1 BA, with mul_left carried in VEX.vvvv.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const auto vvvv = X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister());
+  EmitUint8(EmitVexByteZero(/* is_two_byte */ false));
+  EmitUint8(EmitVexByte1(acc.NeedsRex(), /* x */ false, mul_right.NeedsRex(), /* mmmmm */ 2));
+  EmitUint8(EmitVexByte2(/* w */ true, /* l */ 128, vvvv, /* pp */ 1));
+  // Opcode byte, followed by the register operand for acc and mul_right.
+  EmitUint8(0xBA);
+  EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
 void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(dst, src);
@@ -3544,6 +3594,98 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) {
   label->LinkTo(position);
 }
 
+uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) {
+  // Returns the first VEX prefix byte without writing it to the buffer:
+  // 0xC5 when is_two_byte is set, 0xC4 otherwise.
+  // OR-ing either value into the 0xC0 seed leaves it unchanged, since both
+  // already contain those bits.
+  uint8_t vex_zero = 0xC0;
+  vex_zero |= is_two_byte ? 0xC5 : 0xC4;
+  return vex_zero;
+}
+
+uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+  // Computes and returns VEX byte 1; the byte is not written to the buffer.
+  //
+  // Layout produced here (R, X and B are stored inverted):
+  //   0x80 - VEX.R, set when `r` is false.
+  //   0x40 - VEX.X, set when `x` is false.
+  //   0x20 - VEX.B, set when `b` is false.
+  //   low bits - VEX.mmmmm, copied from `mmmmm` (only 1, 2 or 3 are accepted).
+  uint8_t vex_prefix = 0;
+  vex_prefix |= r ? 0x00 : 0x80;
+  vex_prefix |= x ? 0x00 : 0x40;
+  vex_prefix |= b ? 0x00 : 0x20;
+
+  // VEX.mmmmm names the implied leading opcode bytes. Each accepted value
+  // is OR-ed in verbatim, so the three cases share one body.
+  switch (mmmmm) {
+    case 1:
+      // Implied leading 0F opcode byte.
+    case 2:
+      // Implied leading 0F 38 opcode bytes.
+    case 3:
+      // Implied leading 0F 3A opcode bytes.
+      vex_prefix |= static_cast<uint8_t>(mmmmm);
+      break;
+    default:
+      // Only the three values above are supported; anything else is
+      // a fatal error.
+      LOG(FATAL) << "unknown opcode bytes";
+  }
+
+  // Hand the assembled byte back to the caller, which emits it.
+  return vex_prefix;
+}
+
+uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) {
+  // Computes and returns VEX byte 2; the byte is not written to the buffer.
+  //
+  //   w       - VEX.W; sets bit 7 (0x80) when true.
+  //   l       - vector length in bits; 256 sets VEX.L (0x04), any other
+  //             value leaves it clear.
+  //   operand - register encoded, ones-complemented, in VEX.vvvv (bits 6..3).
+  //   pp      - implied SIMD prefix: 0 = none, 1 = 66, 2 = F3, 3 = F2.
+  // An unrecognised `pp` is a fatal error.
+  uint8_t vex_prefix = 0;
+
+  // VEX.W.
+  if (w) {
+    vex_prefix |= 0x80;
+  }
+
+  // VEX.vvvv holds the register number in inverted form. Register number 0
+  // is the starting point so that an operand that is neither an XMM nor a
+  // CPU register is encoded as 1111b, the value the Intel SDM requires when
+  // vvvv is unused, instead of 0000b, which would name register 15.
+  int reg_number = 0;
+  if (operand.IsXmmRegister()) {
+    reg_number = static_cast<int>(operand.AsXmmRegister().AsFloatRegister());
+  } else if (operand.IsCpuRegister()) {
+    reg_number = static_cast<int>(operand.AsCpuRegister().AsRegister());
+  }
+  const uint8_t inverted_reg = static_cast<uint8_t>(15 - reg_number);
+  vex_prefix |= ((inverted_reg & 0x0F) << 3);
+
+  // VEX.L.
+  if (l == 256) {
+    vex_prefix |= 0x04;
+  }
+
+  // VEX.pp: every accepted value is stored verbatim in the low two bits.
+  switch (pp) {
+    case 0:  // SIMD prefix - none.
+    case 1:  // SIMD prefix - 66.
+    case 2:  // SIMD prefix - F3.
+    case 3:  // SIMD prefix - F2.
+      vex_prefix |= static_cast<uint8_t>(pp);
+      break;
+    default:
+      LOG(FATAL) << "unknown SIMD Prefix";
+  }
+
+  return vex_prefix;
+}
 
 void X86_64Assembler::EmitGenericShift(bool wide,
                                        int reg_or_opcode,
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index e4d72a7ba2..d5779aa786 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -436,6 +436,16 @@ class X86_64Assembler FINAL : public Assembler {
   void divss(XmmRegister dst, XmmRegister src);
   void divss(XmmRegister dst, const Address& src);
 
+  // Mac Instructions
+  // For reference look at the Instruction reference volume 2C.
+  // The below URL is broken down in two lines.
+  // https://www.intel.com/content/www/us/en/architecture-and-technology/
+  // 64-ia-32-architectures-software-developer-vol-2c-manual.html
+  void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right);
+  void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right);
+  void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right);
+  void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right);
+
   void addps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void subps(XmmRegister dst, XmmRegister src);
   void mulps(XmmRegister dst, XmmRegister src);
@@ -921,6 +931,11 @@ class X86_64Assembler FINAL : public Assembler {
   void EmitLabelLink(Label* label);
   void EmitLabelLink(NearLabel* label);
 
+  // Helpers that compute the individual VEX prefix bytes; callers write them with EmitUint8().
+  uint8_t EmitVexByteZero(bool is_two_byte);
+  uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+  uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp);
+
   void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm);
   void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter);
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index 9486cb44c5..f6b2f9df34 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -75,8 +75,7 @@ void X86_64JNIMacroAssembler::BuildFrame(size_t frame_size,
 
   __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister());
 
-  for (size_t i = 0; i < entry_spills.size(); ++i) {
-    ManagedRegisterSpill spill = entry_spills.at(i);
+  for (const ManagedRegisterSpill& spill : entry_spills) {
     if (spill.AsX86_64().IsCpuRegister()) {
       if (spill.getSize() == 8) {
         __ movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()),