Add AVX support for packed mul/div instructions.

This is a follow-up to the patch below:
https://android-review.googlesource.com/c/platform/build/+/830841

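Besides adding AVX forms of the packed mul/div instructions, this change
folds the earlier AVX add/sub support into the generic HVecAdd/HVecSub
handling: the dedicated HVecAvxAdd/HVecAvxSub nodes (and
nodes_vector_x86.h) are removed, loop_optimization no longer
special-cases x86, and the x86/x86_64 visitors instead select the
encoding from the AVX feature flag, e.g. in VisitVecMul:

  cpu_has_avx ? __ vpmulld(dst, other_src, src) : __ pmulld(dst, src);

The three-operand VEX form names both sources explicitly, so the check
that input 0 aliases the output now applies only on the SSE path:

  DCHECK(cpu_has_avx || other_src == dst);
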
Test: ./test.py --host --64, test-art-host-gtest
Change-Id: Id2aa473035556ee230e66addeb69707df8530e75
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index c8964dd..29a1354 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -431,48 +431,6 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ paddb(dst, src);
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ paddw(dst, src);
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ paddd(dst, src);
-      break;
-    case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ paddq(dst, src);
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ addps(dst, src);
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ addpd(dst, src);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
-}
-
 static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
   LocationSummary* locations = new (allocator) LocationSummary(instruction);
   switch (instruction->GetPackedType()) {
@@ -495,44 +453,50 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) {
+  if (CpuHasAvxFeatureFlag()) {
     CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
-void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-        __ vpaddb(dst, src1, src2);
+      cpu_has_avx ? __ vpaddb(dst, other_src, src) : __ paddb(dst, src);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ vpaddw(dst, src1, src2);
+      cpu_has_avx ? __ vpaddw(dst, other_src, src) : __ paddw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vpaddd(dst, src1, src2);
+      cpu_has_avx ? __ vpaddd(dst, other_src, src) : __ paddd(dst, src);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vpaddq(dst, src1, src2);
+      cpu_has_avx ? __ vpaddq(dst, other_src, src) : __ paddq(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vaddps(dst, src1, src2);
+      cpu_has_avx ? __ vaddps(dst, other_src, src) : __ addps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vaddpd(dst, src1, src2);
+      cpu_has_avx ? __ vaddpd(dst, other_src, src) : __ addpd(dst, src);
       break;
     default:
-      LOG(FATAL) << "Unsupported SIMD type";
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
       UNREACHABLE();
   }
 }
@@ -597,85 +561,49 @@
 }
 
 void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ psubb(dst, src);
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ psubw(dst, src);
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ psubd(dst, src);
-      break;
-    case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ psubq(dst, src);
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ subps(dst, src);
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ subpd(dst, src);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
   }
 }
 
-void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
-  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ vpsubb(dst, src1, src2);
+      cpu_has_avx ? __ vpsubb(dst, other_src, src) : __ psubb(dst, src);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ vpsubw(dst, src1, src2);
+      cpu_has_avx ? __ vpsubw(dst, other_src, src) : __ psubw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vpsubd(dst, src1, src2);
+      cpu_has_avx ? __ vpsubd(dst, other_src, src) : __ psubd(dst, src);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vpsubq(dst, src1, src2);
+      cpu_has_avx ? __ vpsubq(dst, other_src, src) : __ psubq(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vsubps(dst, src1, src2);
+      cpu_has_avx ? __ vsubps(dst, other_src, src) : __ subps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vsubpd(dst, src1, src2);
+      cpu_has_avx ? __ vsubpd(dst, other_src, src) : __ subpd(dst, src);
       break;
     default:
-      LOG(FATAL) << "Unsupported SIMD type";
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
       UNREACHABLE();
   }
 }
@@ -713,31 +641,37 @@
 }
 
 void LocationsBuilderX86::VisitVecMul(HVecMul* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecMul(HVecMul* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ pmullw(dst, src);
+      cpu_has_avx ? __ vpmullw(dst, other_src, src) : __ pmullw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ pmulld(dst, src);
+      cpu_has_avx ? __ vpmulld(dst, other_src, src) : __ pmulld(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ mulps(dst, src);
+      cpu_has_avx ? __ vmulps(dst, other_src, src) : __ mulps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ mulpd(dst, src);
+      cpu_has_avx ? __ vmulpd(dst, other_src, src) : __ mulpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -746,22 +680,28 @@
 }
 
 void LocationsBuilderX86::VisitVecDiv(HVecDiv* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecDiv(HVecDiv* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ divps(dst, src);
+      cpu_has_avx ? __ vdivps(dst, other_src, src) : __ divps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ divpd(dst, src);
+      cpu_has_avx ? __ vdivpd(dst, other_src, src) : __ divpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index c147659..f28268b 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -437,85 +437,49 @@
 }
 
 void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ paddb(dst, src);
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ paddw(dst, src);
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ paddd(dst, src);
-      break;
-    case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ paddq(dst, src);
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ addps(dst, src);
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ addpd(dst, src);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
   }
 }
 
-void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
-    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-        __ vpaddb(dst, src1, src2);
+      cpu_has_avx ? __ vpaddb(dst, other_src, src) : __ paddb(dst, src);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ vpaddw(dst, src1, src2);
+      cpu_has_avx ? __ vpaddw(dst, other_src, src) : __ paddw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vpaddd(dst, src1, src2);
+      cpu_has_avx ? __ vpaddd(dst, other_src, src) : __ paddd(dst, src);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vpaddq(dst, src1, src2);
+      cpu_has_avx ? __ vpaddq(dst, other_src, src) : __ paddq(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vaddps(dst, src1, src2);
+      cpu_has_avx ? __ vaddps(dst, other_src, src) : __ addps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vaddpd(dst, src1, src2);
+      cpu_has_avx ? __ vaddpd(dst, other_src, src) : __ addpd(dst, src);
       break;
     default:
-      LOG(FATAL) << "Unsupported SIMD type";
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
       UNREACHABLE();
   }
 }
@@ -580,85 +544,49 @@
 }
 
 void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
-  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ psubb(dst, src);
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ psubw(dst, src);
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ psubd(dst, src);
-      break;
-    case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ psubq(dst, src);
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ subps(dst, src);
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ subpd(dst, src);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
   }
 }
 
-void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
-  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
-  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ vpsubb(dst, src1, src2);
+      cpu_has_avx ? __ vpsubb(dst, other_src, src) : __ psubb(dst, src);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ vpsubw(dst, src1, src2);
+      cpu_has_avx ? __ vpsubw(dst, other_src, src) : __ psubw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vpsubd(dst, src1, src2);
+      cpu_has_avx ? __ vpsubd(dst, other_src, src) : __ psubd(dst, src);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vpsubq(dst, src1, src2);
+      cpu_has_avx ? __ vpsubq(dst, other_src, src) : __ psubq(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ vsubps(dst, src1, src2);
+      cpu_has_avx ? __ vsubps(dst, other_src, src) : __ subps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ vsubpd(dst, src1, src2);
+      cpu_has_avx ? __ vsubpd(dst, other_src, src) : __ subpd(dst, src);
       break;
     default:
-      LOG(FATAL) << "Unsupported SIMD type";
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
       UNREACHABLE();
   }
 }
@@ -696,31 +624,37 @@
 }
 
 void LocationsBuilderX86_64::VisitVecMul(HVecMul* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecMul(HVecMul* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ pmullw(dst, src);
+      cpu_has_avx ? __ vpmullw(dst, other_src, src) : __ pmullw(dst, src);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ pmulld(dst, src);
+      cpu_has_avx ? __ vpmulld(dst, other_src, src) : __ pmulld(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ mulps(dst, src);
+      cpu_has_avx ? __ vmulps(dst, other_src, src) : __ mulps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ mulpd(dst, src);
+      cpu_has_avx ? __ vmulpd(dst, other_src, src) : __ mulpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -729,22 +663,28 @@
 }
 
 void LocationsBuilderX86_64::VisitVecDiv(HVecDiv* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecDiv(HVecDiv* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ divps(dst, src);
+      cpu_has_avx ? __ vdivps(dst, other_src, src) : __ divps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ divpd(dst, src);
+      cpu_has_avx ? __ vdivpd(dst, other_src, src) : __ divpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index d8a54e5..7f7e3a5 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -8425,6 +8425,22 @@
   LOG(FATAL) << "Unreachable";
 }
 
+bool LocationsBuilderX86::CpuHasAvxFeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool LocationsBuilderX86::CpuHasAvx2FeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
+bool InstructionCodeGeneratorX86::CpuHasAvxFeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool InstructionCodeGeneratorX86::CpuHasAvx2FeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
 #undef __
 
 }  // namespace x86
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index bbca764..368c584 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -175,6 +175,8 @@
   void HandleShift(HBinaryOperation* instruction);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
+  bool CpuHasAvxFeatureFlag();
+  bool CpuHasAvx2FeatureFlag();
 
   CodeGeneratorX86* const codegen_;
   InvokeDexCallingConventionVisitorX86 parameter_visitor_;
@@ -307,6 +309,8 @@
                                    HBasicBlock* default_block);
 
   void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double);
+  bool CpuHasAvxFeatureFlag();
+  bool CpuHasAvx2FeatureFlag();
 
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index b3d76a3..8067b9c 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -7667,6 +7667,22 @@
   }
 }
 
+bool LocationsBuilderX86_64::CpuHasAvxFeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool LocationsBuilderX86_64::CpuHasAvx2FeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
+bool InstructionCodeGeneratorX86_64::CpuHasAvxFeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool InstructionCodeGeneratorX86_64::CpuHasAvx2FeatureFlag() {
+  return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
 #undef __
 
 }  // namespace x86_64
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index a25c29f..d3b49ea 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -177,6 +177,8 @@
   void HandleShift(HBinaryOperation* operation);
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction);
+  bool CpuHasAvxFeatureFlag();
+  bool CpuHasAvx2FeatureFlag();
 
   CodeGeneratorX86_64* const codegen_;
   InvokeDexCallingConventionVisitorX86_64 parameter_visitor_;
@@ -287,6 +289,9 @@
 
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
+  bool CpuHasAvxFeatureFlag();
+  bool CpuHasAvx2FeatureFlag();
+
   X86_64Assembler* const assembler_;
   CodeGeneratorX86_64* const codegen_;
 
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index c6e7560..9914127 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -353,9 +353,6 @@
 static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
   if (reduction->IsVecAdd()  ||
       reduction->IsVecSub() ||
-      #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
-      reduction->IsVecAvxSub() || reduction->IsVecAvxAdd() ||
-      #endif
       reduction->IsVecSADAccumulate() ||
       reduction->IsVecDotProd()) {
     return HVecReduce::kSum;
@@ -1943,34 +1940,10 @@
         new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
         new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
     case HInstruction::kAdd:
-      #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
-      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
-           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
-           compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
-               ->HasAVX2()) {
-        GENERATE_VEC(
-          new (global_allocator_) HVecAvxAdd(
-                                      global_allocator_, opa, opb, type, vector_length_, dex_pc),
-          new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
-        UNREACHABLE();  // GENERATE_VEC ends with a "break".
-      }
-      #endif
       GENERATE_VEC(
         new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
     case HInstruction::kSub:
-      #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
-      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
-           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
-           compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
-               ->HasAVX2()) {
-        GENERATE_VEC(
-          new (global_allocator_) HVecAvxSub(
-                                      global_allocator_, opa, opb, type, vector_length_, dex_pc),
-          new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
-        UNREACHABLE();  // GENERATE_VEC ends with a "break".
-      }
-      #endif
       GENERATE_VEC(
         new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 3e6e211..25f9e3c 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1540,9 +1540,7 @@
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)                     \
   M(X86AndNot, Instruction)                                             \
-  M(X86MaskOrResetLeastSetBit, Instruction)                             \
-  M(VecAvxSub, VecOperation)                                            \
-  M(VecAvxAdd, VecOperation)
+  M(X86MaskOrResetLeastSetBit, Instruction)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
 #endif
@@ -7874,7 +7872,6 @@
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "nodes_x86.h"
-#include "nodes_vector_x86.h"
 #endif
 
 namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
deleted file mode 100644
index a8f576f..0000000
--- a/compiler/optimizing/nodes_vector_x86.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
-#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
-
-#include "nodes_vector.h"
-
-namespace art {
-
-class HVecAvxAdd final : public HVecOperation {
- public:
-  HVecAvxAdd(ArenaAllocator* allocator,
-             HInstruction* src1,
-             HInstruction* src2,
-             DataType::Type packed_type,
-             size_t vector_length,
-             uint32_t dex_pc)
-      : HVecOperation(kVecAvxAdd,
-                      allocator,
-                      packed_type,
-                      SideEffects::None(),
-                      /* number_of_inputs */ 2,
-                      vector_length,
-                      dex_pc) {
-    DCHECK(HasConsistentPackedTypes(src1, packed_type));
-    DCHECK(HasConsistentPackedTypes(src2, packed_type));
-    SetRawInputAt(0, src1);
-    SetRawInputAt(1, src2);
-  }
-
-  bool CanBeMoved() const override { return true; }
-
-  DECLARE_INSTRUCTION(VecAvxAdd);
-
- protected:
-  DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
-};
-
-class HVecAvxSub final : public HVecOperation {
- public:
-  HVecAvxSub(ArenaAllocator* allocator,
-             HInstruction* src1,
-             HInstruction* src2,
-             DataType::Type packed_type,
-             size_t vector_length,
-             uint32_t dex_pc)
-      : HVecOperation(kVecAvxSub,
-                      allocator,
-                      packed_type,
-                      SideEffects::None(),
-                      /* number_of_inputs */ 2,
-                      vector_length,
-                      dex_pc) {
-    DCHECK(HasConsistentPackedTypes(src1, packed_type));
-    DCHECK(HasConsistentPackedTypes(src2, packed_type));
-    SetRawInputAt(0, src1);
-    SetRawInputAt(1, src2);
-  }
-
-  bool CanBeMoved() const override { return true; }
-
-  DECLARE_INSTRUCTION(VecAvxSub);
-
- protected:
-  DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
-};
-
-}  // namespace art
-
-#endif  // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_