Add AVX support for packed add/sub instructions on x86

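Introduce HVecAvxAdd/HVecAvxSub IR nodes and VEX-encoded, three-operand
assembler support (vpaddb/w/d/q, vpsubb/w/d/q, vaddps/pd, vsubps/pd) for
x86 and x86_64. The loop vectorizer selects these nodes instead of
HVecAdd/HVecSub when the target instruction set reports AVX2, avoiding
the destructive two-operand SSE forms for packed add/sub.
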
Test: ./test.py --host, test-art-host-gtest

Change-Id: I48d05e6f6befd54657d962119a543b27a8a51d71
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
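---
For reference, the benefit of the VEX three-operand encoding (an
illustrative snippet in AT&T syntax; the register choice is arbitrary):

    # SSE2 is two-operand and destructive: a copy is needed to keep xmm1.
    movaps %xmm1, %xmm0
    paddd  %xmm2, %xmm0         # xmm0 = xmm1 + xmm2
    # AVX is three-operand and non-destructive (what vpaddd() emits here).
    vpaddd %xmm2, %xmm1, %xmm0  # xmm0 = xmm1 + xmm2; sources untouched

The two-byte VEX prefix (0xC5) is emitted when possible; the three-byte
form (0xC4) is only needed on x86_64 when the second source requires REX.B.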
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 0ee0035..c8964dd 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -473,6 +473,70 @@
   }
 }
 
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+  LocationSummary* locations = new (allocator) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpaddb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpaddw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpaddd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpaddq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vaddps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vaddpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -574,6 +638,48 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpsubb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpsubw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpsubd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpsubq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vsubps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vsubpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecSaturationSub(HVecSaturationSub* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 9c28827..c147659 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -414,6 +414,28 @@
   }
 }
 
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+  LocationSummary* locations = new (allocator) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -456,6 +478,48 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpaddb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpaddw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpaddd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpaddq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vaddps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vaddpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
@@ -557,6 +621,48 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+  CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ vpsubb(dst, src1, src2);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ vpsubw(dst, src1, src2);
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vpsubd(dst, src1, src2);
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vpsubq(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ vsubps(dst, src1, src2);
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ vsubpd(dst, src1, src2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecSaturationSub(HVecSaturationSub* instruction) {
   CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
 }
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 6c76ab8..c6e7560 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,8 +351,11 @@
 
 // Translates vector operation to reduction kind.
 static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
   if (reduction->IsVecAdd() ||
       reduction->IsVecSub() ||
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      reduction->IsVecAvxAdd() || reduction->IsVecAvxSub() ||
+#endif
       reduction->IsVecSADAccumulate() ||
       reduction->IsVecDotProd()) {
     return HVecReduce::kSum;
@@ -1940,10 +1943,34 @@
         new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
         new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
     case HInstruction::kAdd:
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+           compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+               ->HasAVX2()) {
+        GENERATE_VEC(
+          new (global_allocator_) HVecAvxAdd(
+                                      global_allocator_, opa, opb, type, vector_length_, dex_pc),
+          new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
+        UNREACHABLE();  // GENERATE_VEC ends with a "break".
+      }
+#endif
       GENERATE_VEC(
         new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
     case HInstruction::kSub:
+#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+      if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+           compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+           compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+               ->HasAVX2()) {
+        GENERATE_VEC(
+          new (global_allocator_) HVecAvxSub(
+                                      global_allocator_, opa, opb, type, vector_length_, dex_pc),
+          new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
+        UNREACHABLE();  // GENERATE_VEC ends with a "break".
+      }
+#endif
       GENERATE_VEC(
         new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index cb53ae3..57ed71d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1521,8 +1521,10 @@
 
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)                     \
-  M(X86AndNot, Instruction)                                                \
-  M(X86MaskOrResetLeastSetBit, Instruction)
+  M(X86AndNot, Instruction)                                             \
+  M(X86MaskOrResetLeastSetBit, Instruction)                             \
+  M(VecAvxAdd, VecOperation)                                            \
+  M(VecAvxSub, VecOperation)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
 #endif
@@ -7853,6 +7855,7 @@
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "nodes_x86.h"
+#include "nodes_vector_x86.h"
 #endif
 
 namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
new file mode 100644
index 0000000..a8f576f
--- /dev/null
+++ b/compiler/optimizing/nodes_vector_x86.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+
+#include "nodes_vector.h"
+
+namespace art {
+
+class HVecAvxAdd final : public HVecOperation {
+ public:
+  HVecAvxAdd(ArenaAllocator* allocator,
+             HInstruction* src1,
+             HInstruction* src2,
+             DataType::Type packed_type,
+             size_t vector_length,
+             uint32_t dex_pc)
+      : HVecOperation(kVecAvxAdd,
+                      allocator,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 2,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(src1, packed_type));
+    DCHECK(HasConsistentPackedTypes(src2, packed_type));
+    SetRawInputAt(0, src1);
+    SetRawInputAt(1, src2);
+  }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecAvxAdd);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
+};
+
+class HVecAvxSub final : public HVecOperation {
+ public:
+  HVecAvxSub(ArenaAllocator* allocator,
+             HInstruction* src1,
+             HInstruction* src2,
+             DataType::Type packed_type,
+             size_t vector_length,
+             uint32_t dex_pc)
+      : HVecOperation(kVecAvxSub,
+                      allocator,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 2,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(src1, packed_type));
+    DCHECK(HasConsistentPackedTypes(src2, packed_type));
+    SetRawInputAt(0, src1);
+    SetRawInputAt(1, src2);
+  }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecAvxSub);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index bcc197b..3eaf93a 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -703,6 +703,20 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+                                 X86ManagedRegister::FromXmmRegister(add_left),
+                                 SET_VEX_L_128,
+                                 SET_VEX_PP_NONE);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst, add_right);
+}
 
 void X86Assembler::subps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -711,6 +725,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t byte_zero = 0x00, byte_one = 0x00;
+  byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+  byte_one = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src2);
+}
 
 void X86Assembler::mulps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1041,6 +1067,21 @@
 }
 
 
+void X86Assembler::vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+                                 X86ManagedRegister::FromXmmRegister(add_left),
+                                 SET_VEX_L_128,
+                                 SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst, add_right);
+}
+
+
 void X86Assembler::subpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1050,6 +1091,20 @@
 }
 
 
+void X86Assembler::vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+                                 X86ManagedRegister::FromXmmRegister(src1),
+                                 SET_VEX_L_128,
+                                 SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src2);
+}
+
 void X86Assembler::mulpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1232,6 +1287,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xFC);
+  EmitXmmRegisterOperand(dst, add_right);
+}
 
 void X86Assembler::psubb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1241,6 +1308,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xF8);
+  EmitXmmRegisterOperand(dst, src2);
+}
 
 void X86Assembler::paddw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1250,6 +1329,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xFD);
+  EmitXmmRegisterOperand(dst, add_right);
+}
 
 void X86Assembler::psubw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1259,6 +1350,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xF9);
+  EmitXmmRegisterOperand(dst, src2);
+}
 
 void X86Assembler::pmullw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1277,6 +1380,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xFE);
+  EmitXmmRegisterOperand(dst, add_right);
+}
 
 void X86Assembler::psubd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1287,6 +1402,20 @@
 }
 
 
+void X86Assembler::vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xFA);
+  EmitXmmRegisterOperand(dst, src2);
+}
+
+
 void X86Assembler::pmulld(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1305,6 +1434,19 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xD4);
+  EmitXmmRegisterOperand(dst, add_right);
+}
+
 
 void X86Assembler::psubq(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1314,6 +1456,18 @@
   EmitXmmRegisterOperand(dst, src);
 }
 
+void X86Assembler::vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(0xFB);
+  EmitXmmRegisterOperand(dst, src2);
+}
 
 void X86Assembler::paddusb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e84294a..17039f0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -417,6 +417,11 @@
   void mulps(XmmRegister dst, XmmRegister src);
   void divps(XmmRegister dst, XmmRegister src);
 
+  void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void movapd(XmmRegister dst, XmmRegister src);     // move
   void movapd(XmmRegister dst, const Address& src);  // load aligned
   void movupd(XmmRegister dst, const Address& src);  // load unaligned
@@ -465,17 +470,29 @@
   void paddb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void psubb(XmmRegister dst, XmmRegister src);
 
+  void vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddw(XmmRegister dst, XmmRegister src);
   void psubw(XmmRegister dst, XmmRegister src);
   void pmullw(XmmRegister dst, XmmRegister src);
 
+  void vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddd(XmmRegister dst, XmmRegister src);
   void psubd(XmmRegister dst, XmmRegister src);
   void pmulld(XmmRegister dst, XmmRegister src);
 
+  void vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddq(XmmRegister dst, XmmRegister src);
   void psubq(XmmRegister dst, XmmRegister src);
 
+  void vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddusb(XmmRegister dst, XmmRegister src);
   void paddsb(XmmRegister dst, XmmRegister src);
   void paddusw(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index ee29482..42ee383 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -677,18 +677,36 @@
   DriverStr(RepeatFF(&x86::X86Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
 }
 
+TEST_F(AssemblerX86AVXTest, VAddps) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
+}
+
 TEST_F(AssemblerX86Test, AddPD) {
   DriverStr(RepeatFF(&x86::X86Assembler::addpd, "addpd %{reg2}, %{reg1}"), "addpd");
 }
 
+TEST_F(AssemblerX86AVXTest, VAddpd) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vaddpd, "vaddpd %{reg3}, %{reg2}, %{reg1}"), "vaddpd");
+}
+
 TEST_F(AssemblerX86Test, SubPS) {
   DriverStr(RepeatFF(&x86::X86Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
 }
 
+TEST_F(AssemblerX86AVXTest, VSubps) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vsubps, "vsubps %{reg3}, %{reg2}, %{reg1}"), "vsubps");
+}
+
+
 TEST_F(AssemblerX86Test, SubPD) {
   DriverStr(RepeatFF(&x86::X86Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
 }
 
+TEST_F(AssemblerX86AVXTest, VSubpd) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
+}
+
+
 TEST_F(AssemblerX86Test, MulPS) {
   DriverStr(RepeatFF(&x86::X86Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
 }
@@ -709,18 +727,34 @@
   DriverStr(RepeatFF(&x86::X86Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
 }
 
+TEST_F(AssemblerX86AVXTest, VPaddb) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
+}
+
 TEST_F(AssemblerX86Test, PSubB) {
   DriverStr(RepeatFF(&x86::X86Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
 }
 
+TEST_F(AssemblerX86AVXTest, VPsubb) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpsubb, "vpsubb %{reg3}, %{reg2}, %{reg1}"), "vpsubb");
+}
+
 TEST_F(AssemblerX86Test, PAddW) {
   DriverStr(RepeatFF(&x86::X86Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
 }
 
+TEST_F(AssemblerX86AVXTest, VPaddw) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
+}
+
 TEST_F(AssemblerX86Test, PSubW) {
   DriverStr(RepeatFF(&x86::X86Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
 }
 
+TEST_F(AssemblerX86AVXTest, VPsubw) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
+}
+
 TEST_F(AssemblerX86Test, PMullW) {
   DriverStr(RepeatFF(&x86::X86Assembler::pmullw, "pmullw %{reg2}, %{reg1}"), "pmullw");
 }
@@ -729,10 +763,18 @@
   DriverStr(RepeatFF(&x86::X86Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
 }
 
+TEST_F(AssemblerX86AVXTest, VPaddd) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
+}
+
 TEST_F(AssemblerX86Test, PSubD) {
   DriverStr(RepeatFF(&x86::X86Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
 }
 
+TEST_F(AssemblerX86AVXTest, VPsubd) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
+}
+
 TEST_F(AssemblerX86Test, PMullD) {
   DriverStr(RepeatFF(&x86::X86Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
 }
@@ -741,10 +783,18 @@
   DriverStr(RepeatFF(&x86::X86Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
 }
 
+TEST_F(AssemblerX86AVXTest, VPaddq) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
+}
+
 TEST_F(AssemblerX86Test, PSubQ) {
   DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86AVXTest, VPsubq) {
+  DriverStr(RepeatFFF(&x86::X86Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
+}
+
 TEST_F(AssemblerX86Test, PAddUSB) {
   DriverStr(RepeatFF(&x86::X86Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 336ecbf..72b7ae0 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -614,8 +614,8 @@
                                    SET_VEX_PP_NONE);
   } else {
     ByteOne = EmitVexPrefixByteOne(src.NeedsRex(),
-                                   Rex_x ,
-                                   Rex_b ,
+                                   Rex_x,
+                                   Rex_b,
                                    SET_VEX_M_0F);
     ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false,
                                    SET_VEX_L_128,
@@ -856,6 +856,60 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
+void X86_64Assembler::vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t byte_zero = 0x00, byte_one = 0x00, byte_two = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  byte_zero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg = X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    byte_one = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+  } else {
+    byte_one = EmitVexPrefixByteOne(dst.NeedsRex(), /*X=*/ false, src2.NeedsRex(), SET_VEX_M_0F);
+    byte_two = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+  }
+  EmitUint8(byte_zero);
+  EmitUint8(byte_one);
+  if (!is_twobyte_form) {
+    EmitUint8(byte_two);
+  }
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
 
 void X86_64Assembler::mulps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -931,7 +985,7 @@
                                    SET_VEX_PP_66);
   } else {
     ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
-                                   /*X=*/ false ,
+                                   /*X=*/ false,
                                    src.NeedsRex(),
                                    SET_VEX_M_0F);
     ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false,
@@ -1292,6 +1346,35 @@
 }
 
 
+void X86_64Assembler::vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
+
 void X86_64Assembler::subpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1302,6 +1385,35 @@
 }
 
 
+void X86_64Assembler::vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   src2.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
+
 void X86_64Assembler::mulpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1609,6 +1721,36 @@
 }
 
 
+void X86_64Assembler::vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xFC);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
+
 void X86_64Assembler::psubb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1619,6 +1761,36 @@
 }
 
 
+void X86_64Assembler::vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   src2.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xF8);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
+
 void X86_64Assembler::paddw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1628,6 +1800,35 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xFD);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
 
 void X86_64Assembler::psubw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1638,6 +1839,35 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   src2.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xF9);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
 
 void X86_64Assembler::pmullw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1658,6 +1888,34 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xFE);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
 
 void X86_64Assembler::psubd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1690,6 +1948,36 @@
 }
 
 
+void X86_64Assembler::vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!add_right.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   add_right.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xD4);
+  EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
+
 void X86_64Assembler::psubq(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -1699,6 +1987,35 @@
   EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
+void X86_64Assembler::vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   src2.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xFB);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
 
 void X86_64Assembler::paddusb(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1760,6 +2077,36 @@
 }
 
 
+void X86_64Assembler::vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  bool is_twobyte_form = false;
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  if (!src2.NeedsRex()) {
+    is_twobyte_form = true;
+  }
+  ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+  if (is_twobyte_form) {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  } else {
+    ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+                                   /*X=*/ false,
+                                   src2.NeedsRex(),
+                                   SET_VEX_M_0F);
+    ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  }
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  if (!is_twobyte_form) {
+    EmitUint8(ByteTwo);
+  }
+  EmitUint8(0xFA);
+  EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
+
 void X86_64Assembler::psubusw(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
@@ -4597,7 +4944,7 @@
     uint8_t reg = static_cast<uint8_t>(inverted_reg);
     vex_prefix |= ((reg & 0x0F) << 3);
   }
-  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
   VEX.L = 0 indicates 128 bit vector operation */
   vex_prefix |= SET_VEX_L;
   // Bits[1:0] -  "pp"
@@ -4629,7 +4976,7 @@
     uint8_t reg = static_cast<uint8_t>(inverted_reg);
     vex_prefix |= ((reg & 0x0F) << 3);
   }
-  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
   VEX.L = 0 indicates 128 bit vector operation */
   vex_prefix |= SET_VEX_L;
   // Bits[1:0] -  "pp"
@@ -4650,7 +4997,7 @@
   }
   /** Bits[6:3] - 'vvvv' the source or dest register specifier */
   vex_prefix |= (0x0F << 3);
-  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+  /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
   VEX.L = 0 indicates 128 bit vector operation */
   vex_prefix |= SET_VEX_L;
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 471b314..8fc69f6 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -452,6 +452,11 @@
   void mulps(XmmRegister dst, XmmRegister src);
   void divps(XmmRegister dst, XmmRegister src);
 
+  void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+  void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void movapd(XmmRegister dst, XmmRegister src);     // move
   void movapd(XmmRegister dst, const Address& src);  // load aligned
   void movupd(XmmRegister dst, const Address& src);  // load unaligned
@@ -497,17 +502,29 @@
   void paddb(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
   void psubb(XmmRegister dst, XmmRegister src);
 
+  void vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddw(XmmRegister dst, XmmRegister src);
   void psubw(XmmRegister dst, XmmRegister src);
   void pmullw(XmmRegister dst, XmmRegister src);
 
+  void vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddd(XmmRegister dst, XmmRegister src);
   void psubd(XmmRegister dst, XmmRegister src);
   void pmulld(XmmRegister dst, XmmRegister src);
 
+  void vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddq(XmmRegister dst, XmmRegister src);
   void psubq(XmmRegister dst, XmmRegister src);
 
+  void vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+  void vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
   void paddusb(XmmRegister dst, XmmRegister src);
   void paddsb(XmmRegister dst, XmmRegister src);
   void paddusw(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 411c30b..24a6c3c 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1323,10 +1323,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VAddps) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
+}
+
 TEST_F(AssemblerX86_64Test, Addpd) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::addpd, "addpd %{reg2}, %{reg1}"), "addpd");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VAddpd) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vaddpd, "vaddpd %{reg3}, %{reg2}, %{reg1}"), "vaddpd");
+}
+
 TEST_F(AssemblerX86_64Test, Subss) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::subss, "subss %{reg2}, %{reg1}"), "subss");
 }
@@ -1339,10 +1349,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VSubps) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vsubps, "vsubps %{reg3}, %{reg2}, %{reg1}"), "vsubps");
+}
+
 TEST_F(AssemblerX86_64Test, Subpd) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VSubpd) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
+}
+
 TEST_F(AssemblerX86_64Test, Mulss) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::mulss, "mulss %{reg2}, %{reg1}"), "mulss");
 }
@@ -1379,14 +1399,35 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPaddb) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
+}
+
 TEST_F(AssemblerX86_64Test, Psubb) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPsubb) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpsubb, "vpsubb %{reg3}, %{reg2}, %{reg1}"), "vpsubb");
+}
+
 TEST_F(AssemblerX86_64Test, Paddw) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPaddw) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
+}
+
+TEST_F(AssemblerX86_64AVXTest, VPsubw) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
+}
+
+
 TEST_F(AssemblerX86_64Test, Psubw) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
 }
@@ -1399,10 +1440,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPaddd) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
+}
+
 TEST_F(AssemblerX86_64Test, Psubd) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPsubd) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
+}
+
 TEST_F(AssemblerX86_64Test, Pmulld) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
 }
@@ -1411,10 +1462,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPaddq) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
+}
+
 TEST_F(AssemblerX86_64Test, Psubq) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VPsubq) {
+  DriverStr(
+      RepeatFFF(&x86_64::X86_64Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
+}
+
 TEST_F(AssemblerX86_64Test, Paddusb) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
 }