Add AVX support for packed add/sub instructions on x86
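
Introduce AVX code paths for packed add and subtract: new
HVecAvxAdd/HVecAvxSub IR nodes, x86/x86_64 code generator visitors for
them, VEX-encoded assembler instructions (vaddps/vaddpd, vsubps/vsubpd,
vpaddb/w/d/q, vpsubb/w/d/q) with gtest coverage, and loop optimizer
support that emits the new nodes when the target reports AVX2 (the guard
used below). The encodings stay 128-bit (VEX.L = 0); the gain is the
non-destructive three-operand form, which no longer forces the
destination to alias the first source. A rough sketch of the difference,
using ART's register names:

  __ paddd(XMM0, XMM1);         // SSE: xmm0 = xmm0 + xmm1 (dst is also src1)
  __ vpaddd(XMM0, XMM1, XMM2);  // AVX: xmm0 = xmm1 + xmm2 (dst independent)
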
Test: ./test.py --host, test-art-host-gtest
Change-Id: I48d05e6f6befd54657d962119a543b27a8a51d71
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 0ee0035..c8964dd 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -473,6 +473,70 @@
}
}
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+ LocationSummary* locations = new (allocator) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ case DataType::Type::kInt32:
+ case DataType::Type::kInt64:
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresFpuRegister());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
+void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpaddb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpaddw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpaddd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpaddq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vaddps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vaddpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -574,6 +638,48 @@
}
}
+void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpsubb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpsubw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpsubd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpsubq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vsubps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vsubpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86::VisitVecSaturationSub(HVecSaturationSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
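
A note on the locations helper above: CreateVecTerOpLocations gives the
output its own FPU register rather than aliasing input 0, which is what
lets the three-operand AVX forms be emitted. For contrast, a destructive
SSE-style binop helper ties the output to the first input - a minimal
sketch, assuming CreateVecBinOpLocations in this file follows the usual
x86 pattern:

  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetInAt(1, Location::RequiresFpuRegister());
  locations->SetOut(Location::SameAsFirstInput());  // dst must equal src1
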
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 9c28827..c147659 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -414,6 +414,28 @@
}
}
+static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
+ LocationSummary* locations = new (allocator) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ case DataType::Type::kInt32:
+ case DataType::Type::kInt64:
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresFpuRegister());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -456,6 +478,48 @@
}
}
+void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpaddb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpaddw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpaddd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpaddq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vaddps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vaddpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
@@ -557,6 +621,48 @@
}
}
+void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ vpsubb(dst, src1, src2);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ vpsubw(dst, src1, src2);
+ break;
+ case DataType::Type::kInt32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vpsubd(dst, src1, src2);
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vpsubq(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ vsubps(dst, src1, src2);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ vsubpd(dst, src1, src2);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecSaturationSub(HVecSaturationSub* instruction) {
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 6c76ab8..c6e7560 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,8 +351,11 @@
// Translates vector operation to reduction kind.
static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
- if (reduction->IsVecAdd() ||
+ if (reduction->IsVecAdd() ||
reduction->IsVecSub() ||
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+ reduction->IsVecAvxAdd() || reduction->IsVecAvxSub() ||
+ #endif
reduction->IsVecSADAccumulate() ||
reduction->IsVecDotProd()) {
return HVecReduce::kSum;
@@ -1940,10 +1943,34 @@
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
case HInstruction::kAdd:
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+ if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+ compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+ compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+ ->HasAVX2()) {
+ GENERATE_VEC(
+ new (global_allocator_) HVecAvxAdd(
+ global_allocator_, opa, opb, type, vector_length_, dex_pc),
+ new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
+ UNREACHABLE(); // GENERATE_VEC ends with a "break".
+ }
+ #endif
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
case HInstruction::kSub:
+ #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
+ if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
+ compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
+ compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
+ ->HasAVX2()) {
+ GENERATE_VEC(
+ new (global_allocator_) HVecAvxSub(
+ global_allocator_, opa, opb, type, vector_length_, dex_pc),
+ new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
+ UNREACHABLE(); // GENERATE_VEC ends with a "break".
+ }
+ #endif
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
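
The instruction-set + AVX2 check above is duplicated between the kAdd and
kSub cases. A hypothetical helper (not part of this change) showing the
shared condition, built only from accessors the hunks already call:

  static bool TargetSupportsAvx2(const CompilerOptions* compiler_options) {
    InstructionSet isa = compiler_options->GetInstructionSet();
    return (isa == InstructionSet::kX86 || isa == InstructionSet::kX86_64) &&
        compiler_options->GetInstructionSetFeatures()
            ->AsX86InstructionSetFeatures()->HasAVX2();
  }
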
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index cb53ae3..57ed71d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1521,8 +1521,10 @@
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M) \
- M(X86AndNot, Instruction) \
- M(X86MaskOrResetLeastSetBit, Instruction)
+ M(X86AndNot, Instruction) \
+ M(X86MaskOrResetLeastSetBit, Instruction) \
+ M(VecAvxAdd, VecOperation) \
+ M(VecAvxSub, VecOperation)
#else
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
#endif
@@ -7853,6 +7855,7 @@
#endif
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#include "nodes_x86.h"
+#include "nodes_vector_x86.h"
#endif
namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
new file mode 100644
index 0000000..a8f576f
--- /dev/null
+++ b/compiler/optimizing/nodes_vector_x86.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
+
+#include "nodes_vector.h"
+
+namespace art {
+
+class HVecAvxAdd final : public HVecOperation {
+ public:
+ HVecAvxAdd(ArenaAllocator* allocator,
+ HInstruction* src1,
+ HInstruction* src2,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecAvxAdd,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 2,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(src1, packed_type));
+ DCHECK(HasConsistentPackedTypes(src2, packed_type));
+ SetRawInputAt(0, src1);
+ SetRawInputAt(1, src2);
+ }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecAvxAdd);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
+};
+
+class HVecAvxSub final : public HVecOperation {
+ public:
+ HVecAvxSub(ArenaAllocator* allocator,
+ HInstruction* src1,
+ HInstruction* src2,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecAvxSub,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 2,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(src1, packed_type));
+ DCHECK(HasConsistentPackedTypes(src2, packed_type));
+ SetRawInputAt(0, src1);
+ SetRawInputAt(1, src2);
+ }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecAvxSub);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
+};
+
+} // namespace art
+
+#endif // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
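
For illustration, a node declared above is constructed the same way the
vectorizer does it in loop_optimization.cc; the packed type and vector
length here are hypothetical values (kInt32 lanes in a 128-bit vector
give a length of four):

  HVecAvxAdd* add = new (allocator) HVecAvxAdd(
      allocator, src1, src2, DataType::Type::kInt32,
      /* vector_length */ 4, dex_pc);
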
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index bcc197b..3eaf93a 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -703,6 +703,20 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(add_left),
+ SET_VEX_L_128,
+ SET_VEX_PP_NONE);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x58);
+ EmitXmmRegisterOperand(dst, add_right);
+}
void X86Assembler::subps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -711,6 +725,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x5C);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::mulps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1041,6 +1067,21 @@
}
+void X86Assembler::vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(add_left),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x58);
+ EmitXmmRegisterOperand(dst, add_right);
+}
+
void X86Assembler::subpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1050,6 +1091,21 @@
}
+void X86Assembler::vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x5C);
+ EmitXmmRegisterOperand(dst, src2);
+}
+
void X86Assembler::mulpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1232,6 +1287,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xFC);
+ EmitXmmRegisterOperand(dst, add_right);
+}
void X86Assembler::psubb(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1241,6 +1308,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xF8);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::paddw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1250,6 +1329,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xFD);
+ EmitXmmRegisterOperand(dst, add_right);
+}
void X86Assembler::psubw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1259,6 +1350,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xF9);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::pmullw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1277,6 +1380,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xFE);
+ EmitXmmRegisterOperand(dst, add_right);
+}
void X86Assembler::psubd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1287,6 +1402,19 @@
}
+void X86Assembler::vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xFA);
+ EmitXmmRegisterOperand(dst, src2);
+}
+
void X86Assembler::pmulld(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1305,6 +1434,19 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(add_left);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xD4);
+ EmitXmmRegisterOperand(dst, add_right);
+}
+
void X86Assembler::psubq(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1314,6 +1456,18 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xFB);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::paddusb(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
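
For reference, every encoder in this 32-bit file can use the two-byte VEX
prefix, since xmm0-xmm7 never need REX bits. A worked example of the bytes
the code above emits for vpaddd(xmm0, xmm1, xmm2), i.e.
"vpaddd %xmm2, %xmm1, %xmm0" in AT&T syntax:

  C5  two-byte VEX escape (EmitVexPrefixByteZero)
  F1  ~R=1, ~vvvv=1110 (xmm1), L=0 (128-bit), pp=01 (SET_VEX_PP_66)
  FE  paddd opcode
  C2  ModRM: mod=11, reg=000 (xmm0, dst), rm=010 (xmm2, second source)
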
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e84294a..17039f0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -417,6 +417,11 @@
void mulps(XmmRegister dst, XmmRegister src);
void divps(XmmRegister dst, XmmRegister src);
+ void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void movapd(XmmRegister dst, XmmRegister src); // move
void movapd(XmmRegister dst, const Address& src); // load aligned
void movupd(XmmRegister dst, const Address& src); // load unaligned
@@ -465,17 +470,29 @@
void paddb(XmmRegister dst, XmmRegister src); // no addr variant (for now)
void psubb(XmmRegister dst, XmmRegister src);
+ void vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void paddw(XmmRegister dst, XmmRegister src);
void psubw(XmmRegister dst, XmmRegister src);
void pmullw(XmmRegister dst, XmmRegister src);
+ void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void paddd(XmmRegister dst, XmmRegister src);
void psubd(XmmRegister dst, XmmRegister src);
void pmulld(XmmRegister dst, XmmRegister src);
+ void vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void paddq(XmmRegister dst, XmmRegister src);
void psubq(XmmRegister dst, XmmRegister src);
+ void vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void paddusb(XmmRegister dst, XmmRegister src);
void paddsb(XmmRegister dst, XmmRegister src);
void paddusw(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index ee29482..42ee383 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -677,18 +677,34 @@
DriverStr(RepeatFF(&x86::X86Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
}
+TEST_F(AssemblerX86AVXTest, VAddps) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
+}
+
TEST_F(AssemblerX86Test, AddPD) {
DriverStr(RepeatFF(&x86::X86Assembler::addpd, "addpd %{reg2}, %{reg1}"), "addpd");
}
+TEST_F(AssemblerX86AVXTest, VAddpd) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vaddpd, "vaddpd %{reg3}, %{reg2}, %{reg1}"), "vaddpd");
+}
+
TEST_F(AssemblerX86Test, SubPS) {
DriverStr(RepeatFF(&x86::X86Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
}
+TEST_F(AssemblerX86AVXTest, VSubps) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vsubps, "vsubps %{reg3}, %{reg2}, %{reg1}"), "vsubps");
+}
+
TEST_F(AssemblerX86Test, SubPD) {
DriverStr(RepeatFF(&x86::X86Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
}
+TEST_F(AssemblerX86AVXTest, VSubpd) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
+}
+
TEST_F(AssemblerX86Test, MulPS) {
DriverStr(RepeatFF(&x86::X86Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
}
@@ -709,18 +727,34 @@
DriverStr(RepeatFF(&x86::X86Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
}
+TEST_F(AssemblerX86AVXTest, VPaddb) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
+}
+
TEST_F(AssemblerX86Test, PSubB) {
DriverStr(RepeatFF(&x86::X86Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
}
+TEST_F(AssemblerX86AVXTest, VPsubb) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpsubb, "vpsubb %{reg3}, %{reg2}, %{reg1}"), "vpsubb");
+}
+
TEST_F(AssemblerX86Test, PAddW) {
DriverStr(RepeatFF(&x86::X86Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
}
+TEST_F(AssemblerX86AVXTest, VPaddw) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
+}
+
TEST_F(AssemblerX86Test, PSubW) {
DriverStr(RepeatFF(&x86::X86Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
}
+TEST_F(AssemblerX86AVXTest, VPsubw) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
+}
+
TEST_F(AssemblerX86Test, PMullW) {
DriverStr(RepeatFF(&x86::X86Assembler::pmullw, "pmullw %{reg2}, %{reg1}"), "pmullw");
}
@@ -729,10 +763,18 @@
DriverStr(RepeatFF(&x86::X86Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
}
+TEST_F(AssemblerX86AVXTest, VPaddd) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
+}
+
TEST_F(AssemblerX86Test, PSubD) {
DriverStr(RepeatFF(&x86::X86Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
}
+TEST_F(AssemblerX86AVXTest, VPsubd) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
+}
+
TEST_F(AssemblerX86Test, PMullD) {
DriverStr(RepeatFF(&x86::X86Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
}
@@ -741,10 +783,18 @@
DriverStr(RepeatFF(&x86::X86Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
}
+TEST_F(AssemblerX86AVXTest, VPaddq) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
+}
+
TEST_F(AssemblerX86Test, PSubQ) {
DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
}
+TEST_F(AssemblerX86AVXTest, VPsubq) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
+}
+
TEST_F(AssemblerX86Test, PAddUSB) {
DriverStr(RepeatFF(&x86::X86Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
}
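
On the test side, RepeatFFF presumably enumerates XMM register triples,
emits the instruction through the assembler under test, and compares the
bytes against the host assembler's output for the expanded template (AT&T
operand order, so {reg3} is the last argument passed). One generated
case, illustratively:

  assembler->vpaddb(XMM0, XMM1, XMM2);  // checked against "vpaddb %xmm2, %xmm1, %xmm0"
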
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 336ecbf..72b7ae0 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -614,8 +614,8 @@
SET_VEX_PP_NONE);
} else {
ByteOne = EmitVexPrefixByteOne(src.NeedsRex(),
- Rex_x ,
- Rex_b ,
+ Rex_x,
+ Rex_b,
SET_VEX_M_0F);
ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false,
SET_VEX_L_128,
@@ -856,6 +856,60 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x58);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
+void X86_64Assembler::vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg = X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), /*X=*/ false, src2.NeedsRex(), SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x5C);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::mulps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -931,7 +985,7 @@
SET_VEX_PP_66);
} else {
ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
- /*X=*/ false ,
+ /*X=*/ false,
src.NeedsRex(),
SET_VEX_M_0F);
ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false,
@@ -1292,6 +1346,35 @@
}
+void X86_64Assembler::vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x58);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
void X86_64Assembler::subpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1302,6 +1385,35 @@
}
+void X86_64Assembler::vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x5C);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::mulpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1609,6 +1721,35 @@
}
+void X86_64Assembler::vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xFC);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
void X86_64Assembler::psubb(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1619,6 +1761,35 @@
}
+void X86_64Assembler::vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xF8);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::paddw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1628,6 +1800,35 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xFD);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
void X86_64Assembler::psubw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1638,6 +1839,35 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xF9);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::pmullw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1658,6 +1888,34 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xFE);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
void X86_64Assembler::psubd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1690,6 +1948,35 @@
}
+void X86_64Assembler::vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!add_right.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(add_left.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ add_right.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xD4);
+ EmitXmmRegisterOperand(dst.LowBits(), add_right);
+}
+
void X86_64Assembler::psubq(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -1699,6 +1987,35 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xFB);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::paddusb(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1760,6 +2077,35 @@
}
+void X86_64Assembler::vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xFA);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::psubusw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -4597,7 +4944,7 @@
uint8_t reg = static_cast<uint8_t>(inverted_reg);
vex_prefix |= ((reg & 0x0F) << 3);
}
- /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+ /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
VEX.L = 0 indicates 128 bit vector operation */
vex_prefix |= SET_VEX_L;
// Bits[1:0] - "pp"
@@ -4629,7 +4976,7 @@
uint8_t reg = static_cast<uint8_t>(inverted_reg);
vex_prefix |= ((reg & 0x0F) << 3);
}
- /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+ /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
VEX.L = 0 indicates 128 bit vector operation */
vex_prefix |= SET_VEX_L;
// Bits[1:0] - "pp"
@@ -4650,7 +4997,7 @@
}
/** Bits[6:3] - 'vvvv' the source or dest register specifier */
vex_prefix |= (0x0F << 3);
- /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation ,
+ /** Bit[2] - "L" If VEX.L = 1 indicates 256-bit vector operation,
VEX.L = 0 indicates 128 bit vector operation */
vex_prefix |= SET_VEX_L;
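
Unlike the 32-bit file, these 64-bit encoders must fall back to the
three-byte VEX prefix whenever the r/m operand is xmm8-xmm15, because the
two-byte form has no ~B bit - hence the is_twobyte_form tests on the last
operand above. A worked example for vpaddd(xmm0, xmm1, xmm8):

  C4  three-byte VEX escape
  C1  ~R=1, ~X=1, ~B=0 (xmm8 in r/m), mmmmm=00001 (SET_VEX_M_0F)
  71  W=0, ~vvvv=1110 (xmm1), L=0 (128-bit), pp=01 (SET_VEX_PP_66)
  FE  paddd opcode
  C0  ModRM: mod=11, reg=000 (xmm0, dst.LowBits()), rm=000 (xmm8 low bits)
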
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 471b314..8fc69f6 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -452,6 +452,11 @@
void mulps(XmmRegister dst, XmmRegister src);
void divps(XmmRegister dst, XmmRegister src);
+ void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vsubps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vsubpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void movapd(XmmRegister dst, XmmRegister src); // move
void movapd(XmmRegister dst, const Address& src); // load aligned
void movupd(XmmRegister dst, const Address& src); // load unaligned
@@ -497,17 +502,29 @@
void paddb(XmmRegister dst, XmmRegister src); // no addr variant (for now)
void psubb(XmmRegister dst, XmmRegister src);
+ void vpaddb(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vpaddw(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void paddw(XmmRegister dst, XmmRegister src);
void psubw(XmmRegister dst, XmmRegister src);
void pmullw(XmmRegister dst, XmmRegister src);
+ void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vpsubd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void paddd(XmmRegister dst, XmmRegister src);
void psubd(XmmRegister dst, XmmRegister src);
void pmulld(XmmRegister dst, XmmRegister src);
+ void vpaddd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+
void paddq(XmmRegister dst, XmmRegister src);
void psubq(XmmRegister dst, XmmRegister src);
+ void vpaddq(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
+ void vpsubq(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void paddusb(XmmRegister dst, XmmRegister src);
void paddsb(XmmRegister dst, XmmRegister src);
void paddusw(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 411c30b..24a6c3c 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1323,10 +1323,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
}
+TEST_F(AssemblerX86_64AVXTest, VAddps) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
+}
+
TEST_F(AssemblerX86_64Test, Addpd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::addpd, "addpd %{reg2}, %{reg1}"), "addpd");
}
+TEST_F(AssemblerX86_64AVXTest, VAddpd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vaddpd, "vaddpd %{reg3}, %{reg2}, %{reg1}"), "vaddpd");
+}
+
TEST_F(AssemblerX86_64Test, Subss) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::subss, "subss %{reg2}, %{reg1}"), "subss");
}
@@ -1339,10 +1349,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
}
+TEST_F(AssemblerX86_64AVXTest, VSubps) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vsubps, "vsubps %{reg3}, %{reg2}, %{reg1}"), "vsubps");
+}
+
TEST_F(AssemblerX86_64Test, Subpd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
}
+TEST_F(AssemblerX86_64AVXTest, VSubpd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
+}
+
TEST_F(AssemblerX86_64Test, Mulss) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::mulss, "mulss %{reg2}, %{reg1}"), "mulss");
}
@@ -1379,14 +1399,35 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
}
+TEST_F(AssemblerX86_64AVXTest, VPaddb) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
+}
+
TEST_F(AssemblerX86_64Test, Psubb) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
}
+TEST_F(AssemblerX86_64AVXTest, VPsubb) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpsubb, "vpsubb %{reg3}, %{reg2}, %{reg1}"), "vpsubb");
+}
+
TEST_F(AssemblerX86_64Test, Paddw) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
}
+TEST_F(AssemblerX86_64AVXTest, VPaddw) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
+}
+
TEST_F(AssemblerX86_64Test, Psubw) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
}
+
+TEST_F(AssemblerX86_64AVXTest, VPsubw) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
+}
+
@@ -1399,10 +1440,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
}
+TEST_F(AssemblerX86_64AVXTest, VPaddd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
+}
+
TEST_F(AssemblerX86_64Test, Psubd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
}
+TEST_F(AssemblerX86_64AVXTest, VPsubd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
+}
+
TEST_F(AssemblerX86_64Test, Pmulld) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
}
@@ -1411,10 +1462,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
}
+TEST_F(AssemblerX86_64AVXTest, VPaddq) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
+}
+
TEST_F(AssemblerX86_64Test, Psubq) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
}
+TEST_F(AssemblerX86_64AVXTest, VPsubq) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
+}
+
TEST_F(AssemblerX86_64Test, Paddusb) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddusb, "paddusb %{reg2}, %{reg1}"), "paddusb");
}