AVX support for bitwise instructions (Xor, Or, And, Andn)

Test: ./test.py --host --64, test-art-host-gtest

Change-Id: Ia8302d12d3ebb8447d73db576fb5b945485c11e1
Signed-off-by: Neeraj Solanki <neeraj.solanki@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 29a1354..68aef77 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -63,9 +63,10 @@
   LocationSummary* locations = instruction->GetLocations();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
 
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   // Shorthand for any type of zero.
   if (IsZeroBitPattern(instruction->InputAt(0))) {
-    __ xorps(dst, dst);
+    cpu_has_avx ? __ vxorps(dst, dst, dst) : __ xorps(dst, dst);
     return;
   }
 
@@ -808,14 +809,20 @@
 }
 
 void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecAnd(HVecAnd* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -826,15 +833,15 @@
     case DataType::Type::kInt64:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ pand(dst, src);
+      cpu_has_avx ? __ vpand(dst, other_src, src) : __ pand(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ andps(dst, src);
+      cpu_has_avx ? __ vandps(dst, other_src, src) : __ andps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ andpd(dst, src);
+      cpu_has_avx ? __ vandpd(dst, other_src, src) : __ andpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -843,14 +850,20 @@
 }
 
 void LocationsBuilderX86::VisitVecAndNot(HVecAndNot* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecAndNot(HVecAndNot* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -861,15 +874,15 @@
     case DataType::Type::kInt64:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ pandn(dst, src);
+      cpu_has_avx ? __ vpandn(dst, other_src, src) : __ pandn(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ andnps(dst, src);
+      cpu_has_avx ? __ vandnps(dst, other_src, src) : __ andnps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ andnpd(dst, src);
+      cpu_has_avx ? __ vandnpd(dst, other_src, src) : __ andnpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -878,14 +891,20 @@
 }
 
 void LocationsBuilderX86::VisitVecOr(HVecOr* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecOr(HVecOr* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -896,15 +915,15 @@
     case DataType::Type::kInt64:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ por(dst, src);
+      cpu_has_avx ? __ vpor(dst, other_src, src) : __ por(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ orps(dst, src);
+      cpu_has_avx ? __ vorps(dst, other_src, src) : __ orps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ orpd(dst, src);
+      cpu_has_avx ? __ vorpd(dst, other_src, src) : __ orpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -913,14 +932,20 @@
 }
 
 void LocationsBuilderX86::VisitVecXor(HVecXor* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  if (CpuHasAvxFeatureFlag()) {
+    CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+  } else {
+    CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecXor(HVecXor* instruction) {
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
   LocationSummary* locations = instruction->GetLocations();
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
   XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  DCHECK(cpu_has_avx || other_src == dst);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -931,15 +956,15 @@
     case DataType::Type::kInt64:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ pxor(dst, src);
+      cpu_has_avx ? __ vpxor(dst, other_src, src) : __ pxor(dst, src);
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ xorps(dst, src);
+      cpu_has_avx ? __ vxorps(dst, other_src, src) : __ xorps(dst, src);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ xorpd(dst, src);
+      cpu_has_avx ? __ vxorpd(dst, other_src, src) : __ xorpd(dst, src);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1092,7 +1117,8 @@
   DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
 
   // Zero out all other elements first.
-  __ xorps(dst, dst);
+  bool cpu_has_avx = CpuHasAvxFeatureFlag();
+  cpu_has_avx ? __ vxorps(dst, dst, dst) : __ xorps(dst, dst);
 
   // Shorthand for any type of zero.
   if (IsZeroBitPattern(instruction->InputAt(0))) {