Basic SIMD reduction support.

Rationale:
Enables vectorization of x += .... for very basic (simple, same-type)
constructs. Paves the way for more complex (narrower and/or mixed-type)
constructs, which will be handled by the next CL.

This is  a revert   of Icb5d6c805516db0a1d911c3ede9a246ccef89a22
and thus a revert^2 of I2454778dd0ef1da915c178c7274e1cf33e271d0f
and thus a revert^3 of I1c1c87b6323e01442e8fbd94869ddc9e760ea1fc
and thus a revert^4 of I7880c135aee3ed0a39da9ae5b468cbf80e613766

PS1-2 shows what needed to change

Test: test-art-host test-art-target

Bug: 64091002
Change-Id: I647889e0da0959ca405b70081b79c7d3c9bcb2e9
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index e7aec76..37190f8 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -27,9 +27,99 @@
 
 void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimLong:
-      // Long needs extra temporary to load the register pair.
+      // Long needs extra temporary to load from the register pair.
+      if (!is_zero) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      FALLTHROUGH_INTENDED;
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(is_zero ? Location::RequiresFpuRegister()
+                                : Location::SameAsFirstInput());
+
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    __ xorps(dst, dst);
+    return;
+  }
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ punpcklbw(dst, dst);
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimLong: {
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>());
+      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
+      __ punpckldq(dst, tmp);
+      __ punpcklqdq(dst, dst);
+      break;
+    }
+    case Primitive::kPrimFloat:
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ shufps(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ shufpd(dst, dst, Immediate(0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimLong:
+      // Long needs extra temporary to store into the register pair.
       locations->AddTemp(Location::RequiresFpuRegister());
       FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
@@ -37,8 +127,8 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresFpuRegister());
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
@@ -51,48 +141,34 @@
   }
 }
 
-void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+void InstructionCodeGeneratorX86::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ punpcklbw(reg, reg);
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
+    case Primitive::kPrimShort:  // TODO: up to here, and?
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ pshufd(reg, reg, Immediate(0));
+      DCHECK_LE(4u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ movd(locations->Out().AsRegister<Register>(), src);
       break;
     case Primitive::kPrimLong: {
       XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>());
-      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
-      __ punpckldq(reg, tmp);
-      __ punpcklqdq(reg, reg);
+      __ movd(locations->Out().AsRegisterPairLow<Register>(), src);
+      __ pshufd(tmp, src, Immediate(1));
+      __ movd(locations->Out().AsRegisterPairHigh<Register>(), tmp);
       break;
     }
     case Primitive::kPrimFloat:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ shufps(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimDouble:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ shufpd(reg, reg, Immediate(0));
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 4u);
+      DCHECK(locations->InAt(0).Equals(locations->Out()));  // no code required
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -100,22 +176,6 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
 // Helper to set up locations for vector unary operations.
 static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) {
   LocationSummary* locations = new (arena) LocationSummary(instruction);
@@ -137,6 +197,73 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Long reduction or min/max require a temporary.
+  if (instruction->GetPackedType() == Primitive::kPrimLong ||
+      instruction->GetKind() == HVecReduce::kMin ||
+      instruction->GetKind() == HVecReduce::kMax) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecReduce(HVecReduce* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(dst, src);
+          __ phaddd(dst, dst);
+          __ phaddd(dst, dst);
+          break;
+        case HVecReduce::kMin: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pminsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pminsd(dst, tmp);
+          break;
+        }
+        case HVecReduce::kMax: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pmaxsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pmaxsd(dst, tmp);
+          break;
+        }
+      }
+      break;
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ punpckhqdq(tmp, tmp);
+          __ paddq(dst, tmp);
+          break;
+        case HVecReduce::kMin:
+        case HVecReduce::kMax:
+          LOG(FATAL) << "Unsupported SIMD type";
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -821,6 +948,91 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimLong:
+      // Long needs extra temporary to load from register pairs.
+      if (!is_zero) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      FALLTHROUGH_INTENDED;
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  // Zero out all other elements first.
+  __ xorps(dst, dst);
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    return;
+  }
+
+  // Set required elements.
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:  // TODO: up to here, and?
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      break;
+    case Primitive::kPrimLong: {
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ xorps(tmp, tmp);
+      __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>());
+      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
+      __ punpckldq(dst, tmp);
+      break;
+    }
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movss(dst, locations->InAt(1).AsFpuRegister<XmmRegister>());
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movsd(dst, locations->InAt(1).AsFpuRegister<XmmRegister>());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LOG(FATAL) << "No SIMD for " << instr->GetId();
 }
@@ -868,6 +1080,7 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  // Incorporate the string or array offset in the address computation.
   uint32_t offset = is_string_char_at
       ? mirror::String::ValueOffset().Uint32Value()
       : mirror::Array::DataOffset(size).Uint32Value();
@@ -902,7 +1115,7 @@
         __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
         __ j(kNotZero, &not_compressed);
         // Zero extend 8 compressed bytes into 8 chars.
-        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt()));
         __ pxor(tmp, tmp);
         __ punpcklbw(reg, tmp);
         __ jmp(&done);