Enable string "array get" vectorization.

Rationale:
Like its scalar counterpart, the SIMD implementation of an array get
from a string needs to handle both the compressed and the uncompressed
case (a sketch of the widening step follows the message).
Micro benchmarks show a 2x to 3x speedup for just copying data!

Test: test-art-target, test-art-host
Change-Id: I2fd714e50715b263123c215cd181f19194456d2b
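
For reference, here is a minimal sketch of the compressed case using
SSE2 intrinsics. It is a standalone illustration of what the emitted
movsd + pxor + punpcklbw sequence computes, not ART code, and the
helper name LoadCompressed8 is made up:

  #include <emmintrin.h>
  #include <cstdint>

  // Widen 8 compressed (8-bit) chars into 8 uncompressed (16-bit) chars.
  static __m128i LoadCompressed8(const uint8_t* src) {
    // movsd/movq: load the low 8 bytes, upper half zeroed.
    __m128i bytes = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
    // pxor: produce an all-zero register.
    __m128i zero = _mm_setzero_si128();
    // punpcklbw: interleave the bytes with zeros, zero-extending each to 16 bits.
    return _mm_unpacklo_epi8(bytes, zero);
  }

The uncompressed case is just a plain 16-byte load (movdqa or movdqu,
depending on alignment).
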
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 013b092..5bb19c1 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -201,6 +201,7 @@
 
 void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Integral-abs requires a temporary for the comparison.
   if (instruction->GetPackedType() == Primitive::kPrimInt) {
     instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   }
@@ -766,16 +767,10 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
-                                     Location* reg_loc,
-                                     bool is_load) {
-  LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
   Location base = locations->InAt(0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
-  uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
   ScaleFactor scale = TIMES_1;
   switch (size) {
     case 2: scale = TIMES_2; break;
@@ -783,22 +778,53 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
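+  // A string's characters start at ValueOffset(); an array's payload at DataOffset(size).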
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
   return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset);
 }
 
 void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) {
   CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+  // String load requires a temporary for the compressed load.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        NearLabel done, not_compressed;
+        XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
+        __ j(kNotZero, &not_compressed);
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ pxor(tmp, tmp);
+        __ punpcklbw(reg, tmp);
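+        // (punpcklbw interleaves the 8 bytes with the zero register, widening each to 16 bits.)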
+        __ jmp(&done);
+        // Load 8 direct uncompressed chars.
+        __ Bind(&not_compressed);
+        is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address);
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
@@ -825,9 +851,10 @@
 }
 
 void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
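+  // Stores never come from String.charAt, so this is never a string address.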
+  Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+  XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean: