Enable string "array get" vectorization.

Rationale:
Like its scalar counterpart, the SIMD implementation of array get from
a string needs to deal with compressed and uncompressed cases.
Micro benchmarks show a 2x to 3x speedup for just copying data!
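
For illustration only, a rough C++ sketch of the per-8-char decision the new
SIMD code performs (the function name is made up; this is not the emitted code):

    #include <cstdint>
    #include <cstring>

    // is_compressed is derived from the low bit of the String count field
    // (0 = compressed, 1 = uncompressed, per mirror::StringCompressionFlag).
    void LoadEightChars(const void* data, bool is_compressed, uint16_t out[8]) {
      if (is_compressed) {
        // Zero extend 8 compressed bytes into 8 chars
        // (mirrors Ldr D + Uxtl on ARM64, movsd + punpcklbw on x86/x86-64).
        const uint8_t* bytes = static_cast<const uint8_t*>(data);
        for (int i = 0; i < 8; ++i) {
          out[i] = bytes[i];
        }
      } else {
        // Load 8 uncompressed chars directly (full 128-bit Ldr Q / movdqu).
        std::memcpy(out, data, 8 * sizeof(uint16_t));
      }
    }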

Test: test-art-target, test-art-host
Change-Id: I2fd714e50715b263123c215cd181f19194456d2b
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 332ab49..3ded3e4 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -318,12 +318,13 @@
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
-  vixl::aarch64::MemOperand CreateVecMemRegisters(
+  vixl::aarch64::MemOperand VecAddress(
       HVecMemoryOperation* instruction,
-      Location* reg_loc,
-      bool is_load,
       // This function may acquire a scratch register.
-      vixl::aarch64::UseScratchRegisterScope* temps_scope);
+      vixl::aarch64::UseScratchRegisterScope* temps_scope,
+      size_t size,
+      bool is_string_char_at,
+      /*out*/ vixl::aarch64::Register* scratch);
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 93befa4..57f7e6b 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -22,6 +22,7 @@
 namespace art {
 namespace arm64 {
 
+using helpers::DRegisterFrom;
 using helpers::VRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
@@ -771,20 +772,22 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters(
+// Helper to set up locations for vector memory operations. Returns the memory operand and,
+// if used, sets the output parameter scratch to a temporary register used in this operand,
+// so that the client can release it right after the memory operand use.
+MemOperand InstructionCodeGeneratorARM64::VecAddress(
     HVecMemoryOperation* instruction,
-    Location* reg_loc,
-    bool is_load,
-    UseScratchRegisterScope* temps_scope) {
+    UseScratchRegisterScope* temps_scope,
+    size_t size,
+    bool is_string_char_at,
+    /*out*/ Register* scratch) {
   LocationSummary* locations = instruction->GetLocations();
   Register base = InputRegisterAt(instruction, 0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-
-  Primitive::Type packed_type = instruction->GetPackedType();
-  uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(packed_type)).Uint32Value();
-  size_t shift = Primitive::ComponentSizeShift(packed_type);
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
+  size_t shift = ComponentSizeShiftWidth(size);
 
   // HIntermediateAddress optimization is only applied for scalar ArrayGet and ArraySet.
   DCHECK(!instruction->InputAt(0)->IsIntermediateAddress());
@@ -793,10 +796,9 @@
     offset += Int64ConstantFrom(index) << shift;
     return HeapOperand(base, offset);
   } else {
-    Register temp = temps_scope->AcquireSameSizeAs(base);
-    __ Add(temp, base, Operand(WRegisterFrom(index), LSL, shift));
-
-    return HeapOperand(temp, offset);
+    *scratch = temps_scope->AcquireSameSizeAs(base);
+    __ Add(*scratch, base, Operand(WRegisterFrom(index), LSL, shift));
+    return HeapOperand(*scratch, offset);
   }
 }
 
@@ -805,15 +807,43 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VRegister reg = VRegisterFrom(locations->Out());
   UseScratchRegisterScope temps(GetVIXLAssembler());
-  MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true, &temps);
-  VRegister reg = VRegisterFrom(reg_loc);
+  Register scratch;
 
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        vixl::aarch64::Label uncompressed_load, done;
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        Register length = temps.AcquireW();
+        __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset));
+        __ Tbnz(length.W(), 0, &uncompressed_load);
+        temps.Release(length);  // no longer needed
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ Ldr(DRegisterFrom(locations->Out()).V8B(),
+               VecAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
+        __ Uxtl(reg.V8H(), reg.V8B());
+        __ B(&done);
+        if (scratch.IsValid()) {
+          temps.Release(scratch);  // if used, no longer needed
+        }
+        // Load 8 direct uncompressed chars.
+        __ Bind(&uncompressed_load);
+        __ Ldr(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
@@ -821,7 +851,7 @@
     case Primitive::kPrimDouble:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Ldr(reg, mem);
+      __ Ldr(reg, VecAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -834,10 +864,11 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  VRegister reg = VRegisterFrom(locations->InAt(2));
   UseScratchRegisterScope temps(GetVIXLAssembler());
-  MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false, &temps);
-  VRegister reg = VRegisterFrom(reg_loc);
+  Register scratch;
 
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
@@ -850,7 +881,7 @@
     case Primitive::kPrimDouble:
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Str(reg, mem);
+      __ Str(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 013b092..5bb19c1 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -201,6 +201,7 @@
 
 void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Integral-abs requires a temporary for the comparison.
   if (instruction->GetPackedType() == Primitive::kPrimInt) {
     instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   }
@@ -766,16 +767,10 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
-                                     Location* reg_loc,
-                                     bool is_load) {
-  LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
   Location base = locations->InAt(0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
-  uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
   ScaleFactor scale = TIMES_1;
   switch (size) {
     case 2: scale = TIMES_2; break;
@@ -783,22 +778,53 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
   return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset);
 }
 
 void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) {
   CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+  // String load requires a temporary for the compressed load.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        NearLabel done, not_compressed;
+        XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
+        __ j(kNotZero, &not_compressed);
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ pxor(tmp, tmp);
+        __ punpcklbw(reg, tmp);
+        __ jmp(&done);
+        // Load 8 direct uncompressed chars.
+        __ Bind(&not_compressed);
+        is_aligned16 ?  __ movdqa(reg, address) :  __ movdqu(reg, address);
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
@@ -825,9 +851,10 @@
 }
 
 void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+  XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 66f19a4..6d4aae8 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -194,6 +194,7 @@
 
 void LocationsBuilderX86_64::VisitVecAbs(HVecAbs* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Integral-abs requires a temporary for the comparison.
   if (instruction->GetPackedType() == Primitive::kPrimInt) {
     instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   }
@@ -755,16 +756,10 @@
   }
 }
 
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
-                                     Location* reg_loc,
-                                     bool is_load) {
-  LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
   Location base = locations->InAt(0);
   Location index = locations->InAt(1);
-  *reg_loc = is_load ? locations->Out() : locations->InAt(2);
-  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
-  uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
   ScaleFactor scale = TIMES_1;
   switch (size) {
     case 2: scale = TIMES_2; break;
@@ -772,22 +767,53 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
   return CodeGeneratorX86_64::ArrayAddress(base.AsRegister<CpuRegister>(), index, scale, offset);
 }
 
 void LocationsBuilderX86_64::VisitVecLoad(HVecLoad* instruction) {
   CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+  // String load requires a temporary for the compressed load.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
+    case Primitive::kPrimChar:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      // Special handling of compressed/uncompressed string load.
+      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+        NearLabel done, not_compressed;
+        XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        // Test compression bit.
+        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+                      "Expecting 0=compressed, 1=uncompressed");
+        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+        __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1));
+        __ j(kNotZero, &not_compressed);
+        // Zero extend 8 compressed bytes into 8 chars.
+        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ pxor(tmp, tmp);
+        __ punpcklbw(reg, tmp);
+        __ jmp(&done);
+        // Load 8 direct uncompressed chars.
+        __ Bind(&not_compressed);
+        is_aligned16 ?  __ movdqa(reg, address) :  __ movdqu(reg, address);
+        __ Bind(&done);
+        return;
+      }
+      FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
@@ -814,9 +840,10 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecStore(HVecStore* instruction) {
-  Location reg_loc = Location::NoLocation();
-  Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+  LocationSummary* locations = instruction->GetLocations();
+  size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+  Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+  XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
   bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index da2acd1..c783dde 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -733,12 +733,6 @@
     }
     return true;
   } else if (instruction->IsArrayGet()) {
-    // Strings are different, with a different offset to the actual data
-    // and some compressed to save memory. For now, all cases are rejected
-    // to avoid the complexity.
-    if (instruction->AsArrayGet()->IsStringCharAt()) {
-      return false;
-    }
     // Accept a right-hand-side array base[index] for
     // (1) exact matching vector type,
     // (2) loop-invariant base,