Enable string "array get" vectorization.
Rationale:
Like its scalar counterpart, the SIMD implementation of array get from
a string needs to deal with compressed and uncompressed cases.
Micro benchmarks show a 2x to 3x speedup for just copying data!
Test: test-art-target, test-art-host
Change-Id: I2fd714e50715b263123c215cd181f19194456d2b
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 013b092..5bb19c1 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -201,6 +201,7 @@
void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) {
CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+ // Integral-abs requires a temporary for the comparison.
if (instruction->GetPackedType() == Primitive::kPrimInt) {
instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
}
@@ -766,16 +767,10 @@
}
}
-// Helper to set up registers and address for vector memory operations.
-static Address CreateVecMemRegisters(HVecMemoryOperation* instruction,
- Location* reg_loc,
- bool is_load) {
- LocationSummary* locations = instruction->GetLocations();
+// Helper to construct address for vector memory operations.
+static Address VecAddress(LocationSummary* locations, size_t size, bool is_string_char_at) {
Location base = locations->InAt(0);
Location index = locations->InAt(1);
- *reg_loc = is_load ? locations->Out() : locations->InAt(2);
- size_t size = Primitive::ComponentSize(instruction->GetPackedType());
- uint32_t offset = mirror::Array::DataOffset(size).Uint32Value();
ScaleFactor scale = TIMES_1;
switch (size) {
case 2: scale = TIMES_2; break;
@@ -783,22 +778,53 @@
case 8: scale = TIMES_8; break;
default: break;
}
+ uint32_t offset = is_string_char_at
+ ? mirror::String::ValueOffset().Uint32Value()
+ : mirror::Array::DataOffset(size).Uint32Value();
return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset);
}
void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) {
CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true);
+ // String load requires a temporary for the compressed load.
+ if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+ instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+ }
}
void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) {
- Location reg_loc = Location::NoLocation();
- Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
- XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+ LocationSummary* locations = instruction->GetLocations();
+ size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+ Address address = VecAddress(locations, size, instruction->IsStringCharAt());
+ XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
switch (instruction->GetPackedType()) {
+ case Primitive::kPrimChar:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ // Special handling of compressed/uncompressed string load.
+ if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+ NearLabel done, not_compressed;
+ XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+ // Test compression bit.
+ static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
+ "Expecting 0=compressed, 1=uncompressed");
+ uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+ __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
+ __ j(kNotZero, &not_compressed);
+ // Zero extend 8 compressed bytes into 8 chars.
+ __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+ __ pxor(tmp, tmp);
+ __ punpcklbw(reg, tmp);
+ __ jmp(&done);
+ // Load 8 direct uncompressed chars.
+ __ Bind(&not_compressed);
+ is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address);
+ __ Bind(&done);
+ return;
+ }
+ FALLTHROUGH_INTENDED;
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimLong:
@@ -825,9 +851,10 @@
}
void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) {
- Location reg_loc = Location::NoLocation();
- Address address = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
- XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>();
+ LocationSummary* locations = instruction->GetLocations();
+ size_t size = Primitive::ComponentSize(instruction->GetPackedType());
+ Address address = VecAddress(locations, size, /*is_string_char_at*/ false);
+ XmmRegister reg = locations->InAt(2).AsFpuRegister<XmmRegister>();
bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16);
switch (instruction->GetPackedType()) {
case Primitive::kPrimBoolean: