Intrinsify System.arraycopy for primitive data types
This patch implements the System.arraycopy intrinsic for
byte and int arrays on x86.
14% improvement in the microbenchmarks below:
public static void time_System_arrayCopy_byte(int reps) {
byte[] src = new byte[8192];
for (int rep = 0; rep < reps; ++rep) {
byte[] dst = new byte[8192];
System.arraycopy(src, 0, dst, 0, 8192);
}
}
public static void time_System_arrayCopy_int(int reps) {
int[] src = new int[8192];
for (int rep = 0; rep < reps; ++rep) {
int[] dst = new int[8192];
System.arraycopy(src, 0, dst, 0, 8192);
}
}
Time for base version: 4057 ms
Time for intrinsic version: 3487 ms
Test: ./art/test/testrunner/testrunner.py --host --optimizing
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I87aced30330d031efea04554c6fa0c05f84e3bb9
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 98b3e49..6306720 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -771,7 +771,7 @@
GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
}
-void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+static void CreateSystemArrayCopyLocations(HInvoke* invoke) {
// We need at least two of the positions or length to be an integer constant,
// or else we won't have enough free registers.
HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
@@ -807,7 +807,8 @@
// Okay, it is safe to generate inline code.
LocationSummary* locations =
- new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
+ new (invoke->GetBlock()->GetGraph()->GetAllocator())
+ LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
// arraycopy(Object src, int srcPos, Object dest, int destPos, int length).
locations->SetInAt(0, Location::RequiresRegister());
locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
@@ -885,17 +886,18 @@
}
}
-void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
- X86Assembler* assembler = GetAssembler();
+static void SystemArrayCopyPrimitive(HInvoke* invoke,
+ X86Assembler* assembler,
+ CodeGeneratorX86* codegen,
+ DataType::Type type) {
LocationSummary* locations = invoke->GetLocations();
-
Register src = locations->InAt(0).AsRegister<Register>();
- Location srcPos = locations->InAt(1);
+ Location src_pos = locations->InAt(1);
Register dest = locations->InAt(2).AsRegister<Register>();
- Location destPos = locations->InAt(3);
+ Location dest_pos = locations->InAt(3);
Location length = locations->InAt(4);
- // Temporaries that we need for MOVSW.
+ // Temporaries that we need for MOVSB/W/L.
Register src_base = locations->GetTemp(0).AsRegister<Register>();
DCHECK_EQ(src_base, ESI);
Register dest_base = locations->GetTemp(1).AsRegister<Register>();
@@ -903,8 +905,8 @@
Register count = locations->GetTemp(2).AsRegister<Register>();
DCHECK_EQ(count, ECX);
- SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
- codegen_->AddSlowPath(slow_path);
+ SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
+ codegen->AddSlowPath(slow_path);
// Bail out if the source and destination are the same (to handle overlap).
__ cmpl(src, dest);
@@ -933,40 +935,74 @@
}
// Validity checks: source. Use src_base as a temporary register.
- CheckPosition(assembler, srcPos, src, Location::RegisterLocation(count), slow_path, src_base);
+ CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path, src_base);
// Validity checks: dest. Use src_base as a temporary register.
- CheckPosition(assembler, destPos, dest, Location::RegisterLocation(count), slow_path, src_base);
+ CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path, src_base);
// Okay, everything checks out. Finally time to do the copy.
- // Check assumption that sizeof(Char) is 2 (used in scaling below).
+ // Derive the element size, scale factor and data offset from the array component type.
- const size_t char_size = DataType::Size(DataType::Type::kUint16);
- DCHECK_EQ(char_size, 2u);
+ const size_t data_size = DataType::Size(type);
+ const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type);
+ const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value();
- const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
-
- if (srcPos.IsConstant()) {
- int32_t srcPos_const = srcPos.GetConstant()->AsIntConstant()->GetValue();
- __ leal(src_base, Address(src, char_size * srcPos_const + data_offset));
+ if (src_pos.IsConstant()) {
+ int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
+ __ leal(src_base, Address(src, data_size * src_pos_const + data_offset));
} else {
- __ leal(src_base, Address(src, srcPos.AsRegister<Register>(),
- ScaleFactor::TIMES_2, data_offset));
+ __ leal(src_base, Address(src, src_pos.AsRegister<Register>(), scale_factor, data_offset));
}
- if (destPos.IsConstant()) {
- int32_t destPos_const = destPos.GetConstant()->AsIntConstant()->GetValue();
-
- __ leal(dest_base, Address(dest, char_size * destPos_const + data_offset));
+ if (dest_pos.IsConstant()) {
+ int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+ __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset));
} else {
- __ leal(dest_base, Address(dest, destPos.AsRegister<Register>(),
- ScaleFactor::TIMES_2, data_offset));
+ __ leal(dest_base, Address(dest, dest_pos.AsRegister<Register>(), scale_factor, data_offset));
}
// Do the move.
- __ rep_movsw();
-
+ switch (type) {
+ case DataType::Type::kInt8:
+ __ rep_movsb();
+ break;
+ case DataType::Type::kUint16:
+ __ rep_movsw();
+ break;
+ case DataType::Type::kInt32:
+ __ rep_movsl();
+ break;
+ default:
+ LOG(FATAL) << "Unexpected data type for intrinsic";
+ }
__ Bind(slow_path->GetExitLabel());
}
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+ CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+ X86Assembler* assembler = GetAssembler();
+ SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+ X86Assembler* assembler = GetAssembler();
+ SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+ CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+ X86Assembler* assembler = GetAssembler();
+ SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+ CreateSystemArrayCopyLocations(invoke);
+}
+
void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) {
// The inputs plus one temp.
LocationSummary* locations = new (allocator_) LocationSummary(