diff options
author | 2022-02-10 18:03:34 +0530 | |
---|---|---|
committer | 2022-02-15 11:32:16 +0000 | |
commit | b414a4cc18a9e46b2674dcc81bbe1742e9fe5022 (patch) | |
tree | a50b959c266430813f81783aadf14109743a8291 | |
parent | 4f5b7cb3dfef3ea175439a8a541f8c9f9458d34a (diff) |
Intrinsify System.ArrayCopy for Primitive data types
This patch implements System.ArrayCopy intrinsic for
byte and int data types
14% improvement in microbench below:
public static void time_System_arrayCopy_byte(int reps) {
byte[] src = new byte[8192];
for (int rep = 0; rep < reps; ++rep) {
byte[] dst = new byte[8192];
System.arraycopy(src, 0, dst, 0, 8192);
}
}
public static void time_System_arrayCopy_int(int reps) {
int[] src = new int[8192];
for (int rep = 0; rep < reps; ++rep) {
int[] dst = new int[8192];
System.arraycopy(src, 0, dst, 0, 8192);
}
}
Time for base version: 4057 ms
Time for intrinsic version: 3487 ms
Test: ./art/test/testrunner/testrunner.py --host --optimizing
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I87aced30330d031efea04554c6fa0c05f84e3bb9
-rw-r--r-- | compiler/optimizing/intrinsics_arm64.cc | 2 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_arm_vixl.cc | 3 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_x86.cc | 94 | ||||
-rw-r--r-- | compiler/optimizing/intrinsics_x86_64.cc | 77 | ||||
-rw-r--r-- | runtime/hidden_api.h | 2 | ||||
-rw-r--r-- | runtime/interpreter/interpreter_intrinsics.cc | 2 | ||||
-rw-r--r-- | runtime/intrinsics_list.h | 2 | ||||
-rw-r--r-- | test/641-checker-arraycopy/src/Main.java | 8 |
8 files changed, 136 insertions, 54 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index f2bfcfef94..69e4fcf64b 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -5653,6 +5653,8 @@ UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendFloat); UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendDouble); UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderLength); UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString); +UNIMPLEMENTED_INTRINSIC(ARM64, SystemArrayCopyByte); +UNIMPLEMENTED_INTRINSIC(ARM64, SystemArrayCopyInt); // 1.8. UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt) diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index 9d0175725a..1f2ba466be 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -5545,6 +5545,9 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderAppendDouble); UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderLength); UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderToString); +UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyByte); +UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyInt); + // 1.8. UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaDouble) UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaFloat) diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 98b3e492cb..6306720512 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -771,7 +771,7 @@ void IntrinsicCodeGeneratorX86::VisitMathNextAfter(HInvoke* invoke) { GenFPToFPCall(invoke, codegen_, kQuickNextAfter); } -void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) { +static void CreateSystemArrayCopyLocations(HInvoke* invoke) { // We need at least two of the positions or length to be an integer constant, // or else we won't have enough free registers. 
HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant(); @@ -807,7 +807,8 @@ void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) { // Okay, it is safe to generate inline code. LocationSummary* locations = - new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); + new (invoke->GetBlock()->GetGraph()->GetAllocator()) + LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); // arraycopy(Object src, int srcPos, Object dest, int destPos, int length). locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1))); @@ -885,17 +886,18 @@ static void CheckPosition(X86Assembler* assembler, } } -void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { - X86Assembler* assembler = GetAssembler(); +static void SystemArrayCopyPrimitive(HInvoke* invoke, + X86Assembler* assembler, + CodeGeneratorX86* codegen, + DataType::Type type) { LocationSummary* locations = invoke->GetLocations(); - Register src = locations->InAt(0).AsRegister<Register>(); - Location srcPos = locations->InAt(1); + Location src_pos = locations->InAt(1); Register dest = locations->InAt(2).AsRegister<Register>(); - Location destPos = locations->InAt(3); + Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - // Temporaries that we need for MOVSW. + // Temporaries that we need for MOVSB/W/L. 
Register src_base = locations->GetTemp(0).AsRegister<Register>(); DCHECK_EQ(src_base, ESI); Register dest_base = locations->GetTemp(1).AsRegister<Register>(); @@ -903,8 +905,8 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { Register count = locations->GetTemp(2).AsRegister<Register>(); DCHECK_EQ(count, ECX); - SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke); + codegen->AddSlowPath(slow_path); // Bail out if the source and destination are the same (to handle overlap). __ cmpl(src, dest); @@ -933,40 +935,74 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { } // Validity checks: source. Use src_base as a temporary register. - CheckPosition(assembler, srcPos, src, Location::RegisterLocation(count), slow_path, src_base); + CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path, src_base); // Validity checks: dest. Use src_base as a temporary register. - CheckPosition(assembler, destPos, dest, Location::RegisterLocation(count), slow_path, src_base); + CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path, src_base); // Okay, everything checks out. Finally time to do the copy. // Check assumption that sizeof(Char) is 2 (used in scaling below). 
- const size_t char_size = DataType::Size(DataType::Type::kUint16); - DCHECK_EQ(char_size, 2u); - - const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); + const size_t data_size = DataType::Size(type); + const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type); + const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value(); - if (srcPos.IsConstant()) { - int32_t srcPos_const = srcPos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(src_base, Address(src, char_size * srcPos_const + data_offset)); + if (src_pos.IsConstant()) { + int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(src_base, Address(src, data_size * src_pos_const + data_offset)); } else { - __ leal(src_base, Address(src, srcPos.AsRegister<Register>(), - ScaleFactor::TIMES_2, data_offset)); + __ leal(src_base, Address(src, src_pos.AsRegister<Register>(), scale_factor, data_offset)); } - if (destPos.IsConstant()) { - int32_t destPos_const = destPos.GetConstant()->AsIntConstant()->GetValue(); - - __ leal(dest_base, Address(dest, char_size * destPos_const + data_offset)); + if (dest_pos.IsConstant()) { + int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset)); } else { - __ leal(dest_base, Address(dest, destPos.AsRegister<Register>(), - ScaleFactor::TIMES_2, data_offset)); + __ leal(dest_base, Address(dest, dest_pos.AsRegister<Register>(), scale_factor, data_offset)); } // Do the move. 
- __ rep_movsw(); - + switch (type) { + case DataType::Type::kInt8: + __ rep_movsb(); + break; + case DataType::Type::kUint16: + __ rep_movsw(); + break; + case DataType::Type::kInt32: + __ rep_movsl(); + break; + default: + LOG(FATAL) << "Unexpected data type for intrinsic"; + } __ Bind(slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} + +void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) { + X86Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16); +} + +void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyByte(HInvoke* invoke) { + X86Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8); +} + +void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyByte(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} + +void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyInt(HInvoke* invoke) { + X86Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32); +} + +void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyInt(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} + void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) { // The inputs plus one temp. 
LocationSummary* locations = new (allocator_) LocationSummary( diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 43147e037e..15df11d952 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -614,7 +614,7 @@ void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) { GenFPToFPCall(invoke, codegen_, kQuickNextAfter); } -void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { +static void CreateSystemArrayCopyLocations(HInvoke* invoke) { // Check to see if we have known failures that will cause us to have to bail out // to the runtime, and just generate the runtime call directly. HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant(); @@ -636,9 +636,9 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) return; } } - LocationSummary* locations = - new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); + new (invoke->GetBlock()->GetGraph()->GetAllocator()) LocationSummary + (invoke, LocationSummary::kCallOnSlowPath, kIntrinsified); // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length). 
locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1))); @@ -716,17 +716,18 @@ static void CheckPosition(X86_64Assembler* assembler, } } -void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { - X86_64Assembler* assembler = GetAssembler(); +static void SystemArrayCopyPrimitive(HInvoke* invoke, + X86_64Assembler* assembler, + CodeGeneratorX86_64* codegen, + DataType::Type type) { LocationSummary* locations = invoke->GetLocations(); - CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>(); Location src_pos = locations->InAt(1); CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>(); Location dest_pos = locations->InAt(3); Location length = locations->InAt(4); - // Temporaries that we need for MOVSW. + // Temporaries that we need for MOVSB/W/L. CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>(); DCHECK_EQ(src_base.AsRegister(), RSI); CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>(); @@ -734,8 +735,8 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>(); DCHECK_EQ(count.AsRegister(), RCX); - SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); - codegen_->AddSlowPath(slow_path); + SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke); + codegen->AddSlowPath(slow_path); // Bail out if the source and destination are the same. __ cmpl(src, dest); @@ -771,32 +772,66 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { // Okay, everything checks out. Finally time to do the copy. // Check assumption that sizeof(Char) is 2 (used in scaling below). 
- const size_t char_size = DataType::Size(DataType::Type::kUint16); - DCHECK_EQ(char_size, 2u); - - const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); + const size_t data_size = DataType::Size(type); + const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type); + const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value(); if (src_pos.IsConstant()) { int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(src_base, Address(src, char_size * src_pos_const + data_offset)); + __ leal(src_base, Address(src, data_size * src_pos_const + data_offset)); } else { - __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), - ScaleFactor::TIMES_2, data_offset)); + __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset)); } if (dest_pos.IsConstant()) { int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset)); + __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset)); } else { - __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(), - ScaleFactor::TIMES_2, data_offset)); + __ leal(dest_base, + Address(dest, dest_pos.AsRegister<CpuRegister>(), scale_factor, data_offset)); } // Do the move. 
- __ rep_movsw(); - + switch (type) { + case DataType::Type::kInt8: + __ rep_movsb(); + break; + case DataType::Type::kUint16: + __ rep_movsw(); + break; + case DataType::Type::kInt32: + __ rep_movsl(); + break; + default: + LOG(FATAL) << "Unexpected data type for intrinsic"; + } __ Bind(slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} +void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) { + X86_64Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16); +} + +void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyByte(HInvoke* invoke) { + X86_64Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8); +} + +void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyByte(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} + +void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyInt(HInvoke* invoke) { + X86_64Assembler* assembler = GetAssembler(); + SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32); +} + +void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyInt(HInvoke* invoke) { + CreateSystemArrayCopyLocations(invoke); +} void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { // The only read barrier implementation supporting the diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h index 66e81c4d53..febe80875a 100644 --- a/runtime/hidden_api.h +++ b/runtime/hidden_api.h @@ -288,6 +288,8 @@ ALWAYS_INLINE inline uint32_t GetRuntimeFlags(ArtMethod* method) if (UNLIKELY(method->IsIntrinsic())) { switch (static_cast<Intrinsics>(method->GetIntrinsic())) { case Intrinsics::kSystemArrayCopyChar: + case Intrinsics::kSystemArrayCopyByte: + case Intrinsics::kSystemArrayCopyInt: case Intrinsics::kStringGetCharsNoCheck: case 
Intrinsics::kReferenceGetReferent: case Intrinsics::kReferenceRefersTo: diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc index 9c6c614a9a..c8344bc760 100644 --- a/runtime/interpreter/interpreter_intrinsics.cc +++ b/runtime/interpreter/interpreter_intrinsics.cc @@ -503,7 +503,9 @@ bool MterpHandleIntrinsic(ShadowFrame* shadow_frame, UNIMPLEMENTED_CASE(MathRoundDouble /* (D)J */) UNIMPLEMENTED_CASE(MathRoundFloat /* (F)I */) UNIMPLEMENTED_CASE(MathMultiplyHigh /* (JJ)J */) + UNIMPLEMENTED_CASE(SystemArrayCopyByte /* ([BI[BII)V */) UNIMPLEMENTED_CASE(SystemArrayCopyChar /* ([CI[CII)V */) + UNIMPLEMENTED_CASE(SystemArrayCopyInt /* ([II[III)V */) UNIMPLEMENTED_CASE(SystemArrayCopy /* (Ljava/lang/Object;ILjava/lang/Object;II)V */) UNIMPLEMENTED_CASE(ThreadCurrentThread /* ()Ljava/lang/Thread; */) UNIMPLEMENTED_CASE(MemoryPeekByte /* (J)B */) diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h index 568daffefe..256cd2e5fe 100644 --- a/runtime/intrinsics_list.h +++ b/runtime/intrinsics_list.h @@ -159,7 +159,9 @@ V(MathRoundDouble, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "round", "(D)J") \ V(MathRoundFloat, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "round", "(F)I") \ V(MathMultiplyHigh, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "multiplyHigh", "(JJ)J") \ + V(SystemArrayCopyByte, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([BI[BII)V") \ V(SystemArrayCopyChar, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([CI[CII)V") \ + V(SystemArrayCopyInt, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([II[III)V") \ V(SystemArrayCopy, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "(Ljava/lang/Object;ILjava/lang/Object;II)V") \ 
V(ThreadCurrentThread, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Thread;", "currentThread", "()Ljava/lang/Thread;") \ V(MemoryPeekByte, kStatic, kNeedsEnvironment, kReadSideEffects, kCanThrow, "Llibcore/io/Memory;", "peekByte", "(J)B") \ diff --git a/test/641-checker-arraycopy/src/Main.java b/test/641-checker-arraycopy/src/Main.java index 939fc00b12..bede9026f7 100644 --- a/test/641-checker-arraycopy/src/Main.java +++ b/test/641-checker-arraycopy/src/Main.java @@ -22,8 +22,8 @@ public class Main { /// CHECK-START-X86: void Main.typedCopy(java.lang.Object, byte[]) disassembly (after) /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy /// CHECK-NOT: call - /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy - /// CHECK: call + /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopyByte + /// CHECK-NOT: call /// CHECK: ReturnVoid public static void typedCopy(Object o, byte[] foo) { System.arraycopy(o, 1, o, 0, 1); @@ -40,8 +40,8 @@ public class Main { /// CHECK-START-X86: void Main.untypedCopyCaller(java.lang.Object, byte[]) disassembly (after) /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy /// CHECK-NOT: call - /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy - /// CHECK: call + /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopyByte + /// CHECK-NOT: call /// CHECK: ReturnVoid public static void untypedCopyCaller(Object o, byte[] array) { untypedCopy(o, array); |