author     2017-01-30 14:57:16 +0000
committer  2017-02-03 09:10:57 +0000
commit     d09584456559f669f5999fb1ff32aa89ebf6ef4e (patch)
tree       3820a5e6505823f8b92f1cc5dcde11995c025e3b
parent     588ef19c89c8299362c952037a926078740f2090 (diff)
Align allocation entrypoints implementation between arm/arm64/x86/x64.
x64:
- Add art_quick_alloc_initialized_rosalloc
x86:
- Add art_quick_alloc_initialized_rosalloc
- Add art_quick_alloc_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}{_region}_tlab
arm32:
- Add art_quick_alloc_initialized_rosalloc
- Add art_quick_alloc_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}{_region}_tlab
arm64:
- Add art_quick_alloc_initialized_rosalloc
- Add art_quick_alloc_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}_tlab
Test: test-art-target test-art-host
Bug: 30933338
Change-Id: I0dd8667a2921dd0b3403bea5d05304ba5d40627f
 compiler/optimizing/code_generator_arm.cc      |   5
 compiler/optimizing/code_generator_x86.cc      |   4
 runtime/arch/arm/quick_entrypoints_arm.S       | 230
 runtime/arch/arm64/quick_entrypoints_arm64.S   | 171
 runtime/arch/mips/quick_entrypoints_mips.S     |   1
 runtime/arch/mips64/quick_entrypoints_mips64.S |   1
 runtime/arch/quick_alloc_entrypoints.S         |   2
 runtime/arch/x86/quick_entrypoints_x86.S       | 212
 runtime/arch/x86_64/quick_entrypoints_x86_64.S |  34
9 files changed, 479 insertions(+), 181 deletions(-)
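Before the per-file hunks, a note on the compiler-side half of this change: the code_generator_arm.cc and code_generator_x86.cc hunks below stop hard-coding kQuickAllocArrayResolved and instead ask CodeGenerator::GetArrayAllocationEntrypoint for an entrypoint that matches the array's component size. The following is a minimal, self-contained sketch of that selection idea, not the actual ART implementation; the sized enum values are stand-ins assumed to parallel the art_quick_alloc_array_resolved{8,16,32,64} assembly entrypoints added here.

#include <cstddef>

// Illustrative stand-in for ART's QuickEntrypointEnum; only the values
// relevant to this change are listed.
enum QuickEntrypointEnum {
  kQuickAllocArrayResolved,
  kQuickAllocArrayResolved8,
  kQuickAllocArrayResolved16,
  kQuickAllocArrayResolved32,
  kQuickAllocArrayResolved64,
};

// Pick the size-specialized entrypoint from the component size, falling
// back to the generic resolved entrypoint when no sized variant applies.
QuickEntrypointEnum GetArrayAllocationEntrypointSketch(size_t component_size) {
  switch (component_size) {
    case 1: return kQuickAllocArrayResolved8;
    case 2: return kQuickAllocArrayResolved16;
    case 4: return kQuickAllocArrayResolved32;
    case 8: return kQuickAllocArrayResolved64;
    default: return kQuickAllocArrayResolved;
  }
}

The backend then passes the chosen value to InvokeRuntime exactly as the two compiler hunks show.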
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index f5b6ebef9c..1893029cad 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -3993,8 +3993,11 @@ void LocationsBuilderARM::VisitNewArray(HNewArray* instruction) { void InstructionCodeGeneratorARM::VisitNewArray(HNewArray* instruction) { // Note: if heap poisoning is enabled, the entry point takes cares // of poisoning the reference. - codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc()); + QuickEntrypointEnum entrypoint = + CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass()); + codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc()); CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>(); + DCHECK(!codegen_->IsLeafMethod()); } void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) { diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 1b7431612d..3e795c7bf8 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -4214,7 +4214,9 @@ void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) { void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) { // Note: if heap poisoning is enabled, the entry point takes cares // of poisoning the reference. - codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc()); + QuickEntrypointEnum entrypoint = + CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass()); + codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc()); CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>(); DCHECK(!codegen_->IsLeafMethod()); } diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index ed36436120..a443a4060d 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -1086,11 +1086,37 @@ ENTRY art_quick_resolve_string DELIVER_PENDING_EXCEPTION_FRAME_READY END art_quick_resolve_string + // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS +// Comment out allocators that have arm specific asm. 
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) + +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_rosalloc, RosAlloc). -ENTRY art_quick_alloc_object_resolved_rosalloc +.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name +ENTRY \c_name // Fast path rosalloc allocation. // r0: type/return value, r9: Thread::Current // r1, r2, r3, r12: free. @@ -1099,13 +1125,13 @@ ENTRY art_quick_alloc_object_resolved_rosalloc // TODO: consider using ldrd. ldr r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET] cmp r3, r12 - bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path + bhs .Lslow_path\c_name ldr r3, [r0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3) cmp r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread // local allocation. Also does the // initialized and finalizable checks. - bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path + bhs .Lslow_path\c_name // Compute the rosalloc bracket index // from the size. Since the size is // already aligned we can combine the @@ -1119,7 +1145,7 @@ ENTRY art_quick_alloc_object_resolved_rosalloc // Load the free list head (r3). This // will be the return val. ldr r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)] - cbz r3, .Lart_quick_alloc_object_resolved_rosalloc_slow_path + cbz r3, .Lslow_path\c_name // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1. ldr r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET] // Load the next pointer of the head // and update the list head with the @@ -1164,16 +1190,20 @@ ENTRY art_quick_alloc_object_resolved_rosalloc mov r0, r3 // Set the return value and return. 
bx lr -.Lart_quick_alloc_object_resolved_rosalloc_slow_path: +.Lslow_path\c_name: SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC mov r1, r9 @ pass Thread::Current - bl artAllocObjectFromCodeResolvedRosAlloc @ (mirror::Class* cls, Thread*) + bl \cxx_name @ (mirror::Class* cls, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER -END art_quick_alloc_object_resolved_rosalloc +END \c_name +.endm -// The common fast path code for art_quick_alloc_object_resolved_tlab -// and art_quick_alloc_object_resolved_region_tlab. +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc + +// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab +// and art_quick_alloc_object_resolved/initialized_region_tlab. // // r0: type r9: Thread::Current, r1, r2, r3, r12: free. // Need to preserve r0 to the slow path. @@ -1212,41 +1242,173 @@ END art_quick_alloc_object_resolved_rosalloc bx lr .endm -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_tlab, TLAB). -ENTRY art_quick_alloc_object_resolved_tlab +// The common code for art_quick_alloc_object_*region_tlab +.macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint +ENTRY \name // Fast path tlab allocation. // r0: type, r9: Thread::Current // r1, r2, r3, r12: free. -#if defined(USE_READ_BARRIER) - mvn r0, #0 // Read barrier not supported here. - bx lr // Return -1. -#endif - ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path -.Lart_quick_alloc_object_resolved_tlab_slow_path: + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\name +.Lslow_path\name: SETUP_SAVE_REFS_ONLY_FRAME r2 // Save callee saves in case of GC. mov r1, r9 // Pass Thread::Current. - bl artAllocObjectFromCodeResolvedTLAB // (mirror::Class* klass, Thread*) + bl \entrypoint // (mirror::Class* klass, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER -END art_quick_alloc_object_resolved_tlab +END \name +.endm -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) -ENTRY art_quick_alloc_object_resolved_region_tlab - // Fast path tlab allocation. - // r0: type, r9: Thread::Current, r1, r2, r3, r12: free. -#if !defined(USE_READ_BARRIER) - eor r0, r0, r0 // Read barrier must be enabled here. - sub r0, r0, #1 // Return -1. - bx lr +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB + + +// The common fast path code for art_quick_alloc_array_resolved/initialized_tlab +// and art_quick_alloc_array_resolved/initialized_region_tlab. +// +// r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free. +// Need to preserve r0 and r1 to the slow path. +.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel + and r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED // Apply alignemnt mask + // (addr + 7) & ~7. + + // Load thread_local_pos (r3) and + // thread_local_end (r12) with ldrd. 
+ // Check constraints for ldrd. +#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0)) +#error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance" #endif - ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path -.Lart_quick_alloc_object_resolved_region_tlab_slow_path: - SETUP_SAVE_REFS_ONLY_FRAME r2 // Save callee saves in case of GC. - mov r1, r9 // Pass Thread::Current. - bl artAllocObjectFromCodeResolvedRegionTLAB // (mirror::Class* klass, Thread*) + ldrd r3, r12, [r9, #THREAD_LOCAL_POS_OFFSET] + sub r12, r12, r3 // Compute the remaining buf size. + cmp r2, r12 // Check if the total_size fits. + bhi \slowPathLabel + // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1. + add r2, r2, r3 + str r2, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos. + ldr r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects. + add r2, r2, #1 + str r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] + POISON_HEAP_REF r0 + str r0, [r3, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer. + str r1, [r3, #MIRROR_ARRAY_LENGTH_OFFSET] // Store the array length. + // Fence. This is "ish" not "ishst" so + // that the code after this allocation + // site will see the right values in + // the fields of the class. + // Alternatively we could use "ishst" + // if we use load-acquire for the + // object size load.) + mov r0, r3 + dmb ish + bx lr +.endm + +.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup +ENTRY \name + // Fast path array allocation for region tlab allocation. + // r0: mirror::Class* type + // r1: int32_t component_count + // r9: thread + // r2, r3, r12: free. + \size_setup .Lslow_path\name + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name +.Lslow_path\name: + // r0: mirror::Class* klass + // r1: int32_t component_count + // r2: Thread* self + SETUP_SAVE_REFS_ONLY_FRAME r2 // save callee saves in case of GC + mov r2, r9 // pass Thread::Current + bl \entrypoint RESTORE_SAVE_REFS_ONLY_FRAME RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER -END art_quick_alloc_object_resolved_region_tlab +END \name +.endm + +.macro COMPUTE_ARRAY_SIZE_UNKNOWN slow_path + bkpt // We should never enter here. + // Code below is for reference. + // Possibly a large object, go slow. + // Also does negative array size check. + movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8) + cmp r1, r2 + bhi \slow_path + // Array classes are never finalizable + // or uninitialized, no need to check. + ldr r3, [r0, #MIRROR_CLASS_COMPONENT_TYPE_OFFSET] // Load component type + UNPOISON_HEAP_REF r3 + ldr r3, [r3, #MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET] + lsr r3, r3, #PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT // Component size shift is in high 16 + // bits. + lsl r2, r1, r3 // Calculate data size + // Add array data offset and alignment. + add r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) +#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4 +#error Long array data offset must be 4 greater than int array data offset. +#endif + + add r3, r3, #1 // Add 4 to the length only if the + // component size shift is 3 + // (for 64 bit alignment). + and r3, r3, #4 + add r2, r2, r3 +.endm + +.macro COMPUTE_ARRAY_SIZE_8 slow_path + // Possibly a large object, go slow. + // Also does negative array size check. 
+ movw r2, #(MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) + cmp r1, r2 + bhi \slow_path + // Add array data offset and alignment. + add r2, r1, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) +.endm + +.macro COMPUTE_ARRAY_SIZE_16 slow_path + // Possibly a large object, go slow. + // Also does negative array size check. + movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 2) + cmp r1, r2 + bhi \slow_path + lsl r2, r1, #1 + // Add array data offset and alignment. + add r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) +.endm + +.macro COMPUTE_ARRAY_SIZE_32 slow_path + // Possibly a large object, go slow. + // Also does negative array size check. + movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 4) + cmp r1, r2 + bhi \slow_path + lsl r2, r1, #2 + // Add array data offset and alignment. + add r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) +.endm + +.macro COMPUTE_ARRAY_SIZE_64 slow_path + // Possibly a large object, go slow. + // Also does negative array size check. + movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_LONG_ARRAY_DATA_OFFSET) / 8) + cmp r1, r2 + bhi \slow_path + lsl r2, r1, #3 + // Add array data offset and alignment. + add r2, r2, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) +.endm + +# TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm, remove +# the entrypoint once all backends have been updated to use the size variants. +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64 /* * Called by managed code when the value in rSUSPEND has been decremented to 0. diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index 6a2034fd68..219d8b447a 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -1626,7 +1626,7 @@ ENTRY art_quick_resolve_string END art_quick_resolve_string // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS // Comment out allocators that have arm64 specific asm. 
// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) @@ -1640,8 +1640,20 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc). -ENTRY art_quick_alloc_object_resolved_rosalloc +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) + +.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name +ENTRY \c_name // Fast path rosalloc allocation. // x0: type, xSELF(x19): Thread::Current // x1-x7: free. @@ -1650,13 +1662,13 @@ ENTRY art_quick_alloc_object_resolved_rosalloc // ldp won't work due to large offset. ldr x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET] cmp x3, x4 - bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path + bhs .Lslow_path\c_name ldr w3, [x0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (x3) cmp x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread // local allocation. Also does the // finalizable and initialization // checks. - bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path + bhs .Lslow_path\c_name // Compute the rosalloc bracket index // from the size. Since the size is // already aligned we can combine the @@ -1669,7 +1681,7 @@ ENTRY art_quick_alloc_object_resolved_rosalloc // Load the free list head (x3). This // will be the return val. ldr x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)] - cbz x3, .Lart_quick_alloc_object_resolved_rosalloc_slow_path + cbz x3, .Lslow_path\c_name // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1. ldr x1, [x3, #ROSALLOC_SLOT_NEXT_OFFSET] // Load the next pointer of the head // and update the list head with the @@ -1713,59 +1725,19 @@ ENTRY art_quick_alloc_object_resolved_rosalloc mov x0, x3 // Set the return value and return. ret -.Lart_quick_alloc_object_resolved_rosalloc_slow_path: +.Lslow_path\c_name: SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case of GC mov x1, xSELF // pass Thread::Current - bl artAllocObjectFromCodeResolvedRosAlloc // (mirror::Class* klass, Thread*) + bl \cxx_name RESTORE_SAVE_REFS_ONLY_FRAME RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER -END art_quick_alloc_object_resolved_rosalloc - -.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel, xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2 - and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignemnt mask - // (addr + 7) & ~7. The mask must - // be 64 bits to keep high bits in - // case of overflow. 
- // Negative sized arrays are handled here since xCount holds a zero extended 32 bit value. - // Negative ints become large 64 bit unsigned ints which will always be larger than max signed - // 32 bit int. Since the max shift for arrays is 3, it can not become a negative 64 bit int. - cmp \xTemp1, #MIN_LARGE_OBJECT_THRESHOLD // Possibly a large object, go slow - bhs \slowPathLabel // path. - - ldr \xTemp0, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Check tlab for space, note that - // we use (end - begin) to handle - // negative size arrays. It is - // assumed that a negative size will - // always be greater unsigned than - // region size. - ldr \xTemp2, [xSELF, #THREAD_LOCAL_END_OFFSET] - sub \xTemp2, \xTemp2, \xTemp0 - cmp \xTemp1, \xTemp2 - bhi \slowPathLabel - // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1. - // Move old thread_local_pos to x0 - // for the return value. - mov x0, \xTemp0 - add \xTemp0, \xTemp0, \xTemp1 - str \xTemp0, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos. - ldr \xTemp0, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects. - add \xTemp0, \xTemp0, #1 - str \xTemp0, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET] - POISON_HEAP_REF \wClass - str \wClass, [x0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer. - str \wCount, [x0, #MIRROR_ARRAY_LENGTH_OFFSET] // Store the array length. - // Fence. - dmb ishst - ret +END \c_name .endm -// TODO: delete ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since it is the same as -// ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED. -.macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel - ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED \slowPathLabel -.endm +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc -.macro ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED slowPathLabel +.macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel ldr x4, [xSELF, #THREAD_LOCAL_POS_OFFSET] ldr x5, [xSELF, #THREAD_LOCAL_END_OFFSET] ldr w7, [x0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (x7). @@ -1792,36 +1764,13 @@ END art_quick_alloc_object_resolved_rosalloc ret .endm -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). -ENTRY art_quick_alloc_object_resolved_tlab - // Fast path tlab allocation. - // x0: type, xSELF(x19): Thread::Current - // x1-x7: free. -#if defined(USE_READ_BARRIER) - mvn x0, xzr // Read barrier not supported here. - ret // Return -1. -#endif - ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_object_resolved_tlab_slow_path -.Lart_quick_alloc_object_resolved_tlab_slow_path: - SETUP_SAVE_REFS_ONLY_FRAME // Save callee saves in case of GC. - mov x1, xSELF // Pass Thread::Current. - bl artAllocObjectFromCodeResolvedTLAB // (mirror::Class*, Thread*) - RESTORE_SAVE_REFS_ONLY_FRAME - RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER -END art_quick_alloc_object_resolved_tlab - // The common code for art_quick_alloc_object_*region_tlab -.macro GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB name, entrypoint, fast_path +.macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint ENTRY \name // Fast path region tlab allocation. // x0: type, xSELF(x19): Thread::Current // x1-x7: free. -#if !defined(USE_READ_BARRIER) - mvn x0, xzr // Read barrier must be enabled here. - ret // Return -1. 
-#endif -.Ldo_allocation\name: - \fast_path .Lslow_path\name + ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED .Lslow_path\name .Lslow_path\name: SETUP_SAVE_REFS_ONLY_FRAME // Save callee saves in case of GC. mov x1, xSELF // Pass Thread::Current. @@ -1831,21 +1780,55 @@ ENTRY \name END \name .endm -GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED -GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB +GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB -// TODO: We could use this macro for the normal tlab allocator too. +.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel, xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2 + and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignemnt mask + // (addr + 7) & ~7. The mask must + // be 64 bits to keep high bits in + // case of overflow. + // Negative sized arrays are handled here since xCount holds a zero extended 32 bit value. + // Negative ints become large 64 bit unsigned ints which will always be larger than max signed + // 32 bit int. Since the max shift for arrays is 3, it can not become a negative 64 bit int. + cmp \xTemp1, #MIN_LARGE_OBJECT_THRESHOLD // Possibly a large object, go slow + bhs \slowPathLabel // path. -.macro GENERATE_ALLOC_ARRAY_REGION_TLAB name, entrypoint, size_setup + ldr \xTemp0, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Check tlab for space, note that + // we use (end - begin) to handle + // negative size arrays. It is + // assumed that a negative size will + // always be greater unsigned than + // region size. + ldr \xTemp2, [xSELF, #THREAD_LOCAL_END_OFFSET] + sub \xTemp2, \xTemp2, \xTemp0 + cmp \xTemp1, \xTemp2 + bhi \slowPathLabel + // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1. + // Move old thread_local_pos to x0 + // for the return value. + mov x0, \xTemp0 + add \xTemp0, \xTemp0, \xTemp1 + str \xTemp0, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos. + ldr \xTemp0, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects. + add \xTemp0, \xTemp0, #1 + str \xTemp0, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET] + POISON_HEAP_REF \wClass + str \wClass, [x0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer. + str \wCount, [x0, #MIRROR_ARRAY_LENGTH_OFFSET] // Store the array length. + // Fence. + dmb ishst + ret +.endm + +.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup ENTRY \name // Fast path array allocation for region tlab allocation. // x0: mirror::Class* type // x1: int32_t component_count // x2-x7: free. -#if !defined(USE_READ_BARRIER) - mvn x0, xzr // Read barrier must be enabled here. - ret // Return -1. 
-#endif mov x3, x0 \size_setup x3, w3, x1, w1, x4, w4, x5, w5, x6, w6 ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name, x3, w3, x1, w1, x4, w4, x5, w5, x6, w6 @@ -1904,17 +1887,21 @@ END \name .macro COMPUTE_ARRAY_SIZE_64 xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2 lsl \xTemp1, \xCount, #3 // Add array data offset and alignment. - // Add 4 to the size for 64 bit alignment. - add \xTemp1, \xTemp1, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK + 4) + add \xTemp1, \xTemp1, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) .endm # TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove # the entrypoint once all backends have been updated to use the size variants. -GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN -GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8 -GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16 -GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32 -GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64 /* * Called by managed code when the thread has been asked to suspend. 
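The arm and arm64 fast paths above are straight bump-pointer allocations against the thread-local buffer. As a reading aid, here is a C-level paraphrase of what ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE does; the struct, field names, and the two object-header offsets are simplified assumptions made for illustration, not the real Thread or mirror layouts.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Simplified stand-in for the thread-local allocation buffer fields
// (thread_local_pos / thread_local_end / thread_local_objects).
struct TlabState {
  uint8_t* pos;
  uint8_t* end;
  size_t objects;
};

// Returns nullptr when the request must fall back to the slow path.
// 'aligned_size' is the already 8-byte-aligned total allocation size
// produced by the COMPUTE_ARRAY_SIZE_* macros.
void* AllocArrayTlabFastPath(TlabState* self, uint32_t klass_ref,
                             int32_t length, size_t aligned_size) {
  size_t remaining = static_cast<size_t>(self->end - self->pos);
  if (aligned_size > remaining) {
    return nullptr;                       // Not enough TLAB space.
  }
  uint8_t* obj = self->pos;               // "Point of no slow path".
  self->pos += aligned_size;              // Bump thread_local_pos.
  self->objects += 1;                     // Bump thread_local_objects.
  // Class reference at offset 0 and array length at offset 8 are assumed
  // stand-ins for MIRROR_OBJECT_CLASS_OFFSET / MIRROR_ARRAY_LENGTH_OFFSET.
  std::memcpy(obj + 0, &klass_ref, sizeof(klass_ref));
  std::memcpy(obj + 8, &length, sizeof(length));
  // Publish the stores before the object becomes visible, matching the
  // dmb ish / dmb ishst fences in the assembly.
  std::atomic_thread_fence(std::memory_order_release);
  return obj;
}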
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S index 2d5eca003d..663cb6c62f 100644 --- a/runtime/arch/mips/quick_entrypoints_mips.S +++ b/runtime/arch/mips/quick_entrypoints_mips.S @@ -1578,6 +1578,7 @@ GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S index f3629d90d3..5fee575331 100644 --- a/runtime/arch/mips64/quick_entrypoints_mips64.S +++ b/runtime/arch/mips64/quick_entrypoints_mips64.S @@ -1534,6 +1534,7 @@ END \name GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S index 9204d85279..2b3525b189 100644 --- a/runtime/arch/quick_alloc_entrypoints.S +++ b/runtime/arch/quick_alloc_entrypoints.S @@ -145,7 +145,7 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_dlmalloc_instrumented, DlMa // This is to be separately defined for each architecture to allow a hand-written assembly fast path. // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_rosalloc, RosAlloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_rosalloc, RosAlloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_rosalloc, RosAlloc) diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S index 47dc34a355..76615e843b 100644 --- a/runtime/arch/x86/quick_entrypoints_x86.S +++ b/runtime/arch/x86/quick_entrypoints_x86.S @@ -947,10 +947,37 @@ MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION) END_MACRO // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS + +// Comment out allocators that have x86 specific asm. 
+// Region TLAB: +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) +// Normal TLAB: +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc). -DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc +MACRO2(ART_QUICK_ALLOC_OBJECT_ROSALLOC, c_name, cxx_name) + DEFINE_FUNCTION VAR(c_name) // Fast path rosalloc allocation. // eax: type/return value // ecx, ebx, edx: free @@ -959,14 +986,14 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc // stack has room movl THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%ebx), %ecx cmpl THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%ebx), %ecx - jae .Lart_quick_alloc_object_resolved_rosalloc_slow_path + jae .Lslow_path\c_name movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%eax), %ecx // Load the object size (ecx) // Check if the size is for a thread // local allocation. Also does the // finalizable and initialization check. cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %ecx - ja .Lart_quick_alloc_object_resolved_rosalloc_slow_path + ja .Lslow_path\c_name shrl LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %ecx // Calculate the rosalloc bracket index // from object size. // Load thread local rosalloc run (ebx) @@ -977,7 +1004,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc // Load free_list head (edi), // this will be the return value. movl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%ebx), %ecx - jecxz .Lart_quick_alloc_object_resolved_rosalloc_slow_path + jecxz .Lslow_path\c_name // Point of no slow path. Won't go to // the slow path from here on. // Load the next pointer of the head @@ -1008,7 +1035,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc // No fence needed for x86. 
movl %ecx, %eax // Move object to return register ret -.Lart_quick_alloc_object_resolved_rosalloc_slow_path: +.Lslow_path\c_name: SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx // save ref containing registers for GC // Outgoing argument set up subl LITERAL(8), %esp // alignment padding @@ -1020,10 +1047,14 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc CFI_ADJUST_CFA_OFFSET(-16) RESTORE_SAVE_REFS_ONLY_FRAME // restore frame up to return address RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception -END_FUNCTION art_quick_alloc_object_resolved_rosalloc + END_FUNCTION VAR(c_name) +END_MACRO -// The common fast path code for art_quick_alloc_object_resolved_tlab -// and art_quick_alloc_object_resolved_region_tlab. +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc + +// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab +// and art_quick_alloc_object_resolved/initialized_region_tlab. // // EAX: type/return_value MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH, slowPathLabel) @@ -1047,8 +1078,8 @@ MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH, slowPathLabel) ret // Fast path succeeded. END_MACRO -// The common slow path code for art_quick_alloc_object_resolved_tlab -// and art_quick_alloc_object_resolved_region_tlab. +// The common slow path code for art_quick_alloc_object_resolved/initialized_tlab +// and art_quick_alloc_object_resolved/initialized_region_tlab. MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH, cxx_name) POP edi SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx // save ref containing registers for GC @@ -1065,33 +1096,154 @@ MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be called -// for CC if the GC is not marking. -DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab +MACRO2(ART_QUICK_ALLOC_OBJECT_TLAB, c_name, cxx_name) + DEFINE_FUNCTION VAR(c_name) // Fast path tlab allocation. // EAX: type // EBX, ECX, EDX: free. PUSH edi - ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path -.Lart_quick_alloc_object_resolved_tlab_slow_path: - ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB -END_FUNCTION art_quick_alloc_object_resolved_tlab - -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB). -DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab - // Fast path region tlab allocation. - // EAX: type/return value - // EBX, ECX, EDX: free. -#if !defined(USE_READ_BARRIER) - int3 + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\c_name +.Lslow_path\c_name: + ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH RAW_VAR(cxx_name) + END_FUNCTION VAR(c_name) +END_MACRO + +ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB +ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB +ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB +ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB + +// The fast path code for art_quick_alloc_array_region_tlab. 
+// Inputs: EAX: the class, ECX: int32_t component_count, EDX: total_size +// Free temp: EBX +// Output: EAX: return value. +MACRO1(ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE, slowPathLabel) + mov %fs:THREAD_SELF_OFFSET, %ebx // ebx = thread + // Mask out the unaligned part to make sure we are 8 byte aligned. + andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %edx + movl THREAD_LOCAL_END_OFFSET(%ebx), %edi + subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi + cmpl %edi, %edx // Check if it fits. + ja RAW_VAR(slowPathLabel) + movl THREAD_LOCAL_POS_OFFSET(%ebx), %edi + addl %edi, %edx // Add the object size. + movl %edx, THREAD_LOCAL_POS_OFFSET(%ebx) // Update thread_local_pos_ + addl LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%ebx) // Increase thread_local_objects. + // Store the class pointer in the + // header. + // No fence needed for x86. + POISON_HEAP_REF eax + movl %eax, MIRROR_OBJECT_CLASS_OFFSET(%edi) + movl %ecx, MIRROR_ARRAY_LENGTH_OFFSET(%edi) + movl %edi, %eax + POP edi + ret // Fast path succeeded. +END_MACRO + +MACRO1(COMPUTE_ARRAY_SIZE_UNKNOWN, slow_path) + // We should never enter here. Code is provided for reference. int3 + // Possibly a large object, go slow. + // Also does negative array size check. + cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8), %ecx + ja RAW_VAR(slow_path) + PUSH ecx + movl %ecx, %edx + movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%eax), %ecx // Load component type. + UNPOISON_HEAP_REF ecx + movl MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET(%ecx), %ecx // Load primitive type. + shr MACRO_LITERAL(PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT), %ecx // Get component size shift. + sall %cl, %edx // Calculate array count shifted. + // Add array header + alignment rounding. + add MACRO_LITERAL(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK), %edx + // Add 4 extra bytes if we are doing a long array. + add MACRO_LITERAL(1), %ecx + and MACRO_LITERAL(4), %ecx +#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4 +#error Long array data offset must be 4 greater than int array data offset. #endif + addl %ecx, %edx + POP ecx +END_MACRO + +MACRO1(COMPUTE_ARRAY_SIZE_8, slow_path) + // EAX: mirror::Class* klass, ECX: int32_t component_count + // Possibly a large object, go slow. + // Also does negative array size check. + cmpl LITERAL(MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET), %ecx + ja RAW_VAR(slow_path) + // Add array header + alignment rounding. + leal (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)(%ecx), %edx +END_MACRO + +MACRO1(COMPUTE_ARRAY_SIZE_16, slow_path) + // EAX: mirror::Class* klass, ECX: int32_t component_count + // Possibly a large object, go slow. + // Also does negative array size check. + cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 2), %ecx + ja RAW_VAR(slow_path) + // Add array header + alignment rounding. + leal ((MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 2)(%ecx), %edx + sall MACRO_LITERAL(1), %edx +END_MACRO + +MACRO1(COMPUTE_ARRAY_SIZE_32, slow_path) + // EAX: mirror::Class* klass, ECX: int32_t component_count + // Possibly a large object, go slow. + // Also does negative array size check. + cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 4), %ecx + ja RAW_VAR(slow_path) + // Add array header + alignment rounding. 
+ leal ((MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 4)(%ecx), %edx + sall MACRO_LITERAL(2), %edx +END_MACRO + +MACRO1(COMPUTE_ARRAY_SIZE_64, slow_path) + // EAX: mirror::Class* klass, ECX: int32_t component_count + // Possibly a large object, go slow. + // Also does negative array size check. + cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8), %ecx + ja RAW_VAR(slow_path) + // Add array header + alignment rounding. + leal ((MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 8)(%ecx), %edx + sall MACRO_LITERAL(3), %edx +END_MACRO + +MACRO3(GENERATE_ALLOC_ARRAY_TLAB, c_entrypoint, cxx_name, size_setup) + DEFINE_FUNCTION VAR(c_entrypoint) + // EAX: mirror::Class* klass, ECX: int32_t component_count PUSH edi - ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path -.Lart_quick_alloc_object_resolved_region_tlab_slow_path: - ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedRegionTLAB -END_FUNCTION art_quick_alloc_object_resolved_region_tlab + CALL_MACRO(size_setup) .Lslow_path\c_entrypoint + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\c_entrypoint +.Lslow_path\c_entrypoint: + POP edi + SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx // save ref containing registers for GC + // Outgoing argument set up + PUSH eax // alignment padding + pushl %fs:THREAD_SELF_OFFSET // pass Thread::Current() + CFI_ADJUST_CFA_OFFSET(4) + PUSH ecx + PUSH eax + call CALLVAR(cxx_name) // cxx_name(arg0, arg1, Thread*) + addl LITERAL(16), %esp // pop arguments + CFI_ADJUST_CFA_OFFSET(-16) + RESTORE_SAVE_REFS_ONLY_FRAME // restore frame up to return address + RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception + END_FUNCTION VAR(c_entrypoint) +END_MACRO + + +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32 +GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64 DEFINE_FUNCTION art_quick_resolve_string SETUP_SAVE_EVERYTHING_FRAME ebx, ebx diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 10f9047bf3..a1ae858735 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -1006,7 +1006,8 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) // A hand-written override for 
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc). -DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc +MACRO2(ART_QUICK_ALLOC_OBJECT_ROSALLOC, c_name, cxx_name) + DEFINE_FUNCTION VAR(c_name) // Fast path rosalloc allocation. // RDI: mirror::Class*, RAX: return value // RSI, RDX, RCX, R8, R9: free. @@ -1015,14 +1016,14 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread movq THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%r8), %rcx // rcx = alloc stack top. cmpq THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%r8), %rcx - jae .Lart_quick_alloc_object_resolved_rosalloc_slow_path + jae .Lslow_path\c_name // Load the object size movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%rdi), %eax // Check if the size is for a thread // local allocation. Also does the // initialized and finalizable checks. cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %eax - ja .Lart_quick_alloc_object_resolved_rosalloc_slow_path + ja .Lslow_path\c_name // Compute the rosalloc bracket index // from the size. shrq LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %rax @@ -1036,7 +1037,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc // will be the return val. movq (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9), %rax testq %rax, %rax - jz .Lart_quick_alloc_object_resolved_rosalloc_slow_path + jz .Lslow_path\c_name // "Point of no slow path". Won't go to the slow path from here on. OK to clobber rdi and rsi. // Push the new object onto the thread // local allocation stack and @@ -1063,25 +1064,19 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc decl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)(%r9) // No fence necessary for x86. ret -.Lart_quick_alloc_object_resolved_rosalloc_slow_path: +.Lslow_path\c_name: SETUP_SAVE_REFS_ONLY_FRAME // save ref containing registers for GC // Outgoing argument set up movq %gs:THREAD_SELF_OFFSET, %rsi // pass Thread::Current() - call SYMBOL(artAllocObjectFromCodeResolvedRosAlloc) // cxx_name(arg0, Thread*) + call CALLVAR(cxx_name) // cxx_name(arg0, Thread*) RESTORE_SAVE_REFS_ONLY_FRAME // restore frame up to return address RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception -END_FUNCTION art_quick_alloc_object_rosalloc - -// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab. -// -// RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value. -// RCX: scratch, r8: Thread::Current(). -MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel) - testl %edx, %edx // Check null class - jz RAW_VAR(slowPathLabel) - ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH(RAW_VAR(slowPathLabel)) + END_FUNCTION VAR(c_name) END_MACRO +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc +ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc + // The common fast path code for art_quick_alloc_object_resolved_region_tlab. // TODO: delete ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH since it is the same as // ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH. @@ -1220,12 +1215,7 @@ MACRO0(COMPUTE_ARRAY_SIZE_64) movq %rsi, %r9 salq MACRO_LITERAL(3), %r9 // Add array header + alignment rounding. - // Add 4 extra bytes for array data alignment - addq MACRO_LITERAL(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK + 4), %r9 -END_MACRO - -// The slow path code for art_quick_alloc_array_*tlab. 
-MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name) + addq MACRO_LITERAL(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK), %r9 END_MACRO MACRO3(GENERATE_ALLOC_ARRAY_TLAB, c_entrypoint, cxx_name, size_setup)
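For completeness, the COMPUTE_ARRAY_SIZE_{8,16,32,64} macros introduced for arm, x86, and x86_64 all compute the same quantity: (component_count << shift) + array data offset + alignment slack, with over-large counts (and negative counts, which read as huge unsigned values) diverted to the slow path. A hedged, self-contained sketch follows; the numeric constants are illustrative assumptions standing in for MIRROR_*_ARRAY_DATA_OFFSET, OBJECT_ALIGNMENT_MASK, and MIN_LARGE_OBJECT_THRESHOLD, not verified ART values.

#include <cstddef>
#include <cstdint>

// Assumed stand-ins for the real ART constants used by the macros above.
constexpr size_t kObjectAlignmentMask     = 7;          // 8-byte alignment
constexpr size_t kIntArrayDataOffset      = 12;         // header for <= 4-byte elements (assumed)
constexpr size_t kWideArrayDataOffset     = 16;         // header for 8-byte elements (assumed)
constexpr size_t kMinLargeObjectThreshold = 12 * 1024;  // large-object cutoff (assumed)

// Returns 0 when the request must take the slow path (too large, or a
// negative count that shows up as a huge unsigned value).
size_t ComputeArrayAllocSize(int32_t component_count, unsigned component_shift) {
  const size_t data_offset =
      (component_shift == 3) ? kWideArrayDataOffset : kIntArrayDataOffset;
  const uint32_t count = static_cast<uint32_t>(component_count);
  if (count > ((kMinLargeObjectThreshold - data_offset) >> component_shift)) {
    return 0;
  }
  size_t size = (static_cast<size_t>(count) << component_shift)
                + data_offset + kObjectAlignmentMask;
  // The fast path then strips the low bits, i.e. (size + 7) & ~7 overall,
  // mirroring the OBJECT_ALIGNMENT_MASK_TOGGLED masking in the assembly.
  return size & ~kObjectAlignmentMask;
}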