author     2017-06-27 14:41:39 +0200
committer  2017-07-13 10:17:07 +0200
commit     854df416f12c48b52239fe163ab8a7fcac4cddd3
tree       f5cf247f1e71a5242c797b8fab99ded21839267d
parent     e63a91111d13f33028c2988ded53a4659140ca2e
MIPS: TLAB allocation entrypoints
Add fast paths for TLAB allocation entrypoints for MIPS32 and MIPS64.
Also improve rosalloc entrypoints.
Note: All tests are executed on CI20 (MIPS32R2) and in QEMU (MIPS32R6
and MIPS64R6), with and without ART_TEST_DEBUG_GC=true.
Test: ./testrunner.py --optimizing --target
Test: mma test-art-target-gtest
Test: mma test-art-host-gtest
Change-Id: I92195d2d318b26a19afc5ac46a1844b13b2d5191
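
Editor's note: the hand-written fast paths in this patch all implement the same thread-local bump-pointer scheme. As a rough orientation, the logic the assembly encodes corresponds to the following C++ sketch (types, field names, and the nullptr-as-slow-path protocol are illustrative stand-ins, not ART's actual API):

    #include <cstddef>
    #include <cstdint>

    struct Class;                      // stand-in for mirror::Class
    struct Object { Class* klass; };   // class pointer at object offset 0

    struct Thread {
      uint8_t* tlab_pos;     // cf. THREAD_LOCAL_POS_OFFSET
      uint8_t* tlab_end;     // cf. THREAD_LOCAL_END_OFFSET
      size_t   tlab_objects; // cf. THREAD_LOCAL_OBJECTS_OFFSET
    };

    // Returns nullptr to request the slow path (a runtime call that may
    // refill the TLAB or run GC).
    inline Object* AllocObjectTlabFastPath(Thread* self, Class* klass,
                                           size_t object_size) {
      size_t remaining = static_cast<size_t>(self->tlab_end - self->tlab_pos);
      // For a not-yet-initialized class, the size field reads as a huge value,
      // so this single unsigned comparison also routes such classes slow.
      if (object_size > remaining) {
        return nullptr;
      }
      Object* obj = reinterpret_cast<Object*>(self->tlab_pos);
      self->tlab_pos += object_size;   // Bump the allocation pointer.
      self->tlab_objects++;
      obj->klass = klass;              // Publish the class pointer.
      // The assembly issues a `sync` fence here only when the class might not
      // be initialized (isInitialized == 0); otherwise the compiler's own
      // StoreStore barrier protects publication of the new object.
      return obj;
    }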
-rw-r--r--  compiler/optimizing/code_generator_mips.cc      |   5
-rw-r--r--  compiler/optimizing/code_generator_mips64.cc    |   5
-rw-r--r--  runtime/arch/mips/quick_entrypoints_mips.S      | 238
-rw-r--r--  runtime/arch/mips64/quick_entrypoints_mips64.S  | 225
-rw-r--r--  runtime/arch/quick_alloc_entrypoints.S          |   5

5 files changed, 452 insertions(+), 26 deletions(-)
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index be8f9e9cf8..23d188d630 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -7859,8 +7859,11 @@ void LocationsBuilderMIPS::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
+  DCHECK(!codegen_->IsLeafMethod());
 }
 
 void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) {
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index cf6b3d5805..454a2ddc14 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -5578,8 +5578,11 @@ void LocationsBuilderMIPS64::VisitNewArray(HNewArray* instruction) {
 void InstructionCodeGeneratorMIPS64::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
+  DCHECK(!codegen_->IsLeafMethod());
 }
 
 void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) {
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index a5a65e6843..00e3d67207 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1662,13 +1662,37 @@ END \name
 .endm
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+// Comment out allocators that have mips specific asm.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
 
 // A hand-written override for:
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
-.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name
-ENTRY \c_name
+.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name, isInitialized
+ENTRY_NO_GP \c_name
     # Fast path rosalloc allocation
     # a0: type
     # s1: Thread::Current
@@ -1688,6 +1712,11 @@ ENTRY \c_name
     li    $t5, ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE  # Check if size is for a thread local
                                                        # allocation. Also does the
                                                        # initialized and finalizable checks.
+    # When isInitialized == 0, then the class is potentially not yet initialized.
+    # If the class is not yet initialized, the object size will be very large to force the branch
+    # below to be taken.
+    #
+    # See InitializeClassVisitors in class-inl.h for more details.
     bgtu  $t1, $t5, .Lslow_path_\c_name
 
     # Compute the rosalloc bracket index from the size. Since the size is already aligned we can
@@ -1728,12 +1757,19 @@ ENTRY \c_name
     addiu $t5, $t5, -1
     sw    $t5, (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)($t2)
 
+.if \isInitialized == 0
+    # This barrier is only necessary when the allocation also requires a class initialization check.
+    #
+    # If the class is already observably initialized, then new-instance allocations are protected
+    # from publishing by the compiler which inserts its own StoreStore barrier.
     sync                                               # Fence.
-
+.endif
     jalr  $zero, $ra
     nop
 
 .Lslow_path_\c_name:
+    addiu $t9, $t9, (.Lslow_path_\c_name - \c_name) + 4
+    .cpload $t9
     SETUP_SAVE_REFS_ONLY_FRAME
     la    $t9, \cxx_name
     jalr  $t9
@@ -1742,11 +1778,197 @@ ENTRY \c_name
 END \c_name
 .endm
 
-ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
-ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc, /* isInitialized */ 0
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc, /* isInitialized */ 1
+
+// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
+//
+// a0: type, s1(rSELF): Thread::Current.
+// Need to preserve a0 to the slow path.
+//
+// If isInitialized=1 then the compiler assumes the object's class has already been initialized.
+// If isInitialized=0 the compiler can only assume it's been at least resolved.
+.macro ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH slowPathLabel isInitialized
+    lw    $v0, THREAD_LOCAL_POS_OFFSET(rSELF)          # Load thread_local_pos.
+    lw    $a2, THREAD_LOCAL_END_OFFSET(rSELF)          # Load thread_local_end.
+    subu  $a3, $a2, $v0                                # Compute the remaining buffer size.
+    lw    $t0, MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET($a0)  # Load the object size.
+
+    # When isInitialized == 0, then the class is potentially not yet initialized.
+    # If the class is not yet initialized, the object size will be very large to force the branch
+    # below to be taken.
+    #
+    # See InitializeClassVisitors in class-inl.h for more details.
+    bgtu  $t0, $a3, \slowPathLabel                     # Check if it fits.
+    addu  $t1, $v0, $t0                                # Add object size to tlab pos (in branch
+                                                       # delay slot).
+    # "Point of no slow path". Won't go to the slow path from here on.
+    sw    $t1, THREAD_LOCAL_POS_OFFSET(rSELF)          # Store new thread_local_pos.
+    lw    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)      # Increment thread_local_objects.
+    addiu $a2, $a2, 1
+    sw    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)
+    POISON_HEAP_REF $a0
+    sw    $a0, MIRROR_OBJECT_CLASS_OFFSET($v0)         # Store the class pointer.
+
+.if \isInitialized == 0
+    # This barrier is only necessary when the allocation also requires a class initialization check.
+    #
+    # If the class is already observably initialized, then new-instance allocations are protected
+    # from publishing by the compiler which inserts its own StoreStore barrier.
+    sync                                               # Fence.
+.endif
+    jalr  $zero, $ra
+    nop
+.endm
+
+// The common code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
+.macro GENERATE_ALLOC_OBJECT_TLAB name, entrypoint, isInitialized
+ENTRY_NO_GP \name
+    # Fast path tlab allocation.
+    # a0: type, s1(rSELF): Thread::Current.
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path_\name, \isInitialized
+.Lslow_path_\name:
+    addiu $t9, $t9, (.Lslow_path_\name - \name) + 4
+    .cpload $t9
+    SETUP_SAVE_REFS_ONLY_FRAME                         # Save callee saves in case of GC.
+    la    $t9, \entrypoint
+    jalr  $t9                                          # (mirror::Class*, Thread*)
+    move  $a1, rSELF                                   # Pass Thread::Current.
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END \name
+.endm
+
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, /* isInitialized */ 0
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, /* isInitialized */ 1
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB, /* isInitialized */ 0
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB, /* isInitialized */ 1
+
+// The common fast path code for art_quick_alloc_array_resolved/initialized_tlab
+// and art_quick_alloc_array_resolved/initialized_region_tlab.
+//
+// a0: type, a1: component_count, a2: total_size, s1(rSELF): Thread::Current.
+// Need to preserve a0 and a1 to the slow path.
+.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
+    li    $a3, OBJECT_ALIGNMENT_MASK_TOGGLED           # Apply alignment mask
+    and   $a2, $a2, $a3                                # (addr + 7) & ~7.
+
+    lw    $v0, THREAD_LOCAL_POS_OFFSET(rSELF)          # Load thread_local_pos.
+    lw    $t1, THREAD_LOCAL_END_OFFSET(rSELF)          # Load thread_local_end.
+    subu  $t2, $t1, $v0                                # Compute the remaining buffer size.
+    bgtu  $a2, $t2, \slowPathLabel                     # Check if it fits.
+    addu  $a2, $v0, $a2                                # Add object size to tlab pos (in branch
+                                                       # delay slot).
+
+    # "Point of no slow path". Won't go to the slow path from here on.
+    sw    $a2, THREAD_LOCAL_POS_OFFSET(rSELF)          # Store new thread_local_pos.
+    lw    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)      # Increment thread_local_objects.
+    addiu $a2, $a2, 1
+    sw    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)
+    POISON_HEAP_REF $a0
+    sw    $a0, MIRROR_OBJECT_CLASS_OFFSET($v0)         # Store the class pointer.
+    jalr  $zero, $ra
+    sw    $a1, MIRROR_ARRAY_LENGTH_OFFSET($v0)         # Store the array length.
+.endm
+
+.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup
+ENTRY_NO_GP \name
+    # Fast path array allocation for region tlab allocation.
+    # a0: mirror::Class* type
+    # a1: int32_t component_count
+    # s1(rSELF): Thread::Current
+    \size_setup .Lslow_path_\name
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path_\name
+.Lslow_path_\name:
+    # a0: mirror::Class* type
+    # a1: int32_t component_count
+    # a2: Thread* self
+    addiu $t9, $t9, (.Lslow_path_\name - \name) + 4
+    .cpload $t9
+    SETUP_SAVE_REFS_ONLY_FRAME                         # Save callee saves in case of GC.
+    la    $t9, \entrypoint
+    jalr  $t9
+    move  $a2, rSELF                                   # Pass Thread::Current.
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END \name
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_UNKNOWN slow_path
+    break                                              # We should never enter here.
+                                                       # Code below is for reference.
+                                                       # Possibly a large object, go slow.
+                                                       # Also does negative array size check.
+    li    $a2, ((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8)
+    bgtu  $a1, $a2, \slow_path
+                                                       # Array classes are never finalizable
+                                                       # or uninitialized, no need to check.
+    lw    $a3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET($a0) # Load component type.
+    UNPOISON_HEAP_REF $a3
+    lw    $a3, MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET($a3)
+    srl   $a3, $a3, PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT    # Component size shift is in high 16 bits.
+    sllv  $a2, $a1, $a3                                # Calculate data size.
+                                                       # Add array data offset and alignment.
+    addiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4
+#error Long array data offset must be 4 greater than int array data offset.
+#endif
+
+    addiu $a3, $a3, 1                                  # Add 4 to the length only if the component
+    andi  $a3, $a3, 4                                  # size shift is 3 (for 64 bit alignment).
+    addu  $a2, $a2, $a3
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_8 slow_path
+    # Possibly a large object, go slow.
+    # Also does negative array size check.
+    li    $a2, (MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET)
+    bgtu  $a1, $a2, \slow_path
+    # Add array data offset and alignment (in branch delay slot).
+    addiu $a2, $a1, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_16 slow_path
+    # Possibly a large object, go slow.
+    # Also does negative array size check.
+    li    $a2, ((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 2)
+    bgtu  $a1, $a2, \slow_path
+    sll   $a2, $a1, 1
+    # Add array data offset and alignment.
+    addiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_32 slow_path
+    # Possibly a large object, go slow.
+    # Also does negative array size check.
+    li    $a2, ((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 4)
+    bgtu  $a1, $a2, \slow_path
+    sll   $a2, $a1, 2
+    # Add array data offset and alignment.
+    addiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_64 slow_path
+    # Possibly a large object, go slow.
+    # Also does negative array size check.
+    li    $a2, ((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_LONG_ARRAY_DATA_OFFSET) / 8)
+    bgtu  $a1, $a2, \slow_path
+    sll   $a2, $a1, 3
+    # Add array data offset and alignment.
+    addiu $a2, $a2, (MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
 
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64
 
 // Macro for string and type resolution and initialization.
 // $a0 is both input and output.
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 10074fd43b..d427fe320b 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1611,13 +1611,37 @@ END \name
 .endm
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+// Comment out allocators that have mips64 specific asm.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
 
 // A hand-written override for:
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
-.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name
-ENTRY \c_name
+.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name, isInitialized
+ENTRY_NO_GP \c_name
    # Fast path rosalloc allocation
    # a0: type
    # s1: Thread::Current
@@ -1637,6 +1661,11 @@ ENTRY \c_name
    li    $a5, ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE   # Check if size is for a thread local
                                                        # allocation. Also does the initialized
                                                        # and finalizable checks.
+   # When isInitialized == 0, then the class is potentially not yet initialized.
+   # If the class is not yet initialized, the object size will be very large to force the branch
+   # below to be taken.
+   #
+   # See InitializeClassVisitors in class-inl.h for more details.
    bltuc $a5, $t1, .Lslow_path_\c_name
 
    # Compute the rosalloc bracket index from the size. Since the size is already aligned we can
@@ -1667,7 +1696,7 @@ ENTRY \c_name
 
    # Push the new object onto the thread local allocation stack and increment the thread local
    # allocation stack top.
-   sd    $v0, 0($t3)
+   sw    $v0, 0($t3)
    daddiu $t3, $t3, COMPRESSED_REFERENCE_SIZE
    sd    $t3, THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET($s1)
@@ -1676,12 +1705,17 @@ ENTRY \c_name
    addiu $a5, $a5, -1
    sw    $a5, (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)($t2)
 
+.if \isInitialized == 0
+   # This barrier is only necessary when the allocation also requires a class initialization check.
+   #
+   # If the class is already observably initialized, then new-instance allocations are protected
+   # from publishing by the compiler which inserts its own StoreStore barrier.
    sync                                                # Fence.
-
-   jalr  $zero, $ra
-   .cpreturn                                           # Restore gp from t8 in branch delay slot.
+.endif
+   jic   $ra, 0
 
 .Lslow_path_\c_name:
+   SETUP_GP
    SETUP_SAVE_REFS_ONLY_FRAME
    jal   \cxx_name
    move  $a1, $s1                                      # Pass self as argument.
@@ -1689,11 +1723,180 @@ ENTRY \c_name
 END \c_name
 .endm
 
-ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
-ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc, /* isInitialized */ 0
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc, /* isInitialized */ 1
+
+// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
+//
+// a0: type, s1(rSELF): Thread::Current
+// Need to preserve a0 to the slow path.
+//
+// If isInitialized=1 then the compiler assumes the object's class has already been initialized.
+// If isInitialized=0 the compiler can only assume it's been at least resolved.
+.macro ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH slowPathLabel isInitialized
+    ld    $v0, THREAD_LOCAL_POS_OFFSET(rSELF)          # Load thread_local_pos.
+    ld    $a2, THREAD_LOCAL_END_OFFSET(rSELF)          # Load thread_local_end.
+    lwu   $t0, MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET($a0)  # Load the object size.
+    daddu $a3, $v0, $t0                                # Add object size to tlab pos.
+
+    # When isInitialized == 0, then the class is potentially not yet initialized.
+    # If the class is not yet initialized, the object size will be very large to force the branch
+    # below to be taken.
+    #
+    # See InitializeClassVisitors in class-inl.h for more details.
+    bltuc $a2, $a3, \slowPathLabel                     # Check if it fits, overflow works since the
+                                                       # tlab pos and end are 32 bit values.
+    # "Point of no slow path". Won't go to the slow path from here on.
+    sd    $a3, THREAD_LOCAL_POS_OFFSET(rSELF)          # Store new thread_local_pos.
+    ld    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)      # Increment thread_local_objects.
+    daddiu $a2, $a2, 1
+    sd    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)
+    POISON_HEAP_REF $a0
+    sw    $a0, MIRROR_OBJECT_CLASS_OFFSET($v0)         # Store the class pointer.
+
+.if \isInitialized == 0
+    # This barrier is only necessary when the allocation also requires a class initialization check.
+    #
+    # If the class is already observably initialized, then new-instance allocations are protected
+    # from publishing by the compiler which inserts its own StoreStore barrier.
+    sync                                               # Fence.
+.endif
+    jic   $ra, 0
+.endm
+
+// The common code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
+.macro GENERATE_ALLOC_OBJECT_TLAB name, entrypoint, isInitialized
+ENTRY_NO_GP \name
+    # Fast path tlab allocation.
+    # a0: type, s1(rSELF): Thread::Current.
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path_\name, \isInitialized
+.Lslow_path_\name:
+    SETUP_GP
+    SETUP_SAVE_REFS_ONLY_FRAME                         # Save callee saves in case of GC.
+    jal   \entrypoint                                  # (mirror::Class*, Thread*)
+    move  $a1, rSELF                                   # Pass Thread::Current.
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END \name
+.endm
+
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, /* isInitialized */ 0
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, /* isInitialized */ 1
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB, /* isInitialized */ 0
+GENERATE_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB, /* isInitialized */ 1
+
+// The common fast path code for art_quick_alloc_array_resolved/initialized_tlab
+// and art_quick_alloc_array_resolved/initialized_region_tlab.
+//
+// a0: type, a1: component_count, a2: total_size, s1(rSELF): Thread::Current.
+// Need to preserve a0 and a1 to the slow path.
+.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
+    dli   $a3, OBJECT_ALIGNMENT_MASK_TOGGLED64         # Apply alignment mask (addr + 7) & ~7.
+    and   $a2, $a2, $a3                                # The mask must be 64 bits to keep high
+                                                       # bits in case of overflow.
+    # Negative sized arrays are handled here since a1 holds a zero extended 32 bit value.
+    # Negative ints become large 64 bit unsigned ints which will always be larger than max signed
+    # 32 bit int. Since the max shift for arrays is 3, it can not become a negative 64 bit int.
+    dli   $a3, MIN_LARGE_OBJECT_THRESHOLD
+    bgeuc $a2, $a3, \slowPathLabel                     # Possibly a large object, go slow path.
+
+    ld    $v0, THREAD_LOCAL_POS_OFFSET(rSELF)          # Load thread_local_pos.
+    ld    $t1, THREAD_LOCAL_END_OFFSET(rSELF)          # Load thread_local_end.
+    dsubu $t2, $t1, $v0                                # Compute the remaining buffer size.
+    bltuc $t2, $a2, \slowPathLabel                     # Check tlab for space, note that we use
+                                                       # (end - begin) to handle negative size
+                                                       # arrays. It is assumed that a negative size
+                                                       # will always be greater unsigned than region
+                                                       # size.
+
+    # "Point of no slow path". Won't go to the slow path from here on.
+    daddu $a2, $v0, $a2                                # Add object size to tlab pos.
+    sd    $a2, THREAD_LOCAL_POS_OFFSET(rSELF)          # Store new thread_local_pos.
+    ld    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)      # Increment thread_local_objects.
+    daddiu $a2, $a2, 1
+    sd    $a2, THREAD_LOCAL_OBJECTS_OFFSET(rSELF)
+    POISON_HEAP_REF $a0
+    sw    $a0, MIRROR_OBJECT_CLASS_OFFSET($v0)         # Store the class pointer.
+    sw    $a1, MIRROR_ARRAY_LENGTH_OFFSET($v0)         # Store the array length.
+
+    jic   $ra, 0
+.endm
+
+.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup
+ENTRY_NO_GP \name
+    # Fast path array allocation for region tlab allocation.
+    # a0: mirror::Class* type
+    # a1: int32_t component_count
+    # s1(rSELF): Thread::Current
+    dext  $a4, $a1, 0, 32                              # Create zero-extended component_count. Value
+                                                       # in a1 is preserved in a case of slow path.
+    \size_setup .Lslow_path_\name
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path_\name
+.Lslow_path_\name:
+    # a0: mirror::Class* type
+    # a1: int32_t component_count
+    # a2: Thread* self
+    SETUP_GP
+    SETUP_SAVE_REFS_ONLY_FRAME                         # Save callee saves in case of GC.
+    jal   \entrypoint
+    move  $a2, rSELF                                   # Pass Thread::Current.
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END \name
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_UNKNOWN slow_path
+    # Array classes are never finalizable or uninitialized, no need to check.
+    lwu   $a3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET($a0) # Load component type.
+    UNPOISON_HEAP_REF $a3
+    lw    $a3, MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET($a3)
+    dsrl  $a3, $a3, PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT    # Component size shift is in high 16 bits.
+    dsllv $a2, $a4, $a3                                # Calculate data size.
+                                                       # Add array data offset and alignment.
+    daddiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4
+#error Long array data offset must be 4 greater than int array data offset.
+#endif
+
+    daddiu $a3, $a3, 1                                 # Add 4 to the length only if the component
+    andi  $a3, $a3, 4                                  # size shift is 3 (for 64 bit alignment).
+    daddu $a2, $a2, $a3
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_8 slow_path
+    # Add array data offset and alignment.
+    daddiu $a2, $a4, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_16 slow_path
+    dsll  $a2, $a4, 1
+    # Add array data offset and alignment.
+    daddiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_32 slow_path
+    dsll  $a2, $a4, 2
+    # Add array data offset and alignment.
+    daddiu $a2, $a2, (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_64 slow_path
+    dsll  $a2, $a4, 3
+    # Add array data offset and alignment.
+    daddiu $a2, $a2, (MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
 
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64
 
 // Macro for string and type resolution and initialization.
 // $a0 is both input and output.
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index fbfa7564a7..c091b0e0d5 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -78,11 +78,6 @@ GENERATE_ALLOC_ENTRYPOINTS _region_tlab_instrumented, RegionTLABInstrumented
 #define GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(c_suffix, cxx_suffix) \
   TWO_ARG_DOWNCALL art_quick_alloc_array_resolved64 ## c_suffix, artAllocArrayFromCodeResolved ## cxx_suffix, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 
-.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
-GENERATE_ALLOC_ENTRYPOINTS_FOR_REGION_TLAB_ALLOCATOR
-.endm
-
 .macro GENERATE_ALLOC_ENTRYPOINTS_FOR_REGION_TLAB_ALLOCATOR
 // This is to be separately defined for each architecture to allow a hand-written assembly fast path.
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
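
Editor's note: the COMPUTE_ARRAY_SIZE_* helpers above turn a component count into a rounded-up total allocation size (data size shifted by the component-size shift, plus the array data offset, plus the alignment mask, then masked down). A small worked C++ example of the 32-bit-component case, with illustrative constants (the real values come from ART's generated asm offsets, not from this sketch):

    #include <cstdio>

    int main() {
      // Illustrative layout: data offset 12 (8-byte header + 4-byte length),
      // 8-byte object alignment, standing in for MIRROR_INT_ARRAY_DATA_OFFSET
      // and OBJECT_ALIGNMENT_MASK.
      const unsigned kIntArrayDataOffset = 12;
      const unsigned kAlignMask = 7;
      unsigned component_count = 10;                                   // new int[10]
      unsigned data_size = component_count << 2;                       // sll $a2, $a1, 2   -> 40
      unsigned biased = data_size + kIntArrayDataOffset + kAlignMask;  // addiu             -> 59
      unsigned total = biased & ~kAlignMask;                           // and, mask toggled -> 56
      printf("total = %u bytes\n", total);  // 12 + 40 = 52, rounded up to 56
      return 0;
    }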