 -rw-r--r--  runtime/arch/quick_alloc_entrypoints.S                      |  35
 -rw-r--r--  runtime/arch/x86/quick_entrypoints_x86.S                    |   7
 -rw-r--r--  runtime/arch/x86_64/quick_entrypoints_x86_64.S              | 150
 -rw-r--r--  runtime/entrypoints/quick/quick_alloc_entrypoints.cc        |   9
 -rw-r--r--  runtime/entrypoints/quick/quick_alloc_entrypoints.h         |   4
 -rw-r--r--  runtime/entrypoints/quick/quick_default_init_entrypoints.h  |   2
 -rw-r--r--  runtime/gc/heap-inl.h                                       | 124
 -rw-r--r--  runtime/gc/heap.cc                                          |  69
 -rw-r--r--  runtime/gc/heap.h                                           |  17
 -rw-r--r--  runtime/instrumentation.cc                                  |   2
 -rw-r--r--  runtime/thread.cc                                           |  14
 -rw-r--r--  runtime/thread.h                                            |   2
 12 files changed, 266 insertions, 169 deletions
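The overall shape of the change: with the concurrent copying (CC) collector, the allocation entrypoints are now switched per thread depending on whether the GC is marking. While marking is active the RegionTLAB entrypoints (which carry read-barrier checks on the loaded class) stay installed; when marking is off, the plain TLAB entrypoints are installed instead, so the common allocation fast path does no read-barrier work. The sketch below is illustrative only, not ART code: QuickEntryPoints is reduced to a single function pointer and the two entrypoints are stubbed, but the is_marking branch mirrors the one ResetQuickAllocEntryPoints() gains in quick_alloc_entrypoints.cc.

// Minimal sketch of the entrypoint selection added by this patch; the struct and
// the two stub functions below are stand-ins, not the real ART declarations.
#include <iostream>

namespace sketch {

struct QuickEntryPoints {
  void (*pAllocObject)();  // the real table holds many allocation entrypoints
};

void AllocObjectRegionTLAB() { std::cout << "region TLAB path (read-barrier checks)\n"; }
void AllocObjectTLAB()       { std::cout << "plain TLAB path (no read-barrier work)\n"; }

// With CC, the RegionTLAB entrypoints are only needed while the GC is marking;
// outside of marking the cheaper TLAB entrypoints can serve the region space too.
void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) {
  qpoints->pAllocObject = is_marking ? &AllocObjectRegionTLAB : &AllocObjectTLAB;
}

}  // namespace sketch

int main() {
  sketch::QuickEntryPoints qpoints{};
  sketch::ResetQuickAllocEntryPoints(&qpoints, /*is_marking=*/false);
  qpoints.pAllocObject();  // prints the plain TLAB path
  return 0;
}

In the actual patch the real function also keys off entry_points_allocator and entry_points_instrumented; only the kAllocatorTypeRegionTLAB case grows the is_marking check.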
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S index fa86bf4087..db2fdcabea 100644 --- a/runtime/arch/quick_alloc_entrypoints.S +++ b/runtime/arch/quick_alloc_entrypoints.S @@ -107,7 +107,28 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) .endm +.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR +// This is to be separately defined for each architecture to allow a hand-written assembly fast path. +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) +.endm + .macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR +.endm + +.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_dlmalloc, DlMalloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_dlmalloc, DlMalloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_dlmalloc, DlMalloc) @@ -187,20 +208,6 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_bump_pointer_instrumented, B GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_bump_pointer_instrumented, BumpPointerInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_bump_pointer_instrumented, BumpPointerInstrumented) -// This is to be separately defined for each architecture to allow a hand-written assembly fast path. 
-// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) - GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab_instrumented, TLABInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab_instrumented, TLABInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab_instrumented, TLABInstrumented) diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S index fb405fac0f..6fbc9547e6 100644 --- a/runtime/arch/x86/quick_entrypoints_x86.S +++ b/runtime/arch/x86/quick_entrypoints_x86.S @@ -1085,15 +1085,12 @@ MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be called +// for CC if the GC is not marking. DEFINE_FUNCTION art_quick_alloc_object_tlab // Fast path tlab allocation. // EAX: uint32_t type_idx/return value, ECX: ArtMethod*. // EBX, EDX: free. -#if defined(USE_READ_BARRIER) - int3 - int3 -#endif PUSH esi PUSH edi movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 860b77efe3..f8066e45fb 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -18,6 +18,13 @@ #include "arch/quick_alloc_entrypoints.S" +MACRO0(ASSERT_USE_READ_BARRIER) +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif +END_MACRO + MACRO0(SETUP_FP_CALLEE_SAVE_FRAME) // Create space for ART FP callee-saved registers subq MACRO_LITERAL(4 * 8), %rsp @@ -972,8 +979,10 @@ MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION) END_MACRO // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS + // Comment out allocators that have x86_64 specific asm. 
+// Region TLAB: // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB) // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) @@ -986,6 +995,19 @@ GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_region_tlab, GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) +// Normal TLAB: +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc). DEFINE_FUNCTION art_quick_alloc_object_rosalloc @@ -1162,16 +1184,11 @@ MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be +// called with CC if the GC is not active. DEFINE_FUNCTION art_quick_alloc_object_tlab - // Fast path tlab allocation. // RDI: uint32_t type_idx, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if defined(USE_READ_BARRIER) - int3 - int3 -#endif - // Might need a special macro since rsi and edx is 32b/64b mismatched. movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array // Might need to break down into multiple instructions to get the base address in a register. // Load the class @@ -1181,29 +1198,69 @@ DEFINE_FUNCTION art_quick_alloc_object_tlab ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB END_FUNCTION art_quick_alloc_object_tlab +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be +// called with CC if the GC is not active. +DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. + movq %rdi, %rdx + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path +.Lart_quick_alloc_object_resolved_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB +END_FUNCTION art_quick_alloc_object_resolved_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB). +// May be called with CC if the GC is not active. +DEFINE_FUNCTION art_quick_alloc_object_initialized_tlab + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. 
+ movq %rdi, %rdx + ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_tlab_slow_path +.Lart_quick_alloc_object_initialized_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedTLAB +END_FUNCTION art_quick_alloc_object_initialized_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB). +DEFINE_FUNCTION art_quick_alloc_array_tlab + // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: klass, R8, R9: free. RAX: return val. + movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array + movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class + testl %ecx, %ecx + jz .Lart_quick_alloc_array_tlab_slow_path + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_tlab_slow_path +.Lart_quick_alloc_array_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeTLAB +END_FUNCTION art_quick_alloc_array_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB). +DEFINE_FUNCTION art_quick_alloc_array_resolved_tlab + // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: mirror::Class* klass, R8, R9: free. RAX: return val. + movq %rdi, %rcx + // Already resolved, no null check. + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_tlab_slow_path +.Lart_quick_alloc_array_resolved_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedTLAB +END_FUNCTION art_quick_alloc_array_resolved_tlab + // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB). DEFINE_FUNCTION art_quick_alloc_array_region_tlab // Fast path region tlab allocation. // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* // RCX: klass, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class // Null check so that we can load the lock word. testl %ecx, %ecx jz .Lart_quick_alloc_array_region_tlab_slow_path - - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking + // Since we have allocation entrypoint switching, we know the GC is marking. + // Check the mark bit, if it is 0, do the read barrier mark. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) + jz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_region_tlab_slow_path -.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 return. - testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) - jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark the class. PUSH rdi @@ -1226,33 +1283,11 @@ DEFINE_FUNCTION art_quick_alloc_array_resolved_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod* // RCX: mirror::Class* klass, R8, R9: free. RAX: return val. 
-#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq %rdi, %rcx + // Caller is responsible for read barrier. // Already resolved, no null check. - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_region_tlab_slow_path -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 return. - testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) - jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path: - // The read barrier slow path. Mark the class. - PUSH rdi - PUSH rsi - PUSH rdx - // Outgoing argument set up - movq %rcx, %rdi // Pass the class as the first param. - call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) - movq %rax, %rcx - POP rdx - POP rsi - POP rdi - jmp .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_array_resolved_region_tlab_slow_path: ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedRegionTLAB END_FUNCTION art_quick_alloc_array_resolved_region_tlab @@ -1262,24 +1297,19 @@ DEFINE_FUNCTION art_quick_alloc_object_region_tlab // Fast path region tlab allocation. // RDI: uint32_t type_idx, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx // Load the class // Null check so that we can load the lock word. testl %edx, %edx jz .Lart_quick_alloc_object_region_tlab_slow_path - // Test if the GC is marking. - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking -.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: - ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path -.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 avoid the read barrier. + // Since we have allocation entrypoint switching, we know the GC is marking. + // Check the mark bit, if it is 0, do the read barrier mark. testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) - jnz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit + jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: + // Use resolved one since we already did the null check. + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark the class. PUSH rdi @@ -1302,10 +1332,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER // No read barrier since the caller is responsible for that. 
movq %rdi, %rdx ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path @@ -1318,10 +1345,7 @@ DEFINE_FUNCTION art_quick_alloc_object_initialized_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq %rdi, %rdx // No read barrier since the caller is responsible for that. ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc index 397655a895..d2fee9a766 100644 --- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc @@ -292,7 +292,7 @@ void SetQuickAllocEntryPointsInstrumented(bool instrumented) { entry_points_instrumented = instrumented; } -void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { +void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) { #if !defined(__APPLE__) || !defined(__LP64__) switch (entry_points_allocator) { case gc::kAllocatorTypeDlMalloc: { @@ -320,7 +320,12 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { } case gc::kAllocatorTypeRegionTLAB: { CHECK(kMovingCollector); - SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented); + if (is_marking) { + SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented); + } else { + // Not marking means we need no read barriers and can just use the normal TLAB case. + SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented); + } return; } default: diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.h b/runtime/entrypoints/quick/quick_alloc_entrypoints.h index 14a8e0428b..bd1e295e48 100644 --- a/runtime/entrypoints/quick/quick_alloc_entrypoints.h +++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.h @@ -23,7 +23,9 @@ namespace art { -void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints); +// is_marking is only used for CC, if the GC is marking the allocation entrypoint is the marking +// one. +void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking); // Runtime shutdown lock is necessary to prevent races in thread initialization. When the thread is // starting it doesn't hold the mutator lock until after it has been added to the thread list. 
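The gc/heap changes that follow move the TLAB refill logic out of the inlined Heap::TryToAllocate() fast path and into a new out-of-line Heap::AllocWithNewTLAB(), letting kAllocatorTypeTLAB and kAllocatorTypeRegionTLAB share one switch case (which is what allows the plain TLAB entrypoints to back the region space when marking is off) and turning the kGrow template parameter into a runtime bool. The following is only a rough control-flow sketch under stubbed types: the use_region_space flag and the helper functions are invented for illustration, and the bytes_allocated / usable_size / bytes_tl_bulk_allocated out-parameters are dropped.

#include <cstddef>

namespace sketch {

// Both constants are assumptions for illustration, not the real ART values.
constexpr std::size_t kDefaultTLABSize = 32 * 1024;
constexpr std::size_t kRegionSize = 256 * 1024;

struct Thread;  // opaque here

// Stubs standing in for bump_pointer_space_, region_space_ and Thread::AllocTlab().
bool BumpPointerAllocNewTlab(Thread*, std::size_t) { return true; }
bool RegionAllocNewTlab(Thread*) { return true; }
void* RegionAllocNonvirtual(std::size_t) { return nullptr; }
void* ThreadAllocTlab(Thread*, std::size_t) { return nullptr; }
bool IsOutOfMemoryOnAllocation(std::size_t /*alloc_size*/, bool /*grow*/) { return false; }

void* AllocWithNewTLAB(Thread* self, std::size_t alloc_size, bool grow, bool use_region_space) {
  if (!use_region_space) {
    // Bump-pointer TLAB allocator: refill with alloc_size + kDefaultTLABSize or fail.
    const std::size_t new_tlab_size = alloc_size + kDefaultTLABSize;
    if (IsOutOfMemoryOnAllocation(new_tlab_size, grow) ||
        !BumpPointerAllocNewTlab(self, new_tlab_size)) {
      return nullptr;
    }
  } else if (kRegionSize >= alloc_size) {
    // Region TLAB, non-large: try to grab a fresh region as the TLAB,
    // otherwise fall back to a direct (non-TLAB) region allocation.
    if (IsOutOfMemoryOnAllocation(kRegionSize, grow)) {
      return IsOutOfMemoryOnAllocation(alloc_size, grow)
                 ? nullptr                             // neither TLAB nor non-TLAB fits
                 : RegionAllocNonvirtual(alloc_size);  // non-TLAB still fits
    }
    if (!RegionAllocNewTlab(self)) {
      return RegionAllocNonvirtual(alloc_size);  // no free region, try non-TLAB
    }
  } else {
    // Large region allocation: never satisfied from a TLAB.
    return IsOutOfMemoryOnAllocation(alloc_size, grow) ? nullptr
                                                       : RegionAllocNonvirtual(alloc_size);
  }
  // The TLAB was refilled, so this bump allocation cannot fail.
  return ThreadAllocTlab(self, alloc_size);
}

}  // namespace sketch

The real method instead reads GetCurrentAllocator() to decide between the bump-pointer and region paths, and also records how many bytes were bulk-allocated for the new buffer.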
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h index df23f94a31..78dad94dfe 100644 --- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h +++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h @@ -31,7 +31,7 @@ void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub; // Alloc - ResetQuickAllocEntryPoints(qpoints); + ResetQuickAllocEntryPoints(qpoints, /* is_marking */ true); // DexCache qpoints->pInitializeStaticStorage = art_quick_initialize_static_storage; diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h index 97129e8b19..54f221056a 100644 --- a/runtime/gc/heap-inl.h +++ b/runtime/gc/heap-inl.h @@ -247,7 +247,7 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, if (allocator_type != kAllocatorTypeTLAB && allocator_type != kAllocatorTypeRegionTLAB && allocator_type != kAllocatorTypeRosAlloc && - UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) { + UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size, kGrow))) { return nullptr; } mirror::Object* ret; @@ -267,8 +267,9 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) { // If running on valgrind or asan, we should be using the instrumented path. size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size); - if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, - max_bytes_tl_bulk_allocated))) { + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, + max_bytes_tl_bulk_allocated, + kGrow))) { return nullptr; } ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, @@ -277,14 +278,18 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, DCHECK(!is_running_on_memory_tool_); size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size); - if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, - max_bytes_tl_bulk_allocated))) { + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, + max_bytes_tl_bulk_allocated, + kGrow))) { return nullptr; } if (!kInstrumented) { DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size)); } - ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size, + ret = rosalloc_space_->AllocNonvirtual(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } break; @@ -292,22 +297,34 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, case kAllocatorTypeDlMalloc: { if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) { // If running on valgrind, we should be using the instrumented path. 
- ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = dlmalloc_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } else { DCHECK(!is_running_on_memory_tool_); - ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size, + ret = dlmalloc_space_->AllocNonvirtual(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } break; } case kAllocatorTypeNonMoving: { - ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = non_moving_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); break; } case kAllocatorTypeLOS: { - ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = large_object_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); // Note that the bump pointer spaces aren't necessarily next to // the other continuous spaces like the non-moving alloc space or @@ -315,80 +332,38 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, DCHECK(ret == nullptr || large_object_space_->Contains(ret)); break; } - case kAllocatorTypeTLAB: { - DCHECK_ALIGNED(alloc_size, space::BumpPointerSpace::kAlignment); - if (UNLIKELY(self->TlabSize() < alloc_size)) { - const size_t new_tlab_size = alloc_size + kDefaultTLABSize; - if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, new_tlab_size))) { - return nullptr; - } - // Try allocating a new thread local buffer, if the allocaiton fails the space must be - // full so return null. - if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) { - return nullptr; - } - *bytes_tl_bulk_allocated = new_tlab_size; - } else { - *bytes_tl_bulk_allocated = 0; - } - // The allocation can't fail. - ret = self->AllocTlab(alloc_size); - DCHECK(ret != nullptr); - *bytes_allocated = alloc_size; - *usable_size = alloc_size; - break; - } case kAllocatorTypeRegion: { DCHECK(region_space_ != nullptr); alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment); - ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size, + ret = region_space_->AllocNonvirtual<false>(alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); break; } + case kAllocatorTypeTLAB: + FALLTHROUGH_INTENDED; case kAllocatorTypeRegionTLAB: { - DCHECK(region_space_ != nullptr); - DCHECK_ALIGNED(alloc_size, space::RegionSpace::kAlignment); + DCHECK_ALIGNED(alloc_size, kObjectAlignment); + static_assert(space::RegionSpace::kAlignment == space::BumpPointerSpace::kAlignment, + "mismatched alignments"); + static_assert(kObjectAlignment == space::BumpPointerSpace::kAlignment, + "mismatched alignments"); if (UNLIKELY(self->TlabSize() < alloc_size)) { - if (space::RegionSpace::kRegionSize >= alloc_size) { - // Non-large. Check OOME for a tlab. - if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, space::RegionSpace::kRegionSize))) { - // Try to allocate a tlab. - if (!region_space_->AllocNewTlab(self)) { - // Failed to allocate a tlab. Try non-tlab. - ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } - *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize; - // Fall-through. - } else { - // Check OOME for a non-tlab allocation. 
- if (!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size)) { - ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } else { - // Neither tlab or non-tlab works. Give up. - return nullptr; - } - } - } else { - // Large. Check OOME. - if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) { - ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } else { - return nullptr; - } - } - } else { - *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer. + // kAllocatorTypeTLAB may be the allocator for region space TLAB if the GC is not marking, + // that is why the allocator is not passed down. + return AllocWithNewTLAB(self, + alloc_size, + kGrow, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); } // The allocation can't fail. ret = self->AllocTlab(alloc_size); DCHECK(ret != nullptr); *bytes_allocated = alloc_size; + *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer. *usable_size = alloc_size; break; } @@ -408,15 +383,16 @@ inline bool Heap::ShouldAllocLargeObject(ObjPtr<mirror::Class> c, size_t byte_co return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass()); } -template <bool kGrow> -inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) { +inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, + size_t alloc_size, + bool grow) { size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size; if (UNLIKELY(new_footprint > max_allowed_footprint_)) { if (UNLIKELY(new_footprint > growth_limit_)) { return true; } if (!AllocatorMayHaveConcurrentGC(allocator_type) || !IsGcConcurrent()) { - if (!kGrow) { + if (!grow) { return true; } // TODO: Grow for allocation is racy, fix it. diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index f0e619dd35..ae9741ffa5 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -1819,7 +1819,7 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self, break; } // Try to transition the heap if the allocation failure was due to the space being full. - if (!IsOutOfMemoryOnAllocation<false>(allocator, alloc_size)) { + if (!IsOutOfMemoryOnAllocation(allocator, alloc_size, /*grow*/ false)) { // If we aren't out of memory then the OOM was probably from the non moving space being // full. Attempt to disable compaction and turn the main space into a non moving space. DisableMovingGc(); @@ -4225,5 +4225,72 @@ void Heap::RemoveGcPauseListener() { gc_pause_listener_.StoreRelaxed(nullptr); } +mirror::Object* Heap::AllocWithNewTLAB(Thread* self, + size_t alloc_size, + bool grow, + size_t* bytes_allocated, + size_t* usable_size, + size_t* bytes_tl_bulk_allocated) { + const AllocatorType allocator_type = GetCurrentAllocator(); + if (allocator_type == kAllocatorTypeTLAB) { + DCHECK(bump_pointer_space_ != nullptr); + const size_t new_tlab_size = alloc_size + kDefaultTLABSize; + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) { + return nullptr; + } + // Try allocating a new thread local buffer, if the allocation fails the space must be + // full so return null. 
+ if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) { + return nullptr; + } + *bytes_tl_bulk_allocated = new_tlab_size; + } else { + DCHECK(allocator_type == kAllocatorTypeRegionTLAB); + DCHECK(region_space_ != nullptr); + if (space::RegionSpace::kRegionSize >= alloc_size) { + // Non-large. Check OOME for a tlab. + if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, + space::RegionSpace::kRegionSize, + grow))) { + // Try to allocate a tlab. + if (!region_space_->AllocNewTlab(self)) { + // Failed to allocate a tlab. Try non-tlab. + return region_space_->AllocNonvirtual<false>(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize; + // Fall-through to using the TLAB below. + } else { + // Check OOME for a non-tlab allocation. + if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) { + return region_space_->AllocNonvirtual<false>(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + // Neither tlab or non-tlab works. Give up. + return nullptr; + } + } else { + // Large. Check OOME. + if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) { + return region_space_->AllocNonvirtual<false>(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + return nullptr; + } + } + // Refilled TLAB, return. + mirror::Object* ret = self->AllocTlab(alloc_size); + DCHECK(ret != nullptr); + *bytes_allocated = alloc_size; + *usable_size = alloc_size; + return ret; +} + } // namespace gc } // namespace art diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 0c671d269d..3a8e29b08a 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -854,6 +854,10 @@ class Heap { allocator_type != kAllocatorTypeRegionTLAB; } static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) { + if (kUseReadBarrier) { + // Read barrier may have the TLAB allocator but is always concurrent. TODO: clean this up. + return true; + } return allocator_type != kAllocatorTypeBumpPointer && allocator_type != kAllocatorTypeTLAB; @@ -923,11 +927,20 @@ class Heap { size_t* bytes_tl_bulk_allocated) REQUIRES_SHARED(Locks::mutator_lock_); + mirror::Object* AllocWithNewTLAB(Thread* self, + size_t alloc_size, + bool grow, + size_t* bytes_allocated, + size_t* usable_size, + size_t* bytes_tl_bulk_allocated) + REQUIRES_SHARED(Locks::mutator_lock_); + void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type) REQUIRES_SHARED(Locks::mutator_lock_); - template <bool kGrow> - ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size); + ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, + size_t alloc_size, + bool grow); // Run the finalizers. If timeout is non zero, then we use the VMRuntime version. 
void RunFinalization(JNIEnv* env, uint64_t timeout); diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc index d4c322eb84..870d1ae9b5 100644 --- a/runtime/instrumentation.cc +++ b/runtime/instrumentation.cc @@ -630,7 +630,7 @@ void Instrumentation::ConfigureStubs(const char* key, InstrumentationLevel desir } static void ResetQuickAllocEntryPointsForThread(Thread* thread, void* arg ATTRIBUTE_UNUSED) { - thread->ResetQuickAllocEntryPointsForThread(); + thread->ResetQuickAllocEntryPointsForThread(kUseReadBarrier && thread->GetIsGcMarking()); } void Instrumentation::SetEntrypointsInstrumented(bool instrumented) { diff --git a/runtime/thread.cc b/runtime/thread.cc index 65c86815b5..c92e38b6e8 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -122,21 +122,27 @@ void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) { CHECK(kUseReadBarrier); tls32_.is_gc_marking = is_marking; UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking); + if (kRuntimeISA == kX86_64) { + // Entrypoint switching is only implemented for X86_64. + ResetQuickAllocEntryPointsForThread(is_marking); + } } void Thread::InitTlsEntryPoints() { // Insert a placeholder so we can easily tell if we call an unimplemented entry point. uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.jni_entrypoints); - uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + - sizeof(tlsPtr_.quick_entrypoints)); + uintptr_t* end = reinterpret_cast<uintptr_t*>( + reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints)); for (uintptr_t* it = begin; it != end; ++it) { *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint); } InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints); } -void Thread::ResetQuickAllocEntryPointsForThread() { - ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints); +void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) { + // Entrypoint switching is currently only faster for X86_64 since other archs don't have TLAB + // fast path for non region space entrypoints. + ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking); } class DeoptimizationContextRecord { diff --git a/runtime/thread.h b/runtime/thread.h index 97093a6350..35226f2230 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -1007,7 +1007,7 @@ class Thread { tls32_.state_and_flags.as_atomic_int.FetchAndAndSequentiallyConsistent(-1 ^ flag); } - void ResetQuickAllocEntryPointsForThread(); + void ResetQuickAllocEntryPointsForThread(bool is_marking); // Returns the remaining space in the TLAB. size_t TlabSize() const;
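Finally, the per-thread hook: when the CC collector flips the marking flag, Thread::SetIsGcMarkingAndUpdateEntrypoints() now also resets that thread's allocation entrypoints, gated to x86-64 because only that ISA has the hand-written TLAB fast paths added above. A condensed, hypothetical rendering (Thread is reduced to a flag plus two methods and the ISA check becomes a constant), just to show the ordering of the calls:

#include <iostream>

namespace sketch {

constexpr bool kIsX86_64 = true;  // stand-in for the kRuntimeISA == kX86_64 check

struct Thread {
  bool is_gc_marking = false;

  // Stand-in for the real per-thread entrypoint reset.
  void ResetQuickAllocEntryPointsForThread(bool is_marking) {
    std::cout << (is_marking ? "install RegionTLAB allocation entrypoints\n"
                             : "install plain TLAB allocation entrypoints\n");
  }

  void SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
    is_gc_marking = is_marking;
    // (The real method also updates the read-barrier entrypoints here.)
    if (kIsX86_64) {
      // Entrypoint switching is only wired up for x86-64 in this patch.
      ResetQuickAllocEntryPointsForThread(is_marking);
    }
  }
};

}  // namespace sketch

int main() {
  sketch::Thread t;
  t.SetIsGcMarkingAndUpdateEntrypoints(true);   // GC starts marking
  t.SetIsGcMarkingAndUpdateEntrypoints(false);  // GC finishes marking
  return 0;
}

Instrumentation's ResetQuickAllocEntryPointsForThread callback computes the flag as kUseReadBarrier && thread->GetIsGcMarking(), so non-CC configurations keep getting the non-marking (plain TLAB) entrypoints.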