| author | 2016-11-29 21:32:11 +0000 |
|---|---|
| committer | 2016-11-29 21:32:12 +0000 |
| commit | ab191538a1d9eee6ec96bc3fa86dde36a007a6f5 (patch) |
| tree | 64c4ffaf96a8cf0aeb27bf8cbbbd5cfa42d1ff40 |
| parent | 8fb28dcf0e83f7153e76e176671cd4ad1f20205b (diff) |
| parent | f5de23265360e15fcfceb7d07bdadca0e5bb5f0a (diff) |
Merge "X86_64: Add allocation entrypoint switching for CC is_marking"
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | runtime/arch/quick_alloc_entrypoints.S | 35 |
| -rw-r--r-- | runtime/arch/x86/quick_entrypoints_x86.S | 7 |
| -rw-r--r-- | runtime/arch/x86_64/quick_entrypoints_x86_64.S | 150 |
| -rw-r--r-- | runtime/entrypoints/quick/quick_alloc_entrypoints.cc | 9 |
| -rw-r--r-- | runtime/entrypoints/quick/quick_alloc_entrypoints.h | 4 |
| -rw-r--r-- | runtime/entrypoints/quick/quick_default_init_entrypoints.h | 2 |
| -rw-r--r-- | runtime/gc/heap-inl.h | 124 |
| -rw-r--r-- | runtime/gc/heap.cc | 69 |
| -rw-r--r-- | runtime/gc/heap.h | 17 |
| -rw-r--r-- | runtime/instrumentation.cc | 2 |
| -rw-r--r-- | runtime/thread.cc | 14 |
| -rw-r--r-- | runtime/thread.h | 2 |
12 files changed, 266 insertions, 169 deletions
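
Before the per-file hunks, a brief orientation: this merge makes the concurrent copying (CC) collector switch a thread's allocation entrypoints between the RegionTLAB variants (which mark the loaded class through a read barrier) and the plain TLAB variants (no read barrier), depending on whether the GC is currently marking. The following is a self-contained toy sketch of that dispatch, not ART code; the struct, the entrypoint bodies, and the printf output are stand-ins, and the real ResetQuickAllocEntryPoints (see the quick_alloc_entrypoints.cc hunk below) also handles the other allocators and the instrumented tables.

```cpp
// Toy model of allocation entrypoint switching: pick one of two entrypoint
// tables based on whether the GC is marking, mirroring the new
// ResetQuickAllocEntryPoints(qpoints, is_marking) signature in this patch.
#include <cstdio>

struct QuickAllocEntryPoints {          // trimmed stand-in, not ART's QuickEntryPoints
  void* (*alloc_object)(unsigned type_idx);
};

static void* AllocObjectTlab(unsigned type_idx) {        // no read barrier needed
  std::printf("tlab alloc (GC not marking), type_idx=%u\n", type_idx);
  return nullptr;
}

static void* AllocObjectRegionTlab(unsigned type_idx) {  // read-barrier (marking) path
  std::printf("region tlab alloc (GC marking), type_idx=%u\n", type_idx);
  return nullptr;
}

// When marking, install the RegionTLAB entrypoints; otherwise the plain TLAB
// entrypoints suffice because no read barrier is required.
void ResetQuickAllocEntryPoints(QuickAllocEntryPoints* qpoints, bool is_marking) {
  qpoints->alloc_object = is_marking ? AllocObjectRegionTlab : AllocObjectTlab;
}

int main() {
  QuickAllocEntryPoints qpoints{};
  ResetQuickAllocEntryPoints(&qpoints, /* is_marking= */ true);
  qpoints.alloc_object(42);
  ResetQuickAllocEntryPoints(&qpoints, /* is_marking= */ false);
  qpoints.alloc_object(42);
}
```

The point of switching whole tables is that the fast path no longer has to test is_marking on every allocation; the mark-bit check that remains in the RegionTLAB assembly below only runs while marking is active.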
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index fa86bf4087..db2fdcabea 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -107,7 +107,28 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
 .endm
+.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR
+// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
+.endm
+
 .macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR
+.endm
+
+.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_dlmalloc, DlMalloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_dlmalloc, DlMalloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_dlmalloc, DlMalloc)
@@ -187,20 +208,6 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_bump_pointer_instrumented, B
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_bump_pointer_instrumented, BumpPointerInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_bump_pointer_instrumented, BumpPointerInstrumented)
-// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
-// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
-
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab_instrumented, TLABInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab_instrumented, TLABInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab_instrumented, TLABInstrumented)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index fb405fac0f..6fbc9547e6 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1085,15 +1085,12 @@ MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name)
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER             // return or deliver exception
 END_MACRO
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be called
+// for CC if the GC is not marking.
 DEFINE_FUNCTION art_quick_alloc_object_tlab
     // Fast path tlab allocation.
     // EAX: uint32_t type_idx/return value, ECX: ArtMethod*.
     // EBX, EDX: free.
-#if defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
     PUSH esi
     PUSH edi
     movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx   // Load dex cache resolved types array
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 860b77efe3..f8066e45fb 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -18,6 +18,13 @@
 #include "arch/quick_alloc_entrypoints.S"
+MACRO0(ASSERT_USE_READ_BARRIER)
+#if !defined(USE_READ_BARRIER)
+    int3
+    int3
+#endif
+END_MACRO
+
 MACRO0(SETUP_FP_CALLEE_SAVE_FRAME)
     // Create space for ART FP callee-saved registers
     subq MACRO_LITERAL(4 * 8), %rsp
@@ -972,8 +979,10 @@ MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION)
 END_MACRO
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+
 // Comment out allocators that have x86_64 specific asm.
+// Region TLAB:
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
@@ -986,6 +995,19 @@ GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_region_tlab,
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+// Normal TLAB:
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
 DEFINE_FUNCTION art_quick_alloc_object_rosalloc
@@ -1162,16 +1184,11 @@ MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name)
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER                    // return or deliver exception
 END_MACRO
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be
+// called with CC if the GC is not active.
 DEFINE_FUNCTION art_quick_alloc_object_tlab
-    // Fast path tlab allocation.
     // RDI: uint32_t type_idx, RSI: ArtMethod*
     // RDX, RCX, R8, R9: free. RAX: return val.
-#if defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
-    // Might need a special macro since rsi and edx is 32b/64b mismatched.
     movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx  // Load dex cache resolved types array
     // Might need to break down into multiple instructions to get the base address in a register.
                                                            // Load the class
@@ -1181,29 +1198,69 @@ DEFINE_FUNCTION art_quick_alloc_object_tlab
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB
 END_FUNCTION art_quick_alloc_object_tlab
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be
+// called with CC if the GC is not active.
+DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab
+    // RDI: mirror::Class* klass, RSI: ArtMethod*
+    // RDX, RCX, R8, R9: free. RAX: return val.
+    movq %rdi, %rdx
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path
+.Lart_quick_alloc_object_resolved_tlab_slow_path:
+    ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB
+END_FUNCTION art_quick_alloc_object_resolved_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB).
+// May be called with CC if the GC is not active.
+DEFINE_FUNCTION art_quick_alloc_object_initialized_tlab
+    // RDI: mirror::Class* klass, RSI: ArtMethod*
+    // RDX, RCX, R8, R9: free. RAX: return val.
+    movq %rdi, %rdx
+    ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_tlab_slow_path
+.Lart_quick_alloc_object_initialized_tlab_slow_path:
+    ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedTLAB
+END_FUNCTION art_quick_alloc_object_initialized_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB).
+DEFINE_FUNCTION art_quick_alloc_array_tlab
+    // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod*
+    // RCX: klass, R8, R9: free. RAX: return val.
+    movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx      // Load dex cache resolved types array
+    movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx        // Load the class
+    testl %ecx, %ecx
+    jz .Lart_quick_alloc_array_tlab_slow_path
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_tlab_slow_path
+.Lart_quick_alloc_array_tlab_slow_path:
+    ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeTLAB
+END_FUNCTION art_quick_alloc_array_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB).
+DEFINE_FUNCTION art_quick_alloc_array_resolved_tlab
+    // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod*
+    // RCX: mirror::Class* klass, R8, R9: free. RAX: return val.
+    movq %rdi, %rcx
+    // Already resolved, no null check.
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_tlab_slow_path
+.Lart_quick_alloc_array_resolved_tlab_slow_path:
+    ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedTLAB
+END_FUNCTION art_quick_alloc_array_resolved_tlab
+
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB).
 DEFINE_FUNCTION art_quick_alloc_array_region_tlab
     // Fast path region tlab allocation.
     // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod*
     // RCX: klass, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
+    ASSERT_USE_READ_BARRIER
     movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx      // Load dex cache resolved types array
     movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx        // Load the class
     // Null check so that we can load the lock word.
     testl %ecx, %ecx
     jz .Lart_quick_alloc_array_region_tlab_slow_path
-
-    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking
+    // Since we have allocation entrypoint switching, we know the GC is marking.
+    // Check the mark bit, if it is 0, do the read barrier mark.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
+    jz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path
 .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_region_tlab_slow_path
-.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking:
-    // Check the mark bit, if it is 1 return.
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
-    jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit
 .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path:
     // The read barrier slow path. Mark the class.
     PUSH rdi
@@ -1226,33 +1283,11 @@ DEFINE_FUNCTION art_quick_alloc_array_resolved_region_tlab
     // Fast path region tlab allocation.
     // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod*
     // RCX: mirror::Class* klass, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
+    ASSERT_USE_READ_BARRIER
     movq %rdi, %rcx
+    // Caller is responsible for read barrier.
     // Already resolved, no null check.
-    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_region_tlab_slow_path
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking:
-    // Check the mark bit, if it is 1 return.
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
-    jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path:
-    // The read barrier slow path. Mark the class.
-    PUSH rdi
-    PUSH rsi
-    PUSH rdx
-    // Outgoing argument set up
-    movq %rcx, %rdi                                            // Pass the class as the first param.
-    call SYMBOL(artReadBarrierMark)                            // cxx_name(mirror::Object* obj)
-    movq %rax, %rcx
-    POP rdx
-    POP rsi
-    POP rdi
-    jmp .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit
 .Lart_quick_alloc_array_resolved_region_tlab_slow_path:
     ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedRegionTLAB
 END_FUNCTION art_quick_alloc_array_resolved_region_tlab
@@ -1262,24 +1297,19 @@ DEFINE_FUNCTION art_quick_alloc_object_region_tlab
     // Fast path region tlab allocation.
     // RDI: uint32_t type_idx, RSI: ArtMethod*
     // RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
+    ASSERT_USE_READ_BARRIER
     movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx  // Load dex cache resolved types array
     movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx    // Load the class
     // Null check so that we can load the lock word.
     testl %edx, %edx
     jz .Lart_quick_alloc_object_region_tlab_slow_path
-    // Test if the GC is marking.
-    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
-    ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking:
-    // Check the mark bit, if it is 1 avoid the read barrier.
+    // Since we have allocation entrypoint switching, we know the GC is marking.
+    // Check the mark bit, if it is 0, do the read barrier mark.
     testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
-    jnz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+    // Use resolved one since we already did the null check.
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
     // The read barrier slow path. Mark the class.
     PUSH rdi
@@ -1302,10 +1332,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab
     // Fast path region tlab allocation.
     // RDI: mirror::Class* klass, RSI: ArtMethod*
     // RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
+    ASSERT_USE_READ_BARRIER
     // No read barrier since the caller is responsible for that.
     movq %rdi, %rdx
     ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
@@ -1318,10 +1345,7 @@ DEFINE_FUNCTION art_quick_alloc_object_initialized_region_tlab
     // Fast path region tlab allocation.
     // RDI: mirror::Class* klass, RSI: ArtMethod*
     // RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
-    int3
-    int3
-#endif
+    ASSERT_USE_READ_BARRIER
     movq %rdi, %rdx
     // No read barrier since the caller is responsible for that.
     ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path
diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
index 397655a895..d2fee9a766 100644
--- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
@@ -292,7 +292,7 @@ void SetQuickAllocEntryPointsInstrumented(bool instrumented) {
   entry_points_instrumented = instrumented;
 }
-void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) {
+void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) {
 #if !defined(__APPLE__) || !defined(__LP64__)
   switch (entry_points_allocator) {
     case gc::kAllocatorTypeDlMalloc: {
@@ -320,7 +320,12 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) {
     }
     case gc::kAllocatorTypeRegionTLAB: {
       CHECK(kMovingCollector);
-      SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented);
+      if (is_marking) {
+        SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented);
+      } else {
+        // Not marking means we need no read barriers and can just use the normal TLAB case.
+        SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented);
+      }
       return;
     }
     default:
diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.h b/runtime/entrypoints/quick/quick_alloc_entrypoints.h
index 14a8e0428b..bd1e295e48 100644
--- a/runtime/entrypoints/quick/quick_alloc_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.h
@@ -23,7 +23,9 @@
 namespace art {
-void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints);
+// is_marking is only used for CC, if the GC is marking the allocation entrypoint is the marking
+// one.
+void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking);
 // Runtime shutdown lock is necessary to prevent races in thread initialization. When the thread is
 // starting it doesn't hold the mutator lock until after it has been added to the thread list.
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index df23f94a31..78dad94dfe 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -31,7 +31,7 @@ void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints)
   jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub;
   // Alloc
-  ResetQuickAllocEntryPoints(qpoints);
+  ResetQuickAllocEntryPoints(qpoints, /* is_marking */ true);
   // DexCache
   qpoints->pInitializeStaticStorage = art_quick_initialize_static_storage;
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 97129e8b19..54f221056a 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -247,7 +247,7 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self,
   if (allocator_type != kAllocatorTypeTLAB &&
       allocator_type != kAllocatorTypeRegionTLAB &&
       allocator_type != kAllocatorTypeRosAlloc &&
-      UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
+      UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size, kGrow))) {
     return nullptr;
   }
   mirror::Object* ret;
@@ -267,8 +267,9 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self,
       if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
         // If running on valgrind or asan, we should be using the instrumented path.
         size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size);
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
-                                                      max_bytes_tl_bulk_allocated))) {
+        if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+                                               max_bytes_tl_bulk_allocated,
+                                               kGrow))) {
           return nullptr;
         }
         ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
@@ -277,14 +278,18 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self,
         DCHECK(!is_running_on_memory_tool_);
         size_t max_bytes_tl_bulk_allocated =
             rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size);
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
-                                                      max_bytes_tl_bulk_allocated))) {
+        if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+                                               max_bytes_tl_bulk_allocated,
+                                               kGrow))) {
           return nullptr;
         }
         if (!kInstrumented) {
           DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size));
         }
-        ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+        ret = rosalloc_space_->AllocNonvirtual(self,
+                                               alloc_size,
+                                               bytes_allocated,
+                                               usable_size,
                                                bytes_tl_bulk_allocated);
       }
       break;
@@ -292,22 +297,34 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self,
     case kAllocatorTypeDlMalloc: {
       if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
         // If running on valgrind, we should be using the instrumented path.
-        ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+        ret = dlmalloc_space_->Alloc(self,
+                                     alloc_size,
+                                     bytes_allocated,
+                                     usable_size,
                                      bytes_tl_bulk_allocated);
       } else {
         DCHECK(!is_running_on_memory_tool_);
-        ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+        ret = dlmalloc_space_->AllocNonvirtual(self,
+                                               alloc_size,
+                                               bytes_allocated,
+                                               usable_size,
                                                bytes_tl_bulk_allocated);
       }
       break;
     }
     case kAllocatorTypeNonMoving: {
-      ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+      ret = non_moving_space_->Alloc(self,
+                                     alloc_size,
+                                     bytes_allocated,
+                                     usable_size,
                                      bytes_tl_bulk_allocated);
       break;
     }
     case kAllocatorTypeLOS: {
-      ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+      ret = large_object_space_->Alloc(self,
+                                       alloc_size,
+                                       bytes_allocated,
+                                       usable_size,
                                        bytes_tl_bulk_allocated);
       // Note that the bump pointer spaces aren't necessarily next to
       // the other continuous spaces like the non-moving alloc space or
@@ -315,80 +332,38 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self,
       DCHECK(ret == nullptr || large_object_space_->Contains(ret));
       break;
     }
-    case kAllocatorTypeTLAB: {
-      DCHECK_ALIGNED(alloc_size, space::BumpPointerSpace::kAlignment);
-      if (UNLIKELY(self->TlabSize() < alloc_size)) {
-        const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, new_tlab_size))) {
-          return nullptr;
-        }
-        // Try allocating a new thread local buffer, if the allocaiton fails the space must be
-        // full so return null.
-        if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
-          return nullptr;
-        }
-        *bytes_tl_bulk_allocated = new_tlab_size;
-      } else {
-        *bytes_tl_bulk_allocated = 0;
-      }
-      // The allocation can't fail.
-      ret = self->AllocTlab(alloc_size);
-      DCHECK(ret != nullptr);
-      *bytes_allocated = alloc_size;
-      *usable_size = alloc_size;
-      break;
-    }
     case kAllocatorTypeRegion: {
       DCHECK(region_space_ != nullptr);
       alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment);
-      ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+      ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                  bytes_allocated,
+                                                  usable_size,
                                                   bytes_tl_bulk_allocated);
       break;
     }
+    case kAllocatorTypeTLAB:
+      FALLTHROUGH_INTENDED;
     case kAllocatorTypeRegionTLAB: {
-      DCHECK(region_space_ != nullptr);
-      DCHECK_ALIGNED(alloc_size, space::RegionSpace::kAlignment);
+      DCHECK_ALIGNED(alloc_size, kObjectAlignment);
+      static_assert(space::RegionSpace::kAlignment == space::BumpPointerSpace::kAlignment,
+                    "mismatched alignments");
+      static_assert(kObjectAlignment == space::BumpPointerSpace::kAlignment,
+                    "mismatched alignments");
       if (UNLIKELY(self->TlabSize() < alloc_size)) {
-        if (space::RegionSpace::kRegionSize >= alloc_size) {
-          // Non-large. Check OOME for a tlab.
-          if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, space::RegionSpace::kRegionSize))) {
-            // Try to allocate a tlab.
-            if (!region_space_->AllocNewTlab(self)) {
-              // Failed to allocate a tlab. Try non-tlab.
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                          bytes_tl_bulk_allocated);
-              return ret;
-            }
-            *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
-            // Fall-through.
-          } else {
-            // Check OOME for a non-tlab allocation.
-            if (!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size)) {
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                          bytes_tl_bulk_allocated);
-              return ret;
-            } else {
-              // Neither tlab or non-tlab works. Give up.
-              return nullptr;
-            }
-          }
-        } else {
-          // Large. Check OOME.
-          if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
-            ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                        bytes_tl_bulk_allocated);
-            return ret;
-          } else {
-            return nullptr;
-          }
-        }
-      } else {
-        *bytes_tl_bulk_allocated = 0;  // Allocated in an existing buffer.
+        // kAllocatorTypeTLAB may be the allocator for region space TLAB if the GC is not marking,
+        // that is why the allocator is not passed down.
+        return AllocWithNewTLAB(self,
+                                alloc_size,
+                                kGrow,
+                                bytes_allocated,
+                                usable_size,
+                                bytes_tl_bulk_allocated);
       }
       // The allocation can't fail.
      ret = self->AllocTlab(alloc_size);
      DCHECK(ret != nullptr);
      *bytes_allocated = alloc_size;
+      *bytes_tl_bulk_allocated = 0;  // Allocated in an existing buffer.
      *usable_size = alloc_size;
      break;
    }
@@ -408,15 +383,16 @@ inline bool Heap::ShouldAllocLargeObject(ObjPtr<mirror::Class> c, size_t byte_co
   return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass());
 }
-template <bool kGrow>
-inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) {
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+                                            size_t alloc_size,
+                                            bool grow) {
   size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
     }
     if (!AllocatorMayHaveConcurrentGC(allocator_type) || !IsGcConcurrent()) {
-      if (!kGrow) {
+      if (!grow) {
        return true;
       }
       // TODO: Grow for allocation is racy, fix it.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index f0e619dd35..ae9741ffa5 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1819,7 +1819,7 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self,
           break;
         }
         // Try to transition the heap if the allocation failure was due to the space being full.
-        if (!IsOutOfMemoryOnAllocation<false>(allocator, alloc_size)) {
+        if (!IsOutOfMemoryOnAllocation(allocator, alloc_size, /*grow*/ false)) {
           // If we aren't out of memory then the OOM was probably from the non moving space being
           // full. Attempt to disable compaction and turn the main space into a non moving space.
           DisableMovingGc();
@@ -4225,5 +4225,72 @@ void Heap::RemoveGcPauseListener() {
   gc_pause_listener_.StoreRelaxed(nullptr);
 }
+mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
+                                       size_t alloc_size,
+                                       bool grow,
+                                       size_t* bytes_allocated,
+                                       size_t* usable_size,
+                                       size_t* bytes_tl_bulk_allocated) {
+  const AllocatorType allocator_type = GetCurrentAllocator();
+  if (allocator_type == kAllocatorTypeTLAB) {
+    DCHECK(bump_pointer_space_ != nullptr);
+    const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
+    if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
+      return nullptr;
+    }
+    // Try allocating a new thread local buffer, if the allocation fails the space must be
+    // full so return null.
+    if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
+      return nullptr;
+    }
+    *bytes_tl_bulk_allocated = new_tlab_size;
+  } else {
+    DCHECK(allocator_type == kAllocatorTypeRegionTLAB);
+    DCHECK(region_space_ != nullptr);
+    if (space::RegionSpace::kRegionSize >= alloc_size) {
+      // Non-large. Check OOME for a tlab.
+      if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
+                                            space::RegionSpace::kRegionSize,
+                                            grow))) {
+        // Try to allocate a tlab.
+        if (!region_space_->AllocNewTlab(self)) {
+          // Failed to allocate a tlab. Try non-tlab.
+          return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                       bytes_allocated,
+                                                       usable_size,
+                                                       bytes_tl_bulk_allocated);
+        }
+        *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
+        // Fall-through to using the TLAB below.
+      } else {
+        // Check OOME for a non-tlab allocation.
+        if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) {
+          return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                       bytes_allocated,
+                                                       usable_size,
+                                                       bytes_tl_bulk_allocated);
+        }
+        // Neither tlab or non-tlab works. Give up.
+        return nullptr;
+      }
+    } else {
+      // Large. Check OOME.
+      if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) {
+        return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                     bytes_allocated,
+                                                     usable_size,
+                                                     bytes_tl_bulk_allocated);
+      }
+      return nullptr;
+    }
+  }
+  // Refilled TLAB, return.
+  mirror::Object* ret = self->AllocTlab(alloc_size);
+  DCHECK(ret != nullptr);
+  *bytes_allocated = alloc_size;
+  *usable_size = alloc_size;
+  return ret;
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 0c671d269d..3a8e29b08a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -854,6 +854,10 @@ class Heap {
         allocator_type != kAllocatorTypeRegionTLAB;
   }
   static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
+    if (kUseReadBarrier) {
+      // Read barrier may have the TLAB allocator but is always concurrent. TODO: clean this up.
+      return true;
+    }
     return
         allocator_type != kAllocatorTypeBumpPointer &&
         allocator_type != kAllocatorTypeTLAB;
@@ -923,11 +927,20 @@ class Heap {
                                               size_t* bytes_tl_bulk_allocated)
       REQUIRES_SHARED(Locks::mutator_lock_);
+  mirror::Object* AllocWithNewTLAB(Thread* self,
+                                   size_t alloc_size,
+                                   bool grow,
+                                   size_t* bytes_allocated,
+                                   size_t* usable_size,
+                                   size_t* bytes_tl_bulk_allocated)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template <bool kGrow>
-  ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size);
+  ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+                                               size_t alloc_size,
+                                               bool grow);
   // Run the finalizers. If timeout is non zero, then we use the VMRuntime version.
 void RunFinalization(JNIEnv* env, uint64_t timeout);
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index d4c322eb84..870d1ae9b5 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -630,7 +630,7 @@ void Instrumentation::ConfigureStubs(const char* key, InstrumentationLevel desir
 }
 static void ResetQuickAllocEntryPointsForThread(Thread* thread, void* arg ATTRIBUTE_UNUSED) {
-  thread->ResetQuickAllocEntryPointsForThread();
+  thread->ResetQuickAllocEntryPointsForThread(kUseReadBarrier && thread->GetIsGcMarking());
 }
 void Instrumentation::SetEntrypointsInstrumented(bool instrumented) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 65c86815b5..c92e38b6e8 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -122,21 +122,27 @@ void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) {
   CHECK(kUseReadBarrier);
   tls32_.is_gc_marking = is_marking;
   UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+  if (kRuntimeISA == kX86_64) {
+    // Entrypoint switching is only implemented for X86_64.
+    ResetQuickAllocEntryPointsForThread(is_marking);
+  }
 }
 void Thread::InitTlsEntryPoints() {
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
   uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.jni_entrypoints);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) +
-      sizeof(tlsPtr_.quick_entrypoints));
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(
+      reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
   InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints);
 }
-void Thread::ResetQuickAllocEntryPointsForThread() {
-  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
+void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) {
+  // Entrypoint switching is currnetly only faster for X86_64 since other archs don't have TLAB
+  // fast path for non region space entrypoints.
+  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking);
 }
 class DeoptimizationContextRecord {
diff --git a/runtime/thread.h b/runtime/thread.h
index 97093a6350..35226f2230 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1007,7 +1007,7 @@ class Thread {
     tls32_.state_and_flags.as_atomic_int.FetchAndAndSequentiallyConsistent(-1 ^ flag);
   }
-  void ResetQuickAllocEntryPointsForThread();
+  void ResetQuickAllocEntryPointsForThread(bool is_marking);
   // Returns the remaining space in the TLAB.
   size_t TlabSize() const;
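
On the heap side, the patch replaces the template<bool kGrow> parameter of IsOutOfMemoryOnAllocation with a runtime grow flag and moves the TLAB refill logic out of Heap::TryToAllocate into the new Heap::AllocWithNewTLAB, so that kAllocatorTypeTLAB and kAllocatorTypeRegionTLAB can share one fast path. Below is a rough, self-contained sketch of the bump-pointer refill flow only; the Tlab struct, kDefaultTlabSize, and the footprint limits are illustrative stand-ins, not ART's real types or constants.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr std::size_t kDefaultTlabSize = 32 * 1024;        // stand-in value
constexpr std::size_t kSoftFootprintLimit = 1 * 1024 * 1024;
constexpr std::size_t kHardFootprintLimit = 4 * 1024 * 1024;

struct Tlab {                      // stand-in for the thread-local allocation buffer
  std::vector<std::uint8_t> buffer;
  std::size_t pos = 0;
};

// Stand-in for IsOutOfMemoryOnAllocation(allocator, size, grow): the runtime
// `grow` flag replaces the old template<bool kGrow> parameter.
bool IsOutOfMemory(std::size_t footprint, std::size_t bytes, bool grow) {
  const std::size_t limit = grow ? kHardFootprintLimit : kSoftFootprintLimit;
  return footprint + bytes > limit;
}

// Rough shape of the bump-pointer case of AllocWithNewTLAB: OOM-check a buffer
// of alloc_size + default TLAB size, install it, then bump-allocate from it.
void* AllocWithNewTlab(Tlab& tlab, std::size_t footprint, std::size_t alloc_size,
                       bool grow, std::size_t* bytes_tl_bulk_allocated) {
  const std::size_t new_tlab_size = alloc_size + kDefaultTlabSize;
  if (IsOutOfMemory(footprint, new_tlab_size, grow)) {
    return nullptr;  // caller falls back to the slow path / GC
  }
  tlab.buffer.assign(new_tlab_size, 0);  // "AllocNewTlab"
  tlab.pos = 0;
  *bytes_tl_bulk_allocated = new_tlab_size;
  void* ret = tlab.buffer.data() + tlab.pos;  // the allocation itself cannot fail now
  tlab.pos += alloc_size;
  return ret;
}

int main() {
  Tlab tlab;
  std::size_t bulk = 0;
  void* obj = AllocWithNewTlab(tlab, /* footprint= */ 0, /* alloc_size= */ 64,
                               /* grow= */ false, &bulk);
  (void)obj;
}
```

The RegionTLAB half of the real AllocWithNewTLAB (shown in the heap.cc hunk above) additionally distinguishes region-sized from large allocations and falls back to a non-TLAB region allocation when a new TLAB cannot be obtained.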