X86_64: Add allocation entrypoint switching for CC is_marking
Only X86_64 done so far. Use normal TLAB allocators if GC is not
marking.
Allocation speed goes up by ~8% based on perf sampling.
Without change:
1.19%: art_quick_alloc_object_region_tlab
With change:
0.63%: art_quick_alloc_object_tlab
0.47%: art_quick_alloc_object_region_tlab
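In sketch form, the RegionTLAB case of ResetQuickAllocEntryPoints() now picks the entrypoint
table from is_marking (simplified excerpt of the quick_alloc_entrypoints.cc hunk below; names
match that hunk):

  if (is_marking) {
    // GC is marking: keep the read barrier region TLAB fast paths.
    SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented);
  } else {
    // Not marking: no read barriers needed, use the plain TLAB fast paths.
    SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented);
  }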
Bug: 31018974
Bug: 12687968
Test: test-art-host-run-test
Change-Id: I4c4d9eb229d4ad2f41b856ba5c2958a5eb3b7ffa
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index fa86bf4..db2fdca 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -107,7 +107,28 @@
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
.endm
+.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR
+// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
+.endm
+
.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR
+.endm
+
+.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_dlmalloc, DlMalloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_dlmalloc, DlMalloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_dlmalloc, DlMalloc)
@@ -187,20 +208,6 @@
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_bump_pointer_instrumented, BumpPointerInstrumented)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_bump_pointer_instrumented, BumpPointerInstrumented)
-// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
-// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
-
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab_instrumented, TLABInstrumented)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab_instrumented, TLABInstrumented)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab_instrumented, TLABInstrumented)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index fb405fa..6fbc954 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1085,15 +1085,12 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_MACRO
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be called
+// for CC if the GC is not marking.
DEFINE_FUNCTION art_quick_alloc_object_tlab
// Fast path tlab allocation.
// EAX: uint32_t type_idx/return value, ECX: ArtMethod*.
// EBX, EDX: free.
-#if defined(USE_READ_BARRIER)
- int3
- int3
-#endif
PUSH esi
PUSH edi
movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 860b77e..f8066e4 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -18,6 +18,13 @@
#include "arch/quick_alloc_entrypoints.S"
+MACRO0(ASSERT_USE_READ_BARRIER)
+#if !defined(USE_READ_BARRIER)
+ int3
+ int3
+#endif
+END_MACRO
+
MACRO0(SETUP_FP_CALLEE_SAVE_FRAME)
// Create space for ART FP callee-saved registers
subq MACRO_LITERAL(4 * 8), %rsp
@@ -972,8 +979,10 @@
END_MACRO
// Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+
// Comment out allocators that have x86_64 specific asm.
+// Region TLAB:
// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
@@ -986,6 +995,19 @@
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+// Normal TLAB:
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
DEFINE_FUNCTION art_quick_alloc_object_rosalloc
@@ -1162,16 +1184,11 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_MACRO
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be
+// called with CC if the GC is not marking.
DEFINE_FUNCTION art_quick_alloc_object_tlab
- // Fast path tlab allocation.
// RDI: uint32_t type_idx, RSI: ArtMethod*
// RDX, RCX, R8, R9: free. RAX: return val.
-#if defined(USE_READ_BARRIER)
- int3
- int3
-#endif
- // Might need a special macro since rsi and edx is 32b/64b mismatched.
movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array
// Might need to break down into multiple instructions to get the base address in a register.
// Load the class
@@ -1181,29 +1198,69 @@
ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB
END_FUNCTION art_quick_alloc_object_tlab
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be
+// called with CC if the GC is not marking.
+DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab
+ // RDI: mirror::Class* klass, RSI: ArtMethod*
+ // RDX, RCX, R8, R9: free. RAX: return val.
+ movq %rdi, %rdx
+ ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path
+.Lart_quick_alloc_object_resolved_tlab_slow_path:
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB
+END_FUNCTION art_quick_alloc_object_resolved_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB).
+// May be called with CC if the GC is not marking.
+DEFINE_FUNCTION art_quick_alloc_object_initialized_tlab
+ // RDI: mirror::Class* klass, RSI: ArtMethod*
+ // RDX, RCX, R8, R9: free. RAX: return val.
+ movq %rdi, %rdx
+ ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_tlab_slow_path
+.Lart_quick_alloc_object_initialized_tlab_slow_path:
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedTLAB
+END_FUNCTION art_quick_alloc_object_initialized_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB).
+DEFINE_FUNCTION art_quick_alloc_array_tlab
+ // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod*
+ // RCX: klass, R8, R9: free. RAX: return val.
+ movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array
+ movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class
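+  // Null check: the class may be unresolved, in which case take the slow path.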
+ testl %ecx, %ecx
+ jz .Lart_quick_alloc_array_tlab_slow_path
+ ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_tlab_slow_path
+.Lart_quick_alloc_array_tlab_slow_path:
+ ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeTLAB
+END_FUNCTION art_quick_alloc_array_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB).
+DEFINE_FUNCTION art_quick_alloc_array_resolved_tlab
+ // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod*
+ // RCX: mirror::Class* klass, R8, R9: free. RAX: return val.
+ movq %rdi, %rcx
+ // Already resolved, no null check.
+ ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_tlab_slow_path
+.Lart_quick_alloc_array_resolved_tlab_slow_path:
+ ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedTLAB
+END_FUNCTION art_quick_alloc_array_resolved_tlab
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB).
DEFINE_FUNCTION art_quick_alloc_array_region_tlab
// Fast path region tlab allocation.
// RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod*
// RCX: klass, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
- int3
- int3
-#endif
+ ASSERT_USE_READ_BARRIER
movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array
movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class
// Null check so that we can load the lock word.
testl %ecx, %ecx
jz .Lart_quick_alloc_array_region_tlab_slow_path
-
- cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
- jne .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking
+ // Since we have allocation entrypoint switching, we know the GC is marking.
+  // Check the mark bit; if it is 0, do the read barrier mark.
+ testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
+ jz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path
.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit:
ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_region_tlab_slow_path
-.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking:
- // Check the mark bit, if it is 1 return.
- testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
- jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit
.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path:
// The read barrier slow path. Mark the class.
PUSH rdi
@@ -1226,33 +1283,11 @@
// Fast path region tlab allocation.
// RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod*
// RCX: mirror::Class* klass, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
- int3
- int3
-#endif
+ ASSERT_USE_READ_BARRIER
movq %rdi, %rcx
+ // Caller is responsible for read barrier.
// Already resolved, no null check.
- cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
- jne .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit:
ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_region_tlab_slow_path
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking:
- // Check the mark bit, if it is 1 return.
- testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
- jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit
-.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path:
- // The read barrier slow path. Mark the class.
- PUSH rdi
- PUSH rsi
- PUSH rdx
- // Outgoing argument set up
- movq %rcx, %rdi // Pass the class as the first param.
- call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj)
- movq %rax, %rcx
- POP rdx
- POP rsi
- POP rdi
- jmp .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit
.Lart_quick_alloc_array_resolved_region_tlab_slow_path:
ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedRegionTLAB
END_FUNCTION art_quick_alloc_array_resolved_region_tlab
@@ -1262,24 +1297,19 @@
// Fast path region tlab allocation.
// RDI: uint32_t type_idx, RSI: ArtMethod*
// RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
- int3
- int3
-#endif
+ ASSERT_USE_READ_BARRIER
movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array
movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx // Load the class
// Null check so that we can load the lock word.
testl %edx, %edx
jz .Lart_quick_alloc_object_region_tlab_slow_path
- // Test if the GC is marking.
- cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
- jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
- ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking:
- // Check the mark bit, if it is 1 avoid the read barrier.
+ // Since we have allocation entrypoint switching, we know the GC is marking.
+  // Check the mark bit; if it is 0, do the read barrier mark.
testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
- jnz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+ // Use resolved one since we already did the null check.
+ ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
// The read barrier slow path. Mark the class.
PUSH rdi
@@ -1302,10 +1332,7 @@
// Fast path region tlab allocation.
// RDI: mirror::Class* klass, RSI: ArtMethod*
// RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
- int3
- int3
-#endif
+ ASSERT_USE_READ_BARRIER
// No read barrier since the caller is responsible for that.
movq %rdi, %rdx
ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
@@ -1318,10 +1345,7 @@
// Fast path region tlab allocation.
// RDI: mirror::Class* klass, RSI: ArtMethod*
// RDX, RCX, R8, R9: free. RAX: return val.
-#if !defined(USE_READ_BARRIER)
- int3
- int3
-#endif
+ ASSERT_USE_READ_BARRIER
movq %rdi, %rdx
// No read barrier since the caller is responsible for that.
ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path
diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
index 515fcbf..4a7e819 100644
--- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc
@@ -258,7 +258,7 @@
entry_points_instrumented = instrumented;
}
-void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) {
+void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) {
#if !defined(__APPLE__) || !defined(__LP64__)
switch (entry_points_allocator) {
case gc::kAllocatorTypeDlMalloc: {
@@ -286,7 +286,12 @@
}
case gc::kAllocatorTypeRegionTLAB: {
CHECK(kMovingCollector);
- SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented);
+ if (is_marking) {
+ SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented);
+ } else {
+ // Not marking means we need no read barriers and can just use the normal TLAB case.
+ SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented);
+ }
return;
}
default:
diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.h b/runtime/entrypoints/quick/quick_alloc_entrypoints.h
index 14a8e04..bd1e295 100644
--- a/runtime/entrypoints/quick/quick_alloc_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.h
@@ -23,7 +23,9 @@
namespace art {
-void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints);
+// is_marking is only used for CC; if the GC is marking, the allocation entrypoints are the
+// marking ones.
+void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking);
// Runtime shutdown lock is necessary to prevent races in thread initialization. When the thread is
// starting it doesn't hold the mutator lock until after it has been added to the thread list.
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index df23f94..78dad94 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -31,7 +31,7 @@
jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub;
// Alloc
- ResetQuickAllocEntryPoints(qpoints);
+ ResetQuickAllocEntryPoints(qpoints, /* is_marking */ true);
// DexCache
qpoints->pInitializeStaticStorage = art_quick_initialize_static_storage;
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 97129e8..54f2210 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -247,7 +247,7 @@
if (allocator_type != kAllocatorTypeTLAB &&
allocator_type != kAllocatorTypeRegionTLAB &&
allocator_type != kAllocatorTypeRosAlloc &&
- UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
+ UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size, kGrow))) {
return nullptr;
}
mirror::Object* ret;
@@ -267,8 +267,9 @@
if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
// If running on valgrind or asan, we should be using the instrumented path.
size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size);
- if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
- max_bytes_tl_bulk_allocated))) {
+ if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+ max_bytes_tl_bulk_allocated,
+ kGrow))) {
return nullptr;
}
ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
@@ -277,14 +278,18 @@
DCHECK(!is_running_on_memory_tool_);
size_t max_bytes_tl_bulk_allocated =
rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size);
- if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
- max_bytes_tl_bulk_allocated))) {
+ if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+ max_bytes_tl_bulk_allocated,
+ kGrow))) {
return nullptr;
}
if (!kInstrumented) {
DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size));
}
- ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+ ret = rosalloc_space_->AllocNonvirtual(self,
+ alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
}
break;
@@ -292,22 +297,34 @@
case kAllocatorTypeDlMalloc: {
if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
// If running on valgrind, we should be using the instrumented path.
- ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+ ret = dlmalloc_space_->Alloc(self,
+ alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
} else {
DCHECK(!is_running_on_memory_tool_);
- ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+ ret = dlmalloc_space_->AllocNonvirtual(self,
+ alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
}
break;
}
case kAllocatorTypeNonMoving: {
- ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+ ret = non_moving_space_->Alloc(self,
+ alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
break;
}
case kAllocatorTypeLOS: {
- ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+ ret = large_object_space_->Alloc(self,
+ alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
// Note that the bump pointer spaces aren't necessarily next to
// the other continuous spaces like the non-moving alloc space or
@@ -315,80 +332,38 @@
DCHECK(ret == nullptr || large_object_space_->Contains(ret));
break;
}
- case kAllocatorTypeTLAB: {
- DCHECK_ALIGNED(alloc_size, space::BumpPointerSpace::kAlignment);
- if (UNLIKELY(self->TlabSize() < alloc_size)) {
- const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
- if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, new_tlab_size))) {
- return nullptr;
- }
- // Try allocating a new thread local buffer, if the allocaiton fails the space must be
- // full so return null.
- if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
- return nullptr;
- }
- *bytes_tl_bulk_allocated = new_tlab_size;
- } else {
- *bytes_tl_bulk_allocated = 0;
- }
- // The allocation can't fail.
- ret = self->AllocTlab(alloc_size);
- DCHECK(ret != nullptr);
- *bytes_allocated = alloc_size;
- *usable_size = alloc_size;
- break;
- }
case kAllocatorTypeRegion: {
DCHECK(region_space_ != nullptr);
alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment);
- ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+ ret = region_space_->AllocNonvirtual<false>(alloc_size,
+ bytes_allocated,
+ usable_size,
bytes_tl_bulk_allocated);
break;
}
+ case kAllocatorTypeTLAB:
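+      // Plain TLAB and region TLAB now share the code path below; AllocWithNewTLAB picks the
+      // backing space from the heap's current allocator.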
+ FALLTHROUGH_INTENDED;
case kAllocatorTypeRegionTLAB: {
- DCHECK(region_space_ != nullptr);
- DCHECK_ALIGNED(alloc_size, space::RegionSpace::kAlignment);
+ DCHECK_ALIGNED(alloc_size, kObjectAlignment);
+ static_assert(space::RegionSpace::kAlignment == space::BumpPointerSpace::kAlignment,
+ "mismatched alignments");
+ static_assert(kObjectAlignment == space::BumpPointerSpace::kAlignment,
+ "mismatched alignments");
if (UNLIKELY(self->TlabSize() < alloc_size)) {
- if (space::RegionSpace::kRegionSize >= alloc_size) {
- // Non-large. Check OOME for a tlab.
- if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, space::RegionSpace::kRegionSize))) {
- // Try to allocate a tlab.
- if (!region_space_->AllocNewTlab(self)) {
- // Failed to allocate a tlab. Try non-tlab.
- ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
- bytes_tl_bulk_allocated);
- return ret;
- }
- *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
- // Fall-through.
- } else {
- // Check OOME for a non-tlab allocation.
- if (!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size)) {
- ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
- bytes_tl_bulk_allocated);
- return ret;
- } else {
- // Neither tlab or non-tlab works. Give up.
- return nullptr;
- }
- }
- } else {
- // Large. Check OOME.
- if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
- ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
- bytes_tl_bulk_allocated);
- return ret;
- } else {
- return nullptr;
- }
- }
- } else {
- *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer.
+      // kAllocatorTypeTLAB may be the allocator for the region space TLAB if the GC is not
+      // marking; that is why the allocator is not passed down.
+ return AllocWithNewTLAB(self,
+ alloc_size,
+ kGrow,
+ bytes_allocated,
+ usable_size,
+ bytes_tl_bulk_allocated);
}
// The allocation can't fail.
ret = self->AllocTlab(alloc_size);
DCHECK(ret != nullptr);
*bytes_allocated = alloc_size;
+ *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer.
*usable_size = alloc_size;
break;
}
@@ -408,15 +383,16 @@
return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass());
}
-template <bool kGrow>
-inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) {
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+ size_t alloc_size,
+ bool grow) {
size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size;
if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
if (UNLIKELY(new_footprint > growth_limit_)) {
return true;
}
if (!AllocatorMayHaveConcurrentGC(allocator_type) || !IsGcConcurrent()) {
- if (!kGrow) {
+ if (!grow) {
return true;
}
// TODO: Grow for allocation is racy, fix it.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index ddc3852..3e1cbeb 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1815,7 +1815,7 @@
break;
}
// Try to transition the heap if the allocation failure was due to the space being full.
- if (!IsOutOfMemoryOnAllocation<false>(allocator, alloc_size)) {
+ if (!IsOutOfMemoryOnAllocation(allocator, alloc_size, /*grow*/ false)) {
// If we aren't out of memory then the OOM was probably from the non moving space being
// full. Attempt to disable compaction and turn the main space into a non moving space.
DisableMovingGc();
@@ -4221,5 +4221,72 @@
gc_pause_listener_.StoreRelaxed(nullptr);
}
+mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
+ size_t alloc_size,
+ bool grow,
+ size_t* bytes_allocated,
+ size_t* usable_size,
+ size_t* bytes_tl_bulk_allocated) {
+ const AllocatorType allocator_type = GetCurrentAllocator();
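+  // With entrypoint switching the caller may be using kAllocatorTypeTLAB entrypoints on top of
+  // the region space, so dispatch on the heap's current allocator rather than a passed-in one.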
+ if (allocator_type == kAllocatorTypeTLAB) {
+ DCHECK(bump_pointer_space_ != nullptr);
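+    // Refill with the default TLAB size plus the request so the allocation always fits.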
+ const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
+ if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
+ return nullptr;
+ }
+    // Try allocating a new thread local buffer; if the allocation fails, the space must be
+    // full, so return null.
+ if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
+ return nullptr;
+ }
+ *bytes_tl_bulk_allocated = new_tlab_size;
+ } else {
+ DCHECK(allocator_type == kAllocatorTypeRegionTLAB);
+ DCHECK(region_space_ != nullptr);
+ if (space::RegionSpace::kRegionSize >= alloc_size) {
+ // Non-large. Check OOME for a tlab.
+ if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
+ space::RegionSpace::kRegionSize,
+ grow))) {
+ // Try to allocate a tlab.
+ if (!region_space_->AllocNewTlab(self)) {
+ // Failed to allocate a tlab. Try non-tlab.
+ return region_space_->AllocNonvirtual<false>(alloc_size,
+ bytes_allocated,
+ usable_size,
+ bytes_tl_bulk_allocated);
+ }
+ *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
+ // Fall-through to using the TLAB below.
+ } else {
+ // Check OOME for a non-tlab allocation.
+ if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) {
+ return region_space_->AllocNonvirtual<false>(alloc_size,
+ bytes_allocated,
+ usable_size,
+ bytes_tl_bulk_allocated);
+ }
+      // Neither tlab nor non-tlab works. Give up.
+ return nullptr;
+ }
+ } else {
+ // Large. Check OOME.
+ if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) {
+ return region_space_->AllocNonvirtual<false>(alloc_size,
+ bytes_allocated,
+ usable_size,
+ bytes_tl_bulk_allocated);
+ }
+ return nullptr;
+ }
+ }
+  // Refilled the TLAB; the allocation below cannot fail.
+ mirror::Object* ret = self->AllocTlab(alloc_size);
+ DCHECK(ret != nullptr);
+ *bytes_allocated = alloc_size;
+ *usable_size = alloc_size;
+ return ret;
+}
+
} // namespace gc
} // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 0c671d2..3a8e29b 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -854,6 +854,10 @@
allocator_type != kAllocatorTypeRegionTLAB;
}
static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
+ if (kUseReadBarrier) {
+    // The read barrier config may use the TLAB allocator but its GC is always concurrent.
+    // TODO: clean this up.
+ return true;
+ }
return
allocator_type != kAllocatorTypeBumpPointer &&
allocator_type != kAllocatorTypeTLAB;
@@ -923,11 +927,20 @@
size_t* bytes_tl_bulk_allocated)
REQUIRES_SHARED(Locks::mutator_lock_);
+ mirror::Object* AllocWithNewTLAB(Thread* self,
+ size_t alloc_size,
+ bool grow,
+ size_t* bytes_allocated,
+ size_t* usable_size,
+ size_t* bytes_tl_bulk_allocated)
+ REQUIRES_SHARED(Locks::mutator_lock_);
+
void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type)
REQUIRES_SHARED(Locks::mutator_lock_);
- template <bool kGrow>
- ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size);
+ ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+ size_t alloc_size,
+ bool grow);
// Run the finalizers. If timeout is non zero, then we use the VMRuntime version.
void RunFinalization(JNIEnv* env, uint64_t timeout);
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index d4c322e..870d1ae 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -630,7 +630,7 @@
}
static void ResetQuickAllocEntryPointsForThread(Thread* thread, void* arg ATTRIBUTE_UNUSED) {
- thread->ResetQuickAllocEntryPointsForThread();
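+  // Only install the marking alloc entrypoints when the CC GC is actually marking.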
+ thread->ResetQuickAllocEntryPointsForThread(kUseReadBarrier && thread->GetIsGcMarking());
}
void Instrumentation::SetEntrypointsInstrumented(bool instrumented) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index b99df26..debd13a 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -122,21 +122,27 @@
CHECK(kUseReadBarrier);
tls32_.is_gc_marking = is_marking;
UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+ if (kRuntimeISA == kX86_64) {
+ // Entrypoint switching is only implemented for X86_64.
+ ResetQuickAllocEntryPointsForThread(is_marking);
+ }
}
void Thread::InitTlsEntryPoints() {
// Insert a placeholder so we can easily tell if we call an unimplemented entry point.
uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.jni_entrypoints);
- uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) +
- sizeof(tlsPtr_.quick_entrypoints));
+ uintptr_t* end = reinterpret_cast<uintptr_t*>(
+ reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints));
for (uintptr_t* it = begin; it != end; ++it) {
*it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
}
InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints);
}
-void Thread::ResetQuickAllocEntryPointsForThread() {
- ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
+void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) {
+  // Entrypoint switching is currently only faster for X86_64 since other archs don't have a
+  // TLAB fast path for the non region space entrypoints.
+ ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking);
}
class DeoptimizationContextRecord {
diff --git a/runtime/thread.h b/runtime/thread.h
index b2983cc..2079646 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -989,7 +989,7 @@
tls32_.state_and_flags.as_atomic_int.FetchAndAndSequentiallyConsistent(-1 ^ flag);
}
- void ResetQuickAllocEntryPointsForThread();
+ void ResetQuickAllocEntryPointsForThread(bool is_marking);
// Returns the remaining space in the TLAB.
size_t TlabSize() const;