Rosalloc fast path in assembly for x86_64.
Measurements (host, ms)
BinaryTrees: 324 -> 360 (+11%)
BinaryTrees with 64 MB alloc stack + 1 GB heap:
299 -> 275 (-8%)
MemAllocTest: 414 -> 368 (-11%)
Interestingly, BinaryTrees gets slower with the default settings due to
more blocking (gc-for-alloc) collections. It appears that because
allocations are faster, the allocation stack size and the heap size
become the bottleneck (note that both an allocation stack overflow and
heap exhaustion cause gc-for-alloc collections). With a larger
allocation stack and heap, where no blocking collections are observed,
BinaryTrees gets faster.
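
For orientation, the core of the new fast path (after the class status,
finalizability, size, and allocation stack checks) is a pop off the
thread-local rosalloc run's free list plus a push onto the allocation
stack. A minimal C++ sketch, using illustrative stand-in types rather
than the real runtime ones:

  #include <cstdint>

  // Illustrative stand-ins, not the real runtime types.
  struct Slot { Slot* next; };  // Free list node; 'next' is at the same
                                // offset as the object's class pointer.
  struct Run {
    Slot* free_list_head;
    uint32_t free_list_size;
  };

  // Pop a slot off the run's free list, publish it on the allocation
  // stack (as a 32-bit compressed reference), and store the class
  // pointer over the 'next' field.
  inline void* AllocFastPathCore(Run* run, uint32_t*& alloc_stack_top,
                                 void* klass) {
    Slot* slot = run->free_list_head;
    if (slot == nullptr) return nullptr;  // Slow path: run is exhausted.
    *alloc_stack_top++ =
        static_cast<uint32_t>(reinterpret_cast<uintptr_t>(slot));
    run->free_list_head = slot->next;
    *reinterpret_cast<void**>(slot) = klass;  // Overwrites 'next'.
    run->free_list_size--;
    return slot;
  }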
Bug: 9986565
Change-Id: I642b9fecd0a583cc133998c2f3932de815c4a757
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 95f0ccb..5fd8969 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -809,7 +809,90 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
+DEFINE_FUNCTION art_quick_alloc_object_rosalloc
+ // Fast path rosalloc allocation.
+ // RDI: type_idx, RSI: ArtMethod*, RAX: return value
+ // RDX, RCX, R8, R9: free.
+ movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array
+ // Load the class (edx)
+ movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
+ testl %edx, %edx // Check null class
+ jz .Lart_quick_alloc_object_rosalloc_slow_path
+ // Check class status.
+ cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
+ jne .Lart_quick_alloc_object_rosalloc_slow_path
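+ // If the class isn't initialized,
+ // the slow path initializes it.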
+ // We don't need a fence (between the
+ // status and the access flag
+ // loads) here because every load is
+ // a load acquire on x86.
+ // Check if the access flags have
+ // kAccClassIsFinalizable.
+ testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
+ jnz .Lart_quick_alloc_object_rosalloc_slow_path
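+ // Finalizable objects need to be
+ // registered with the heap, which
+ // the slow path handles.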
+ // Check if the thread local
+ // allocation stack has room.
+ movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread
+ movq THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%r8), %rcx // rcx = alloc stack top.
+ cmpq THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%r8), %rcx
+ jae .Lart_quick_alloc_object_rosalloc_slow_path
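+ // jae: top >= end means the stack
+ // is full.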
+ // Load the object size
+ movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %eax
+ // Check if the size is for a thread
+ // local allocation
+ cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %eax
+ ja .Lart_quick_alloc_object_rosalloc_slow_path
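+ // Larger sizes are served from
+ // shared runs under a lock, so use
+ // the slow path.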
+ // Compute the rosalloc bracket index
+ // from the size.
+ // Align up the size by the rosalloc
+ // bracket quantum size and divide
+ // by the quantum size and subtract
+ // by 1. This code is a shorter but
+ // equivalent version.
+ subq LITERAL(1), %rax
+ shrq LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %rax
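+ // E.g. with a 16-byte quantum
+ // (shift 4), sizes 1..16 map to
+ // index 0 and 17..32 map to index
+ // 1, matching (AlignUp(size, 16) / 16) - 1.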
+ // Load the rosalloc run (r9)
+ movq THREAD_ROSALLOC_RUNS_OFFSET(%r8, %rax, __SIZEOF_POINTER__), %r9
+ // Load the free list head (rax). This
+ // will be the return val.
+ movq (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9), %rax
+ testq %rax, %rax
+ jz .Lart_quick_alloc_object_rosalloc_slow_path
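+ // An empty free list means the
+ // thread local run is exhausted;
+ // the slow path refills it.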
+ // "Point of no slow path". Won't go to the slow path from here on. OK to clobber rdi and rsi.
+ // Push the new object onto the thread
+ // local allocation stack and
+ // increment the thread local
+ // allocation stack top.
+ movl %eax, (%rcx)
+ addq LITERAL(COMPRESSED_REFERENCE_SIZE), %rcx
+ movq %rcx, THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%r8)
+ // Load the next pointer of the head
+ // and update the list head with the
+ // next pointer.
+ movq ROSALLOC_SLOT_NEXT_OFFSET(%rax), %rcx
+ movq %rcx, (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9)
+ // Store the class pointer in the
+ // header. This also overwrites the
+ // next pointer. The offsets are
+ // asserted to match.
+#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
+#error "Class pointer needs to overwrite next pointer."
+#endif
+ POISON_HEAP_REF edx
+ movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%rax)
+ // Decrement the size of the free list
+ decl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)(%r9)
+ // No fence necessary for x86.
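+ // (x86 stores aren't reordered with
+ // other stores, so the class pointer
+ // store is visible before the object
+ // is published.)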
+ ret
+.Lart_quick_alloc_object_rosalloc_slow_path:
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME // save ref containing registers for GC
+ // Outgoing argument set up
+ movq %gs:THREAD_SELF_OFFSET, %rdx // pass Thread::Current()
+ call SYMBOL(artAllocObjectFromCodeRosAlloc) // cxx_name(arg0, arg1, Thread*)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
+END_FUNCTION art_quick_alloc_object_rosalloc
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
DEFINE_FUNCTION art_quick_alloc_object_tlab
// Fast path tlab allocation.