Assembly TLAB allocation fast path for arm.
Speedup (GSS GC with TLAB on N5):
BinaryTrees: 1872 -> 796 ms (-57%)
MemAllocTest: 2522 -> 2219 ms (-12%)
Bug: 9986565
Change-Id: Icb9d1259461f3abe83a4a82c8aff911937eaf57d
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index c4e314b..cfcef49 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -942,7 +942,86 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+ENTRY art_quick_alloc_object_tlab
+ // Fast path tlab allocation.
+ // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
+ // r2, r3, r12: free.
+#if defined(USE_READ_BARRIER)
+ eor r0, r0, r0 // Read barrier not supported here.
+ sub r0, r0, #1 // Return -1.
+ bx lr
+#endif
+ ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
+ // Load the class (r2)
+ ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+ cbz r2, .Lart_quick_alloc_object_tlab_slow_path // Check null class
+ // Check class status.
+ ldr r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
+ cmp r3, #MIRROR_CLASS_STATUS_INITIALIZED
+ bne .Lart_quick_alloc_object_tlab_slow_path
+ // Add a fake dependence from the
+ // following access flag and size
+ // loads to the status load.
+ // This is to prevent those loads
+ // from being reordered above the
+ // status load and reading wrong
+ // values (an alternative is to use
+ // a load-acquire for the status).
+ eor r3, r3, r3
+ add r2, r2, r3
+ // Check access flags has
+ // kAccClassIsFinalizable.
+ ldr r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+ tst r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+ bne .Lart_quick_alloc_object_tlab_slow_path
+ // Load thread_local_pos (r12) and
+ // thread_local_end (r3) with ldrd.
+ // Check constraints for ldrd.
+#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
+#error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
+#endif
+ ldrd r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
+ sub r12, r3, r12 // Compute the remaining buf size.
+ ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (r3).
+ cmp r3, r12 // Check if it fits. OK to do this
+ // before rounding up the object size
+ // assuming the buf size alignment.
+ bhi .Lart_quick_alloc_object_tlab_slow_path
+ // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+ // Round up the object size by the
+ // object alignment. (addr + 7) & ~7.
+ add r3, r3, #OBJECT_ALIGNMENT_MASK
+ and r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
+ // Reload old thread_local_pos (r0)
+ // for the return value.
+ ldr r0, [r9, #THREAD_LOCAL_POS_OFFSET]
+ add r1, r0, r3
+ str r1, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
+ ldr r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
+ add r1, r1, #1
+ str r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+ POISON_HEAP_REF r2
+ str r2, [r0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
+ // Fence. This is "ish" not "ishst" so
+ // that the code after this allocation
+ // site will see the right values in
+ // the fields of the class.
+ // Alternatively we could use "ishst"
+ // if we use load-acquire for the
+ // class status load.)
+ dmb ish
+ bx lr
+.Lart_quick_alloc_object_tlab_slow_path:
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r2, r3 // Save callee saves in case of GC.
+ mov r2, r9 // Pass Thread::Current.
+ bl artAllocObjectFromCodeTLAB // (uint32_t type_idx, Method* method, Thread*)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_tlab
+
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
ENTRY art_quick_alloc_object_rosalloc
// Fast path rosalloc allocation.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index eb3b7f3..879364e 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -121,32 +121,32 @@
ADD_TEST_EQ(THREAD_SELF_OFFSET,
art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.thread_local_objects.
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_CARD_TABLE_OFFSET + 168 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
+ art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.thread_local_pos.
-#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 169 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_POS_OFFSET (THREAD_LOCAL_OBJECTS_OFFSET + 2 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.thread_local_end.
#define THREAD_LOCAL_END_OFFSET (THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_LOCAL_END_OFFSET,
art::Thread::ThreadLocalEndOffset<__SIZEOF_POINTER__>().Int32Value())
-// Offset of field Thread::tlsPtr_.thread_local_objects.
-#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
-ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
- art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.mterp_current_ibase.
-#define THREAD_CURRENT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
+#define THREAD_CURRENT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_CURRENT_IBASE_OFFSET,
art::Thread::MterpCurrentIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.mterp_default_ibase.
-#define THREAD_DEFAULT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 4 * __SIZEOF_POINTER__)
+#define THREAD_DEFAULT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_DEFAULT_IBASE_OFFSET,
art::Thread::MterpDefaultIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.mterp_alt_ibase.
-#define THREAD_ALT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 5 * __SIZEOF_POINTER__)
+#define THREAD_ALT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 4 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_ALT_IBASE_OFFSET,
art::Thread::MterpAltIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.rosalloc_runs.
-#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 6 * __SIZEOF_POINTER__)
+#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 5 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_ROSALLOC_RUNS_OFFSET,
art::Thread::RosAllocRunsOffset<__SIZEOF_POINTER__>().Int32Value())
// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_top.
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index e72809b..c621672 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -119,10 +119,10 @@
// Skip across the entrypoints structures.
+ EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, thread_local_start, sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_pos, sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_pos, thread_local_end, sizeof(void*));
- EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_objects, sizeof(void*));
- EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, mterp_current_ibase, sizeof(void*));
+ EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, mterp_current_ibase, sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_current_ibase, mterp_default_ibase, sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_default_ibase, mterp_alt_ibase, sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_alt_ibase, rosalloc_runs, sizeof(void*));
diff --git a/runtime/thread.h b/runtime/thread.h
index 97c47e1..234750c 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1324,8 +1324,8 @@
instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
stacked_shadow_frame_record(nullptr), deoptimization_context_stack(nullptr),
frame_id_to_shadow_frame(nullptr), name(nullptr), pthread_self(0),
- last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
- thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+ last_no_thread_suspension_cause(nullptr), thread_local_objects(0),
+ thread_local_start(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr),
mterp_current_ibase(nullptr), mterp_default_ibase(nullptr), mterp_alt_ibase(nullptr),
thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr),
nested_signal_state(nullptr), flip_function(nullptr), method_verifier(nullptr),
@@ -1440,10 +1440,12 @@
QuickEntryPoints quick_entrypoints;
// Thread-local allocation pointer.
+ size_t thread_local_objects;
uint8_t* thread_local_start;
+ // thread_local_pos and thread_local_end must be consecutive for ldrd and are 8 byte aligned for
+ // potentially better performance.
uint8_t* thread_local_pos;
uint8_t* thread_local_end;
- size_t thread_local_objects;
// Mterp jump table bases.
void* mterp_current_ibase;