Assembly region TLAB allocation fast path for arm.
This is for the concurrent copying (CC) collector.
The fast path code is shared with the existing TLAB entrypoint via a common macro (ALLOC_OBJECT_TLAB_FAST_PATH).
Speedup (on N5):
BinaryTrees: 2291 -> 902 ms (-60%)
MemAllocTest: 2137 -> 1845 ms (-14%)
Bug: 9986565
Bug: 12687968
Change-Id: Ica63094ec2f85eaa4fd04d202a20090399275d85
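For context on what the shared macro does, below is a C++ sketch of the bump-pointer fast path it implements in assembly. This is illustrative only, not ART's runtime code: the struct layouts and names (Class, Object, Thread, tlab_pos, AllocObjectTlabFastPath) are hypothetical stand-ins for the fields the assembly reaches through the offsets named in the diff (THREAD_LOCAL_POS_OFFSET, MIRROR_CLASS_STATUS_OFFSET, and so on).

  // Illustrative sketch of ALLOC_OBJECT_TLAB_FAST_PATH. All types and
  // field names here are hypothetical stand-ins for the ART structures
  // the assembly accesses via fixed offsets.
  #include <atomic>
  #include <cstddef>
  #include <cstdint>

  constexpr size_t kObjectAlignment = 8;       // OBJECT_ALIGNMENT_MASK == 7.
  constexpr uint32_t kStatusInitialized = 10;  // MIRROR_CLASS_STATUS_INITIALIZED (assumed value).
  constexpr uint32_t kAccClassIsFinalizable = 0x80000000;

  struct Class {
    uint32_t status;        // MIRROR_CLASS_STATUS_OFFSET
    uint32_t access_flags;  // MIRROR_CLASS_ACCESS_FLAGS_OFFSET
    uint32_t object_size;   // MIRROR_CLASS_OBJECT_SIZE_OFFSET
  };

  struct Object {
    Class* klass;  // MIRROR_OBJECT_CLASS_OFFSET
  };

  struct Thread {
    uint8_t* tlab_pos;     // THREAD_LOCAL_POS_OFFSET
    uint8_t* tlab_end;     // THREAD_LOCAL_END_OFFSET (pos + 4, 8-byte aligned, so ldrd works)
    size_t tlab_objects;   // THREAD_LOCAL_OBJECTS_OFFSET
  };

  // Returns nullptr when the caller must take the slow path
  // (artAllocObjectFromCodeTLAB / artAllocObjectFromCodeRegionTLAB).
  Object* AllocObjectTlabFastPath(Thread* self, Class* klass) {
    if (klass == nullptr) return nullptr;                     // Unresolved class.
    if (klass->status != kStatusInitialized) return nullptr;  // Class not initialized.
    if (klass->access_flags & kAccClassIsFinalizable) return nullptr;
    size_t remaining = static_cast<size_t>(self->tlab_end - self->tlab_pos);
    size_t size = klass->object_size;
    // Safe to compare before rounding up: the TLAB size is a multiple of
    // the object alignment, so a fitting size still fits after rounding.
    if (size > remaining) return nullptr;                     // TLAB exhausted.
    size = (size + kObjectAlignment - 1) & ~(kObjectAlignment - 1);
    Object* obj = reinterpret_cast<Object*>(self->tlab_pos);  // Bump the pointer.
    self->tlab_pos += size;
    self->tlab_objects++;
    obj->klass = klass;
    // The assembly issues "dmb ish" here so that readers of the new object
    // also observe the initialized fields of its class.
    std::atomic_thread_fence(std::memory_order_seq_cst);
    return obj;
  }

The region TLAB entrypoint wraps the same fast path with one extra step: it loads Thread::tls32_.is_gc_marking (via THREAD_IS_GC_MARKING_OFFSET, newly exported in asm_support.h) and, when concurrent marking is active, calls artReadBarrierMark on the loaded class before allocating.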
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index cfcef49..9271493 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -943,85 +943,6 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
-ENTRY art_quick_alloc_object_tlab
- // Fast path tlab allocation.
- // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
- // r2, r3, r12: free.
-#if defined(USE_READ_BARRIER)
- eor r0, r0, r0 // Read barrier not supported here.
- sub r0, r0, #1 // Return -1.
- bx lr
-#endif
- ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
- // Load the class (r2)
- ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- cbz r2, .Lart_quick_alloc_object_tlab_slow_path // Check null class
- // Check class status.
- ldr r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
- cmp r3, #MIRROR_CLASS_STATUS_INITIALIZED
- bne .Lart_quick_alloc_object_tlab_slow_path
- // Add a fake dependence from the
- // following access flag and size
- // loads to the status load.
- // This is to prevent those loads
- // from being reordered above the
- // status load and reading wrong
- // values (an alternative is to use
- // a load-acquire for the status).
- eor r3, r3, r3
- add r2, r2, r3
- // Check access flags has
- // kAccClassIsFinalizable.
- ldr r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
- tst r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
- bne .Lart_quick_alloc_object_tlab_slow_path
- // Load thread_local_pos (r12) and
- // thread_local_end (r3) with ldrd.
- // Check constraints for ldrd.
-#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
-#error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
-#endif
- ldrd r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
- sub r12, r3, r12 // Compute the remaining buf size.
- ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (r3).
- cmp r3, r12 // Check if it fits. OK to do this
- // before rounding up the object size
- // assuming the buf size alignment.
- bhi .Lart_quick_alloc_object_tlab_slow_path
- // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
- // Round up the object size by the
- // object alignment. (addr + 7) & ~7.
- add r3, r3, #OBJECT_ALIGNMENT_MASK
- and r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
- // Reload old thread_local_pos (r0)
- // for the return value.
- ldr r0, [r9, #THREAD_LOCAL_POS_OFFSET]
- add r1, r0, r3
- str r1, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
- ldr r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
- add r1, r1, #1
- str r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
- POISON_HEAP_REF r2
- str r2, [r0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
- // Fence. This is "ish" not "ishst" so
- // that the code after this allocation
- // site will see the right values in
- // the fields of the class.
- // Alternatively we could use "ishst"
- // if we use load-acquire for the
- // class status load.)
- dmb ish
- bx lr
-.Lart_quick_alloc_object_tlab_slow_path:
- SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r2, r3 // Save callee saves in case of GC.
- mov r2, r9 // Pass Thread::Current.
- bl artAllocObjectFromCodeTLAB // (uint32_t type_idx, Method* method, Thread*)
- RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
- RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_tlab
-
-
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
ENTRY art_quick_alloc_object_rosalloc
// Fast path rosalloc allocation.
@@ -1125,6 +1046,127 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
END art_quick_alloc_object_rosalloc
+// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+//
+// r0: type_idx/return value, r1: ArtMethod*, r2: class, r9: Thread::Current, r3, r12: free.
+// Need to preserve r0 and r1 for the slow path.
+.macro ALLOC_OBJECT_TLAB_FAST_PATH slowPathLabel
+ cbz r2, \slowPathLabel // Check null class
+ // Check class status.
+ ldr r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
+ cmp r3, #MIRROR_CLASS_STATUS_INITIALIZED
+ bne \slowPathLabel
+ // Add a fake dependence from the
+ // following access flag and size
+ // loads to the status load.
+ // This is to prevent those loads
+ // from being reordered above the
+ // status load and reading wrong
+ // values (an alternative is to use
+ // a load-acquire for the status).
+ eor r3, r3, r3
+ add r2, r2, r3
+ // Check that access flags don't
+ // have kAccClassIsFinalizable.
+ ldr r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+ tst r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+ bne \slowPathLabel
+ // Load thread_local_pos (r12) and
+ // thread_local_end (r3) with ldrd.
+ // Check constraints for ldrd.
+#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
+#error "Thread::thread_local_pos/end must be consecutive and 8 byte aligned for performance"
+#endif
+ ldrd r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
+ sub r12, r3, r12 // Compute the remaining buf size.
+ ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (r3).
+ cmp r3, r12 // Check if it fits. OK to do this
+ // before rounding up the object size
+ // assuming the buf size alignment.
+ bhi \slowPathLabel
+ // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+ // Round up the object size by the
+ // object alignment: (size + 7) & ~7.
+ add r3, r3, #OBJECT_ALIGNMENT_MASK
+ and r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
+ // Reload old thread_local_pos (r0)
+ // for the return value.
+ ldr r0, [r9, #THREAD_LOCAL_POS_OFFSET]
+ add r1, r0, r3
+ str r1, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
+ ldr r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
+ add r1, r1, #1
+ str r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+ POISON_HEAP_REF r2
+ str r2, [r0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
+ // Fence. This is "ish" not "ishst" so
+ // that the code after this allocation
+ // site will see the right values in
+ // the fields of the class.
+ // (Alternatively we could use "ishst"
+ // if we use load-acquire for the
+ // class status load.)
+ dmb ish
+ bx lr
+.endm
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+ENTRY art_quick_alloc_object_tlab
+ // Fast path tlab allocation.
+ // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
+ // r2, r3, r12: free.
+#if defined(USE_READ_BARRIER)
+ eor r0, r0, r0 // Read barrier not supported here.
+ sub r0, r0, #1 // Return -1.
+ bx lr
+#endif
+ ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
+ // Load the class (r2)
+ ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_tlab_slow_path
+.Lart_quick_alloc_object_tlab_slow_path:
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r2, r3 // Save callee saves in case of GC.
+ mov r2, r9 // Pass Thread::Current.
+ bl artAllocObjectFromCodeTLAB // (uint32_t type_idx, Method* method, Thread*)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_tlab
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+ENTRY art_quick_alloc_object_region_tlab
+ // Fast path region tlab allocation.
+ // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current, r2, r3, r12: free.
+#if !defined(USE_READ_BARRIER)
+ eor r0, r0, r0 // Read barrier must be enabled here.
+ sub r0, r0, #1 // Return -1.
+ bx lr
+#endif
+ ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
+ // Load the class (r2)
+ ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+ // Read barrier for class load.
+ ldr r3, [r9, #THREAD_IS_GC_MARKING_OFFSET]
+ cbnz r3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+ // The read barrier slow path. Mark
+ // the class.
+ push {r0, r1, r3, lr} // Save registers. r3 is pushed only
+ // to align sp by 16 bytes.
+ mov r0, r2 // Pass the class as the first param.
+ bl artReadBarrierMark
+ mov r2, r0 // Get the (marked) class back.
+ pop {r0, r1, r3, lr}
+ b .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+.Lart_quick_alloc_object_region_tlab_slow_path:
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r2, r3 // Save callee saves in case of GC.
+ mov r2, r9 // Pass Thread::Current.
+ bl artAllocObjectFromCodeRegionTLAB // (uint32_t type_idx, Method* method, Thread*)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_region_tlab
+
/*
* Called by managed code when the value in rSUSPEND has been decremented to 0.
*/
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index e848008..e4c2558 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1537,7 +1537,7 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
ENTRY art_quick_alloc_object_rosalloc
// Fast path rosalloc allocation.
@@ -1638,6 +1638,9 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
END art_quick_alloc_object_rosalloc
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+
/*
* Called by managed code when the thread has been asked to suspend.
*/
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 699ab3e..882a169 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1313,7 +1313,7 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
ENTRY art_quick_alloc_object_rosalloc
@@ -1421,6 +1421,9 @@
END art_quick_alloc_object_rosalloc
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+
/*
* Entry from managed code to resolve a string, this stub will allocate a String and deliver an
* exception on error. On success the String is returned. A0 holds the string index. The fast
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index d264c9b..2f48b88 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1367,7 +1367,7 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
ENTRY art_quick_alloc_object_rosalloc
@@ -1467,6 +1467,9 @@
END art_quick_alloc_object_rosalloc
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+
/*
* Entry from managed code to resolve a string, this stub will allocate a String and deliver an
* exception on error. On success the String is returned. A0 holds the string index. The fast
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index fbacdbc..290769b 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -219,7 +219,8 @@
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_instrumented, RegionInstrumented)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_instrumented, RegionInstrumented)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index fbee5d7..f83ad05 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -887,8 +887,8 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_FUNCTION art_quick_alloc_object_rosalloc
-
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 69caec8..dee8d3c 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -809,6 +809,7 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
DEFINE_FUNCTION art_quick_alloc_object_rosalloc
// Fast path rosalloc allocation.
@@ -943,6 +944,8 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_FUNCTION art_quick_alloc_object_tlab
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 879364e..d5f0dff 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -101,6 +101,11 @@
ADD_TEST_EQ(THREAD_ID_OFFSET,
art::Thread::ThinLockIdOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tls32_.is_gc_marking.
+#define THREAD_IS_GC_MARKING_OFFSET 52
+ADD_TEST_EQ(THREAD_IS_GC_MARKING_OFFSET,
+ art::Thread::IsGcMarkingOffset<__SIZEOF_POINTER__>().Int32Value())
+
// Offset of field Thread::tlsPtr_.card_table.
#define THREAD_CARD_TABLE_OFFSET 128
ADD_TEST_EQ(THREAD_CARD_TABLE_OFFSET,