Revert "Implement rosalloc fast path in assembly for 32 bit arm."
Tentative revert: heap poisoning appears to break with this change, likely because the hand-written fast path stores the class pointer without poisoning the reference first.
Bug: 9986565
This reverts commit e6316940db61faead36f9642cce137d41fc8f606.
Change-Id: I5c63758221464fe319315f40ae79c656048faed0
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 930bb2c..d09631b 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -891,109 +891,7 @@
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
// Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
-ENTRY art_quick_alloc_object_rosalloc
- // Fast path rosalloc allocation.
- // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
- // r2, r3, r12: free.
- ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
- // Load the class (r2)
- ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- cbz r2, .Lart_quick_alloc_object_rosalloc_slow_path // Check null class
- // Check class status.
- ldr r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
- cmp r3, #MIRROR_CLASS_STATUS_INITIALIZED
- bne .Lart_quick_alloc_object_rosalloc_slow_path
- // Add a fake dependence from the
- // following access flag and size
- // loads to the status load.
- // This is to prevent those loads
- // from being reordered above the
- // status load and reading wrong
- // values (an alternative is to use
- // a load-acquire for the status).
- eor r3, r3, r3
- add r2, r2, r3
- // Check whether the access flags
- // include kAccClassIsFinalizable.
- ldr r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
- tst r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
- bne .Lart_quick_alloc_object_rosalloc_slow_path
-
- ldr r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
- // allocation stack has room.
- // TODO: consider using ldrd.
- ldr r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
- cmp r3, r12
- bhs .Lart_quick_alloc_object_rosalloc_slow_path
-
- ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (r3)
- cmp r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread
- // local allocation
- bhs .Lart_quick_alloc_object_rosalloc_slow_path
- // Compute the rosalloc bracket index
- // from the size: align the size up to
- // the bracket quantum, divide by the
- // quantum size, and subtract 1. The
- // code below is a shorter, equivalent
- // form of that computation.
- sub r3, r3, #1
- lsr r3, r3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
- // Load the rosalloc run (r12)
- add r12, r9, r3, lsl #POINTER_SIZE_SHIFT
- ldr r12, [r12, #THREAD_ROSALLOC_RUNS_OFFSET]
- // Load the free list head (r3). This
- // will be the return value.
- ldr r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
- cbz r3, .Lart_quick_alloc_object_rosalloc_slow_path
- // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
- ldr r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET] // Load the next pointer of the head
- // and update the list head with the
- // next pointer.
- str r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
- // Store the class pointer in the
- // header. This also overwrites the
- // next pointer. The offsets are
- // asserted to match.
-#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
-#error "Class pointer needs to overwrite next pointer."
-#endif
- str r2, [r3, #MIRROR_OBJECT_CLASS_OFFSET]
- // Push the new object onto the thread
- // local allocation stack and
- // increment the thread local
- // allocation stack top.
- ldr r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
- str r3, [r1], #COMPRESSED_REFERENCE_SIZE // (Increment r1 as a side effect.)
- str r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
- // Decrement the size of the free list
- ldr r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
- sub r1, #1
- // TODO: consider combining this store
- // and the list head store above using
- // strd.
- str r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
- // Fence. This is "ish" not "ishst" so
- // that the code after this allocation
- // site will see the right values in
- // the fields of the class.
- // (Alternatively we could use "ishst"
- // if we used load-acquire for the
- // class status load.)
- dmb ish
- mov r0, r3 // Set the return value and return.
- bx lr
-
-.Lart_quick_alloc_object_rosalloc_slow_path:
- SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r2, r3 @ save callee saves in case of GC
- mov r2, r9 @ pass Thread::Current
- bl artAllocObjectFromCodeRosAlloc @ (uint32_t type_idx, Method* method, Thread*)
- RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
- RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_rosalloc
+GENERATE_ALL_ALLOC_ENTRYPOINTS
/*
* Called by managed code when the value in rSUSPEND has been decremented to 0.
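For reference, the ARM fast path removed above reduces to the following logic. This is a minimal self-contained C++ sketch, not the runtime implementation: every type and name here (ToyThread, AllocFastPath, and so on) is an illustrative stand-in, and the class-resolution, class-status, and finalizable checks the assembly performs first are assumed to have already passed.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Illustrative stand-ins for RosAlloc::Run, its free list, and Thread.
    struct Slot { Slot* next; };                   // Word 0 of a free slot links the list.
    struct FreeList { Slot* head; uint32_t size; };
    struct Run { FreeList free_list; };

    struct ToyThread {
      Run* rosalloc_runs[34];                      // tlsPtr_.rosalloc_runs
      void** alloc_stack_top;                      // tlsPtr_.thread_local_alloc_stack_top
      void** alloc_stack_end;                      // tlsPtr_.thread_local_alloc_stack_end
    };

    constexpr size_t kMaxThreadLocalBracketSize = 128;  // bytes
    constexpr size_t kBracketQuantumSizeShift = 4;      // log2 of the 16-byte quantum

    // Returns the new object, or nullptr to mean "take the slow path".
    // Assumes klass is already resolved, initialized, and not finalizable,
    // and that object_size >= 1 (a heap object is never empty).
    void* AllocFastPath(ToyThread* self, void* klass, size_t object_size) {
      if (self->alloc_stack_top >= self->alloc_stack_end) {
        return nullptr;                            // No room on the thread-local alloc stack.
      }
      if (object_size >= kMaxThreadLocalBracketSize) {
        return nullptr;                            // Not a thread-local bracket size.
      }
      // Bracket index: align up to the quantum, divide by the quantum, subtract 1,
      // computed in the shorter equivalent form (size - 1) >> shift.
      size_t idx = (object_size - 1) >> kBracketQuantumSizeShift;
      Run* run = self->rosalloc_runs[idx];
      Slot* head = run->free_list.head;
      if (head == nullptr) {
        return nullptr;                            // Run exhausted.
      }
      // Pop the head slot. Storing the class pointer into word 0 of the object
      // also overwrites the stale next pointer; asm_support.h asserts that
      // ROSALLOC_SLOT_NEXT_OFFSET == MIRROR_OBJECT_CLASS_OFFSET for this reason.
      run->free_list.head = head->next;
      *reinterpret_cast<void**>(head) = klass;
      *self->alloc_stack_top++ = head;             // Push the object; bump the stack top.
      run->free_list.size -= 1;
      // Counterpart of the "dmb ish" fence: publish field values before the
      // object escapes to code that may read its class and fields.
      std::atomic_thread_fence(std::memory_order_seq_cst);
      return head;
    }

The key trick the sketch preserves is that word 0 of a free slot holds the free-list next pointer, so installing the class pointer both initializes the object header and unlinks the slot in a single store.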
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index fbacdbc..ef5edbb 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -113,8 +113,7 @@
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_dlmalloc_instrumented, DlMallocInstrumented)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_dlmalloc_instrumented, DlMallocInstrumented)
-// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
-// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_rosalloc, RosAlloc)
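The two lines swapped here restore the generic definition that the original change had carved out. The carve-out pattern itself is simple: the shared file skips generating one entrypoint so an architecture file can define the same symbol by hand. A toy C++ rendering of that pattern (names are illustrative; the real files are assembly macros):

    #include <cstdio>

    static void* GenericAlloc(unsigned size) {
      std::printf("generic allocation of %u bytes\n", size);
      return nullptr;
    }

    // Shared macro that stamps out one entrypoint per allocator suffix.
    #define GENERATE_ALLOC_ENTRYPOINT(suffix)                 \
      void* art_quick_alloc_object##suffix(unsigned size) {   \
        return GenericAlloc(size);                            \
      }

    // The shared file generates the common entrypoints...
    GENERATE_ALLOC_ENTRYPOINT(_dlmalloc)
    // ...but can deliberately skip _rosalloc, leaving an architecture file to
    // provide art_quick_alloc_object_rosalloc by hand under the same name.
    GENERATE_ALLOC_ENTRYPOINT(_rosalloc)  // Re-enabled generically by this revert.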
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 2f485ae..4a106e4 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -788,7 +788,6 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 95f0ccb..5c413d2 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -809,7 +809,6 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
DEFINE_FUNCTION art_quick_alloc_object_tlab
// Fast path tlab allocation.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index baf59e6..1b569fe 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -19,7 +19,6 @@
#if defined(__cplusplus)
#include "art_method.h"
-#include "gc/allocator/rosalloc.h"
#include "lock_word.h"
#include "mirror/class.h"
#include "mirror/string.h"
@@ -54,14 +53,6 @@
#define ADD_TEST_EQ(x, y)
#endif
-#if defined(__LP64__)
-#define POINTER_SIZE_SHIFT 3
-#else
-#define POINTER_SIZE_SHIFT 2
-#endif
-ADD_TEST_EQ(static_cast<size_t>(1U << POINTER_SIZE_SHIFT),
- static_cast<size_t>(__SIZEOF_POINTER__))
-
// Size of references to the heap on the stack.
#define STACK_REFERENCE_SIZE 4
ADD_TEST_EQ(static_cast<size_t>(STACK_REFERENCE_SIZE), sizeof(art::StackReference<art::mirror::Object>))
@@ -71,10 +62,6 @@
ADD_TEST_EQ(static_cast<size_t>(COMPRESSED_REFERENCE_SIZE),
sizeof(art::mirror::CompressedReference<art::mirror::Object>))
-#define COMPRESSED_REFERENCE_SIZE_SHIFT 2
-ADD_TEST_EQ(static_cast<size_t>(1U << COMPRESSED_REFERENCE_SIZE_SHIFT),
- static_cast<size_t>(COMPRESSED_REFERENCE_SIZE))
-
// Note: these callee save methods loads require read barriers.
// Offset of field Runtime::callee_save_methods_[kSaveAll]
#define RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET 0
@@ -133,18 +120,6 @@
#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
-// Offset of field Thread::tlsPtr_.rosalloc_runs.
-#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
-ADD_TEST_EQ(THREAD_ROSALLOC_RUNS_OFFSET,
- art::Thread::RosAllocRunsOffset<__SIZEOF_POINTER__>().Int32Value())
-// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_top.
-#define THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 34 * __SIZEOF_POINTER__)
-ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET,
- art::Thread::ThreadLocalAllocStackTopOffset<__SIZEOF_POINTER__>().Int32Value())
-// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_end.
-#define THREAD_LOCAL_ALLOC_STACK_END_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 35 * __SIZEOF_POINTER__)
-ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
- art::Thread::ThreadLocalAllocStackEndOffset<__SIZEOF_POINTER__>().Int32Value())
// Offsets within java.lang.Object.
#define MIRROR_OBJECT_CLASS_OFFSET 0
@@ -261,44 +236,6 @@
ADD_TEST_EQ(static_cast<uint32_t>(OBJECT_ALIGNMENT_MASK_TOGGLED),
~static_cast<uint32_t>(art::kObjectAlignment - 1))
-#define ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE 128
-ADD_TEST_EQ(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::kMaxThreadLocalBracketSize))
-
-#define ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT 4
-ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSizeShift))
-
-#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK 15
-ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
-
-#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32 0xfffffff0
-ADD_TEST_EQ(static_cast<uint32_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32),
- ~static_cast<uint32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
-
-#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64 0xfffffffffffffff0
-ADD_TEST_EQ(static_cast<uint64_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64),
- ~static_cast<uint64_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
-
-#define ROSALLOC_RUN_FREE_LIST_OFFSET 8
-ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_OFFSET,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListOffset()))
-
-#define ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET 0
-ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListHeadOffset()))
-
-#define ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET 16
-ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListSizeOffset()))
-
-#define ROSALLOC_SLOT_NEXT_OFFSET 0
-ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET,
- static_cast<int32_t>(art::gc::allocator::RosAlloc::RunSlotNextOffset()))
-// Assert this so that installing the class pointer also overwrites the next field, letting us skip a separate zeroing store.
-ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET, MIRROR_OBJECT_CLASS_OFFSET)
-
#if defined(__cplusplus)
} // End of CheckAsmSupportOffsets.
#endif
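Every constant deleted from asm_support.h above followed the file's standard pattern: a #define the assembler can consume, paired with an ADD_TEST_EQ check that pins it to the authoritative C++ value. A self-contained sketch of the same idea using plain static_assert (FreeListSketch and its layout are invented for illustration; the real checks run against the ART types through ADD_TEST_EQ):

    #include <cstddef>

    // Invented stand-in for a runtime type whose layout assembly depends on.
    struct FreeListSketch {
      void* head_;       // the field the fast path loads and stores
      void* tail_;
      unsigned size_;
    };

    // The numeric constant the assembler sees...
    #define FREE_LIST_HEAD_OFFSET 0
    // ...and a compile-time check that it matches the C++ layout, so the
    // build breaks if the struct is ever rearranged.
    static_assert(FREE_LIST_HEAD_OFFSET == offsetof(FreeListSketch, head_),
                  "assembly offset out of sync with C++ layout");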
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 3ce3d63..87f1392 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -131,7 +131,6 @@
private:
Slot* next_; // Next slot in the list.
- friend class RosAlloc;
};
// We use the tail (kUseTail == true) for the bulk or thread-local free lists to avoid the need to
@@ -303,7 +302,6 @@
// free without traversing the whole free list.
uint32_t size_;
uint32_t padding_ ATTRIBUTE_UNUSED;
- friend class RosAlloc;
};
// Represents a run of memory slots of the same size.
@@ -484,7 +482,7 @@
static constexpr uint8_t kMagicNumFree = 43;
// The number of size brackets. Sync this with the length of Thread::rosalloc_runs_.
static constexpr size_t kNumOfSizeBrackets = kNumRosAllocThreadLocalSizeBrackets;
- // The number of smaller size brackets that are the quantum size apart.
+ // The number of smaller size brackets that are 16 bytes apart.
static constexpr size_t kNumOfQuantumSizeBrackets = 32;
// The sizes (the slot sizes, in bytes) of the size brackets.
static size_t bracketSizes[kNumOfSizeBrackets];
@@ -522,7 +520,9 @@
}
// Returns true if the given allocation size is for a thread local allocation.
static bool IsSizeForThreadLocal(size_t size) {
- bool is_size_for_thread_local = size <= kMaxThreadLocalBracketSize;
+ DCHECK_GT(kNumThreadLocalSizeBrackets, 0U);
+ size_t max_thread_local_bracket_idx = kNumThreadLocalSizeBrackets - 1;
+ bool is_size_for_thread_local = size <= bracketSizes[max_thread_local_bracket_idx];
DCHECK(size > kLargeSizeThreshold ||
(is_size_for_thread_local == (SizeToIndex(size) < kNumThreadLocalSizeBrackets)));
return is_size_for_thread_local;
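The old and new bounds in IsSizeForThreadLocal agree numerically: for the small quantum-spaced brackets, bracketSizes[i] works out to 16 * (i + 1) (an assumption consistent with the 16-byte quantum; the real values are computed by RosAlloc), so with kNumThreadLocalSizeBrackets == 8 the table lookup yields 128, the same value the deleted kMaxThreadLocalBracketSize hard-coded. A quick check:

    #include <cassert>
    #include <cstddef>

    int main() {
      constexpr size_t kNumThreadLocalSizeBrackets = 8;
      constexpr size_t kBracketQuantumSize = 16;
      size_t bracketSizes[kNumThreadLocalSizeBrackets];
      for (size_t i = 0; i < kNumThreadLocalSizeBrackets; ++i) {
        bracketSizes[i] = kBracketQuantumSize * (i + 1);  // 16, 32, ..., 128
      }
      // The table-based bound restored by this revert equals the
      // constant-based bound (kMaxThreadLocalBracketSize == 128) it replaces.
      assert(bracketSizes[kNumThreadLocalSizeBrackets - 1] == 128);
      return 0;
    }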
@@ -634,16 +634,6 @@
// are less than this index. We use shared (current) runs for the rest.
static const size_t kNumThreadLocalSizeBrackets = 8;
- // The size of the largest bracket we use thread-local runs for.
- // This should be equal to bracketSizes[kNumThreadLocalSizeBrackets - 1].
- static const size_t kMaxThreadLocalBracketSize = 128;
-
- // The bracket size increment for the brackets of size <= 512 bytes.
- static constexpr size_t kBracketQuantumSize = 16;
-
- // Equal to Log2(kBracketQuantumSize).
- static constexpr size_t kBracketQuantumSizeShift = 4;
-
private:
// The base address of the memory region that's managed by this allocator.
uint8_t* base_;
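The deleted kBracketQuantumSize and kBracketQuantumSizeShift encode the same 16-byte quantum in two forms, and together they justify the shortcut the removed assembly comment describes: "align up to the quantum, divide by the quantum, subtract 1" collapses to (size - 1) >> shift for any size >= 1. A brute-force verification over the quantum-spaced range:

    #include <cassert>
    #include <cstddef>

    int main() {
      constexpr size_t kBracketQuantumSize = 16;
      constexpr size_t kBracketQuantumSizeShift = 4;
      static_assert(kBracketQuantumSize == (1u << kBracketQuantumSizeShift),
                    "shift must be log2 of the quantum");
      for (size_t size = 1; size <= 512; ++size) {
        // Long form: align up to the quantum, divide, subtract 1.
        size_t aligned = (size + kBracketQuantumSize - 1) & ~(kBracketQuantumSize - 1);
        size_t idx_long = aligned / kBracketQuantumSize - 1;
        // Short form used by the removed assembly.
        size_t idx_short = (size - 1) >> kBracketQuantumSizeShift;
        assert(idx_long == idx_short);
      }
      return 0;
    }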
@@ -780,19 +770,6 @@
size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
~RosAlloc();
- static size_t RunFreeListOffset() {
- return OFFSETOF_MEMBER(Run, free_list_);
- }
- static size_t RunFreeListHeadOffset() {
- return OFFSETOF_MEMBER(SlotFreeList<false>, head_);
- }
- static size_t RunFreeListSizeOffset() {
- return OFFSETOF_MEMBER(SlotFreeList<false>, size_);
- }
- static size_t RunSlotNextOffset() {
- return OFFSETOF_MEMBER(Slot, next_);
- }
-
// If kThreadUnsafe is true then the allocator may avoid acquiring some locks as an optimization.
// If used, this may cause race conditions if multiple threads are allocating at the same time.
template<bool kThreadSafe = true>
diff --git a/runtime/thread.h b/runtime/thread.h
index e8af88f..d262c62 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -633,24 +633,6 @@
return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_objects));
}
- template<size_t pointer_size>
- static ThreadOffset<pointer_size> RosAllocRunsOffset() {
- return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
- rosalloc_runs));
- }
-
- template<size_t pointer_size>
- static ThreadOffset<pointer_size> ThreadLocalAllocStackTopOffset() {
- return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
- thread_local_alloc_stack_top));
- }
-
- template<size_t pointer_size>
- static ThreadOffset<pointer_size> ThreadLocalAllocStackEndOffset() {
- return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
- thread_local_alloc_stack_end));
- }
-
// Size of stack less any space reserved for stack overflow
size_t GetStackSize() const {
return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
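The Thread accessors deleted above mirror the surviving ones: each returns the byte offset of a tlsPtr_ field so asm_support.h can expose it to assembly, which addresses the field relative to the thread register (r9 on ARM). A self-contained sketch of the pattern with invented toy types (the real code goes through ThreadOffsetFromTlsPtr and OFFSETOF_MEMBER; the 34-entry array matches the 34 * __SIZEOF_POINTER__ spacing in the deleted asm_support.h lines):

    #include <cstddef>
    #include <cstdint>

    // Invented stand-ins for Thread and its tls_ptr_sized_values block.
    struct TlsPtrSketch {
      void* rosalloc_runs[34];                // one run pointer per size bracket
      void* thread_local_alloc_stack_top;
      void* thread_local_alloc_stack_end;
    };

    struct ThreadSketch {
      uint32_t state_and_flags_;              // placeholder leading field
      TlsPtrSketch tlsPtr_;

      // Byte offset of a tlsPtr_ field from the start of the thread object,
      // which is what the assembly adds to the thread register to load it.
      static size_t AllocStackTopOffset() {
        return offsetof(ThreadSketch, tlsPtr_) +
               offsetof(TlsPtrSketch, thread_local_alloc_stack_top);
      }
    };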