Align allocation entrypoint implementations across arm/arm64/x86/x64.

x64:
- Add art_quick_alloc_object_initialized_rosalloc

x86:
- Add art_quick_alloc_object_initialized_rosalloc
- Add art_quick_alloc_object_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}{_region}_tlab

arm32:
- Add art_quick_alloc_object_initialized_rosalloc
- Add art_quick_alloc_object_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}{_region}_tlab

arm64:
- Add art_quick_alloc_object_initialized_rosalloc
- Add art_quick_alloc_object_initialized{_region}_tlab
- Add art_quick_alloc_array_resolved{8,16,32,64}_tlab
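
The compiler side of this change has each backend pick a size-specialized
array allocation entrypoint when the component size is known statically
(see the VisitNewArray hunks in code_generator_arm.cc and
code_generator_x86.cc below). A minimal C++ sketch of that kind of
selection, using illustrative names rather than the actual
CodeGenerator::GetArrayAllocationEntrypoint signature:

    #include <cstddef>

    enum class ArrayAllocEntrypoint {
      kResolved,      // component size not known statically; generic entrypoint
      kResolved8,
      kResolved16,
      kResolved32,
      kResolved64,
    };

    // Sketch only: a component_size of 0 stands for "unknown at compile time".
    ArrayAllocEntrypoint SelectArrayAllocEntrypoint(std::size_t component_size) {
      switch (component_size) {
        case 1:  return ArrayAllocEntrypoint::kResolved8;
        case 2:  return ArrayAllocEntrypoint::kResolved16;
        case 4:  return ArrayAllocEntrypoint::kResolved32;
        case 8:  return ArrayAllocEntrypoint::kResolved64;
        default: return ArrayAllocEntrypoint::kResolved;
      }
    }

On the runtime side, the size variants differ only in their assembly fast
paths; in this patch they all fall back to the same
artAllocArrayFromCodeResolved* C++ entrypoints.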

Test: test-art-target test-art-host
Bug: 30933338

Change-Id: I0dd8667a2921dd0b3403bea5d05304ba5d40627f
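
For reference, the hand-written *_tlab and *_region_tlab array stubs in
the patch below all share the same bump-pointer fast path. A rough C-level
sketch of that logic, with field and function names that are illustrative
rather than ART's:

    #include <cstddef>
    #include <cstdint>

    struct TlabState {            // stand-in for the Thread fields the stubs touch
      std::uint8_t* pos;          // THREAD_LOCAL_POS_OFFSET
      std::uint8_t* end;          // THREAD_LOCAL_END_OFFSET
      std::size_t   objects;      // THREAD_LOCAL_OBJECTS_OFFSET
    };

    // Returns nullptr when the request does not fit, which corresponds to the
    // branch to the .Lslow_path label (the artAllocArrayFromCode* runtime call).
    // The COMPUTE_ARRAY_SIZE_* macros have already rejected negative and
    // large-object counts before this point.
    void* AllocArrayTlabFastPath(TlabState* self, std::int32_t length,
                                 std::size_t data_offset,
                                 std::size_t component_size) {
      std::size_t size =
          data_offset + static_cast<std::size_t>(length) * component_size;
      size = (size + 7u) & ~static_cast<std::size_t>(7);  // 8-byte align, as the asm does
      if (size > static_cast<std::size_t>(self->end - self->pos)) {
        return nullptr;                                   // take the slow path
      }
      std::uint8_t* obj = self->pos;
      self->pos += size;                                  // bump thread_local_pos
      self->objects++;                                    // bump thread_local_objects
      // The stubs then store the (heap-poisoned) class pointer and the array
      // length into the new object's header and, on ARM, issue a dmb ish fence.
      return obj;
    }

When the fast path bails out, the stubs set up a SAVE_REFS_ONLY frame and
call the corresponding artAllocArrayFromCode* C++ entrypoint.
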
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f5b6ebe..1893029 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -3993,8 +3993,11 @@
 void InstructionCodeGeneratorARM::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
+  DCHECK(!codegen_->IsLeafMethod());
 }
 
 void LocationsBuilderARM::VisitParameterValue(HParameterValue* instruction) {
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 1b74316..3e795c7 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -4214,7 +4214,9 @@
 void InstructionCodeGeneratorX86::VisitNewArray(HNewArray* instruction) {
   // Note: if heap poisoning is enabled, the entry point takes care
   // of poisoning the reference.
-  codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc());
+  QuickEntrypointEnum entrypoint =
+      CodeGenerator::GetArrayAllocationEntrypoint(instruction->GetLoadClass()->GetClass());
+  codegen_->InvokeRuntime(entrypoint, instruction, instruction->GetDexPc());
   CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>();
   DCHECK(!codegen_->IsLeafMethod());
 }
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index ed36436..a443a40 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1086,11 +1086,37 @@
     DELIVER_PENDING_EXCEPTION_FRAME_READY
 END art_quick_resolve_string
 
+
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+// Comment out allocators that have arm specific asm.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
 
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_rosalloc, RosAlloc).
-ENTRY art_quick_alloc_object_resolved_rosalloc
+.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name
+ENTRY \c_name
     // Fast path rosalloc allocation.
     // r0: type/return value, r9: Thread::Current
     // r1, r2, r3, r12: free.
@@ -1099,13 +1125,13 @@
                                                               // TODO: consider using ldrd.
     ldr    r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
     cmp    r3, r12
-    bhs    .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    bhs    .Lslow_path\c_name
 
     ldr    r3, [r0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3)
     cmp    r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
                                                               // local allocation. Also does the
                                                               // initialized and finalizable checks.
-    bhs    .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    bhs    .Lslow_path\c_name
                                                               // Compute the rosalloc bracket index
                                                               // from the size. Since the size is
                                                               // already aligned we can combine the
@@ -1119,7 +1145,7 @@
                                                               // Load the free list head (r3). This
                                                               // will be the return val.
     ldr    r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
-    cbz    r3, .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    cbz    r3, .Lslow_path\c_name
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
     ldr    r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET]               // Load the next pointer of the head
                                                               // and update the list head with the
@@ -1164,16 +1190,20 @@
     mov    r0, r3                                             // Set the return value and return.
     bx     lr
 
-.Lart_quick_alloc_object_resolved_rosalloc_slow_path:
+.Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME r2     @ save callee saves in case of GC
     mov    r1, r9                     @ pass Thread::Current
-    bl     artAllocObjectFromCodeResolvedRosAlloc     @ (mirror::Class* cls, Thread*)
+    bl     \cxx_name                  @ (mirror::Class* cls, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_resolved_rosalloc
+END \c_name
+.endm
 
-// The common fast path code for art_quick_alloc_object_resolved_tlab
-// and art_quick_alloc_object_resolved_region_tlab.
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+
+// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
 //
 // r0: type r9: Thread::Current, r1, r2, r3, r12: free.
 // Need to preserve r0 to the slow path.
@@ -1212,41 +1242,173 @@
     bx     lr
 .endm
 
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_tlab, TLAB).
-ENTRY art_quick_alloc_object_resolved_tlab
+// The common code for art_quick_alloc_object_resolved/initialized{_region}_tlab
+.macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint
+ENTRY \name
     // Fast path tlab allocation.
     // r0: type, r9: Thread::Current
     // r1, r2, r3, r12: free.
-#if defined(USE_READ_BARRIER)
-    mvn    r0, #0                                             // Read barrier not supported here.
-    bx     lr                                                 // Return -1.
-#endif
-    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path
-.Lart_quick_alloc_object_resolved_tlab_slow_path:
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\name
+.Lslow_path\name:
     SETUP_SAVE_REFS_ONLY_FRAME r2                             // Save callee saves in case of GC.
     mov    r1, r9                                             // Pass Thread::Current.
-    bl     artAllocObjectFromCodeResolvedTLAB                 // (mirror::Class* klass, Thread*)
+    bl     \entrypoint                                        // (mirror::Class* klass, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_resolved_tlab
+END \name
+.endm
 
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
-ENTRY art_quick_alloc_object_resolved_region_tlab
-    // Fast path tlab allocation.
-    // r0: type, r9: Thread::Current, r1, r2, r3, r12: free.
-#if !defined(USE_READ_BARRIER)
-    eor    r0, r0, r0                                         // Read barrier must be enabled here.
-    sub    r0, r0, #1                                         // Return -1.
-    bx     lr
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB
+
+
+// The common fast path code for art_quick_alloc_array_resolved/initialized_tlab
+// and art_quick_alloc_array_resolved/initialized_region_tlab.
+//
+// r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free.
+// Need to preserve r0 and r1 to the slow path.
+.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
+    and    r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED             // Apply alignment mask
+                                                              // (addr + 7) & ~7.
+
+                                                              // Load thread_local_pos (r3) and
+                                                              // thread_local_end (r12) with ldrd.
+                                                              // Check constraints for ldrd.
+#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
+#error "Thread::thread_local_pos/end must be consecutive and 8 byte aligned for performance"
 #endif
-    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
-.Lart_quick_alloc_object_resolved_region_tlab_slow_path:
-    SETUP_SAVE_REFS_ONLY_FRAME r2                             // Save callee saves in case of GC.
-    mov    r1, r9                                             // Pass Thread::Current.
-    bl     artAllocObjectFromCodeResolvedRegionTLAB           // (mirror::Class* klass, Thread*)
+    ldrd   r3, r12, [r9, #THREAD_LOCAL_POS_OFFSET]
+    sub    r12, r12, r3                                       // Compute the remaining buf size.
+    cmp    r2, r12                                            // Check if the total_size fits.
+    bhi    \slowPathLabel
+    // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+    add    r2, r2, r3
+    str    r2, [r9, #THREAD_LOCAL_POS_OFFSET]                 // Store new thread_local_pos.
+    ldr    r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]             // Increment thread_local_objects.
+    add    r2, r2, #1
+    str    r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+    POISON_HEAP_REF r0
+    str    r0, [r3, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
+    str    r1, [r3, #MIRROR_ARRAY_LENGTH_OFFSET]              // Store the array length.
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that the code after this allocation
+                                                              // site will see the right values in
+                                                              // the fields of the class.
+                                                              // Alternatively we could use "ishst"
+                                                              // if we use load-acquire for the
+                                                              // object size load.)
+    mov    r0, r3
+    dmb    ish
+    bx     lr
+.endm
+
+.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup
+ENTRY \name
+    // Fast path array allocation for region tlab allocation.
+    // r0: mirror::Class* type
+    // r1: int32_t component_count
+    // r9: thread
+    // r2, r3, r12: free.
+    \size_setup .Lslow_path\name
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name
+.Lslow_path\name:
+    // r0: mirror::Class* klass
+    // r1: int32_t component_count
+    // r2: Thread* self
+    SETUP_SAVE_REFS_ONLY_FRAME r2  // save callee saves in case of GC
+    mov    r2, r9                  // pass Thread::Current
+    bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_resolved_region_tlab
+END \name
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_UNKNOWN slow_path
+    bkpt                                                    // We should never enter here.
+                                                            // Code below is for reference.
+                                                            // Possibly a large object, go slow.
+                                                            // Also does negative array size check.
+    movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8)
+    cmp r1, r2
+    bhi \slow_path
+                                                            // Array classes are never finalizable
+                                                            // or uninitialized, no need to check.
+    ldr    r3, [r0, #MIRROR_CLASS_COMPONENT_TYPE_OFFSET]    // Load component type
+    UNPOISON_HEAP_REF r3
+    ldr    r3, [r3, #MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET]
+    lsr    r3, r3, #PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT         // Component size shift is in high 16
+                                                            // bits.
+    lsl    r2, r1, r3                                       // Calculate data size
+                                                            // Add array data offset and alignment.
+    add    r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4
+#error Long array data offset must be 4 greater than int array data offset.
+#endif
+
+    add    r3, r3, #1                                       // Add 4 to the size only if the
+                                                            // component size shift is 3
+                                                            // (for 64 bit alignment).
+    and    r3, r3, #4
+    add    r2, r2, r3
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_8 slow_path
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    movw r2, #(MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET)
+    cmp r1, r2
+    bhi \slow_path
+    // Add array data offset and alignment.
+    add    r2, r1, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_16 slow_path
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 2)
+    cmp r1, r2
+    bhi \slow_path
+    lsl    r2, r1, #1
+    // Add array data offset and alignment.
+    add    r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_32 slow_path
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 4)
+    cmp r1, r2
+    bhi \slow_path
+    lsl    r2, r1, #2
+    // Add array data offset and alignment.
+    add    r2, r2, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+.macro COMPUTE_ARRAY_SIZE_64 slow_path
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_LONG_ARRAY_DATA_OFFSET) / 8)
+    cmp r1, r2
+    bhi \slow_path
+    lsl    r2, r1, #3
+    // Add array data offset and alignment.
+    add    r2, r2, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
+.endm
+
+# TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm, remove
+# the entrypoint once all backends have been updated to use the size variants.
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64
 
     /*
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 6a2034f..219d8b4 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1626,7 +1626,7 @@
 END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
 // Comment out allocators that have arm64 specific asm.
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
@@ -1640,8 +1640,20 @@
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
 
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc).
-ENTRY art_quick_alloc_object_resolved_rosalloc
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
+
+.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name
+ENTRY \c_name
     // Fast path rosalloc allocation.
     // x0: type, xSELF(x19): Thread::Current
     // x1-x7: free.
@@ -1650,13 +1662,13 @@
                                                               // ldp won't work due to large offset.
     ldr    x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
     cmp    x3, x4
-    bhs    .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    bhs    .Lslow_path\c_name
     ldr    w3, [x0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x3)
     cmp    x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
                                                               // local allocation. Also does the
                                                               // finalizable and initialization
                                                               // checks.
-    bhs    .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    bhs    .Lslow_path\c_name
                                                               // Compute the rosalloc bracket index
                                                               // from the size. Since the size is
                                                               // already aligned we can combine the
@@ -1669,7 +1681,7 @@
                                                               // Load the free list head (x3). This
                                                               // will be the return val.
     ldr    x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
-    cbz    x3, .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    cbz    x3, .Lslow_path\c_name
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1.
     ldr    x1, [x3, #ROSALLOC_SLOT_NEXT_OFFSET]               // Load the next pointer of the head
                                                               // and update the list head with the
@@ -1713,13 +1725,65 @@
 
     mov    x0, x3                                             // Set the return value and return.
     ret
-.Lart_quick_alloc_object_resolved_rosalloc_slow_path:
+.Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME                      // save callee saves in case of GC
     mov    x1, xSELF                                // pass Thread::Current
-    bl     artAllocObjectFromCodeResolvedRosAlloc   // (mirror::Class* klass, Thread*)
+    bl     \cxx_name
     RESTORE_SAVE_REFS_ONLY_FRAME
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_resolved_rosalloc
+END \c_name
+.endm
+
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+
+.macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel
+    ldr    x4, [xSELF, #THREAD_LOCAL_POS_OFFSET]
+    ldr    x5, [xSELF, #THREAD_LOCAL_END_OFFSET]
+    ldr    w7, [x0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x7).
+    add    x6, x4, x7                                         // Add object size to tlab pos.
+    cmp    x6, x5                                             // Check if it fits, overflow works
+                                                              // since the tlab pos and end are 32
+                                                              // bit values.
+    bhi    \slowPathLabel
+    str    x6, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
+    ldr    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
+    add    x5, x5, #1
+    str    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
+    POISON_HEAP_REF w0
+    str    w0, [x4, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that the code after this allocation
+                                                              // site will see the right values in
+                                                              // the fields of the class.
+                                                              // Alternatively we could use "ishst"
+                                                              // if we use load-acquire for the
+                                                              // object size load.)
+    mov    x0, x4
+    dmb    ish
+    ret
+.endm
+
+// The common code for art_quick_alloc_object_resolved/initialized{_region}_tlab
+.macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint
+ENTRY \name
+    // Fast path region tlab allocation.
+    // x0: type, xSELF(x19): Thread::Current
+    // x1-x7: free.
+    ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED .Lslow_path\name
+.Lslow_path\name:
+    SETUP_SAVE_REFS_ONLY_FRAME                 // Save callee saves in case of GC.
+    mov    x1, xSELF                           // Pass Thread::Current.
+    bl     \entrypoint                         // (mirror::Class*, Thread*)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END \name
+.endm
+
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB
+GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB
 
 .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel, xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2
     and    \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignment mask
@@ -1759,93 +1823,12 @@
     ret
 .endm
 
-// TODO: delete ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since it is the same as
-// ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED.
-.macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel
-    ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED \slowPathLabel
-.endm
-
-.macro ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED slowPathLabel
-    ldr    x4, [xSELF, #THREAD_LOCAL_POS_OFFSET]
-    ldr    x5, [xSELF, #THREAD_LOCAL_END_OFFSET]
-    ldr    w7, [x0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x7).
-    add    x6, x4, x7                                         // Add object size to tlab pos.
-    cmp    x6, x5                                             // Check if it fits, overflow works
-                                                              // since the tlab pos and end are 32
-                                                              // bit values.
-    bhi    \slowPathLabel
-    str    x6, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
-    ldr    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
-    add    x5, x5, #1
-    str    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
-    POISON_HEAP_REF w0
-    str    w0, [x4, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
-                                                              // Fence. This is "ish" not "ishst" so
-                                                              // that the code after this allocation
-                                                              // site will see the right values in
-                                                              // the fields of the class.
-                                                              // Alternatively we could use "ishst"
-                                                              // if we use load-acquire for the
-                                                              // object size load.)
-    mov    x0, x4
-    dmb    ish
-    ret
-.endm
-
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB).
-ENTRY art_quick_alloc_object_resolved_tlab
-    // Fast path tlab allocation.
-    // x0: type, xSELF(x19): Thread::Current
-    // x1-x7: free.
-#if defined(USE_READ_BARRIER)
-    mvn    x0, xzr                                            // Read barrier not supported here.
-    ret                                                       // Return -1.
-#endif
-    ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_object_resolved_tlab_slow_path
-.Lart_quick_alloc_object_resolved_tlab_slow_path:
-    SETUP_SAVE_REFS_ONLY_FRAME                    // Save callee saves in case of GC.
-    mov    x1, xSELF                              // Pass Thread::Current.
-    bl     artAllocObjectFromCodeResolvedTLAB     // (mirror::Class*, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_resolved_tlab
-
-// The common code for art_quick_alloc_object_*region_tlab
-.macro GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB name, entrypoint, fast_path
-ENTRY \name
-    // Fast path region tlab allocation.
-    // x0: type, xSELF(x19): Thread::Current
-    // x1-x7: free.
-#if !defined(USE_READ_BARRIER)
-    mvn    x0, xzr                                            // Read barrier must be enabled here.
-    ret                                                       // Return -1.
-#endif
-.Ldo_allocation\name:
-    \fast_path .Lslow_path\name
-.Lslow_path\name:
-    SETUP_SAVE_REFS_ONLY_FRAME                 // Save callee saves in case of GC.
-    mov    x1, xSELF                           // Pass Thread::Current.
-    bl     \entrypoint                         // (mirror::Class*, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END \name
-.endm
-
-GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED
-GENERATE_ALLOC_OBJECT_RESOLVED_REGION_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB, ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED
-
-// TODO: We could use this macro for the normal tlab allocator too.
-
-.macro GENERATE_ALLOC_ARRAY_REGION_TLAB name, entrypoint, size_setup
+.macro GENERATE_ALLOC_ARRAY_TLAB name, entrypoint, size_setup
 ENTRY \name
     // Fast path array allocation for region tlab allocation.
     // x0: mirror::Class* type
     // x1: int32_t component_count
     // x2-x7: free.
-#if !defined(USE_READ_BARRIER)
-    mvn    x0, xzr                                            // Read barrier must be enabled here.
-    ret                                                       // Return -1.
-#endif
     mov    x3, x0
     \size_setup x3, w3, x1, w1, x4, w4, x5, w5, x6, w6
     ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name, x3, w3, x1, w1, x4, w4, x5, w5, x6, w6
@@ -1904,17 +1887,21 @@
 .macro COMPUTE_ARRAY_SIZE_64 xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2
     lsl    \xTemp1, \xCount, #3
     // Add array data offset and alignment.
-    // Add 4 to the size for 64 bit alignment.
-    add    \xTemp1, \xTemp1, #(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK + 4)
+    add    \xTemp1, \xTemp1, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
 .endm
 
 # TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove
 # the entrypoint once all backends have been updated to use the size variants.
-GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
-GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
-GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
-GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
-GENERATE_ALLOC_ARRAY_REGION_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64
 
     /*
      * Called by managed code when the thread has been asked to suspend.
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 2d5eca0..663cb6c 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1578,6 +1578,7 @@
 
 
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index f3629d9..5fee575 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1534,6 +1534,7 @@
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
 
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
 
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index 9204d85..2b3525b 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -145,7 +145,7 @@
 
 // This is to be separately defined for each architecture to allow a hand-written assembly fast path.
 // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_rosalloc, RosAlloc)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 47dc34a..76615e8 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -947,10 +947,37 @@
 END_MACRO
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS
+
+// Comment out allocators that have x86 specific asm.
+// Region TLAB:
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_region_tlab, RegionTLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB)
+// Normal TLAB:
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED8(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED16(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED32(_tlab, TLAB)
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED64(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB)
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB)
 
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc).
-DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc
+MACRO2(ART_QUICK_ALLOC_OBJECT_ROSALLOC, c_name, cxx_name)
+    DEFINE_FUNCTION VAR(c_name)
     // Fast path rosalloc allocation.
     // eax: type/return value
     // ecx, ebx, edx: free
@@ -959,14 +986,14 @@
                                                         // stack has room
     movl THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%ebx), %ecx
     cmpl THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%ebx), %ecx
-    jae  .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    jae  .Lslow_path\c_name
 
     movl MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%eax), %ecx  // Load the object size (ecx)
                                                         // Check if the size is for a thread
                                                         // local allocation. Also does the
                                                         // finalizable and initialization check.
     cmpl LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %ecx
-    ja   .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    ja   .Lslow_path\c_name
     shrl LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %ecx // Calculate the rosalloc bracket index
                                                             // from object size.
                                                         // Load thread local rosalloc run (ebx)
@@ -977,7 +1004,7 @@
                                                         // Load free_list head (edi),
                                                         // this will be the return value.
     movl (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%ebx), %ecx
-    jecxz   .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    jecxz   .Lslow_path\c_name
                                                         // Point of no slow path. Won't go to
                                                         // the slow path from here on.
                                                         // Load the next pointer of the head
@@ -1008,7 +1035,7 @@
                                                         // No fence needed for x86.
     movl %ecx, %eax                                     // Move object to return register
     ret
-.Lart_quick_alloc_object_resolved_rosalloc_slow_path:
+.Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx          // save ref containing registers for GC
     // Outgoing argument set up
     subl LITERAL(8), %esp                       // alignment padding
@@ -1020,10 +1047,14 @@
     CFI_ADJUST_CFA_OFFSET(-16)
     RESTORE_SAVE_REFS_ONLY_FRAME                 // restore frame up to return address
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER      // return or deliver exception
-END_FUNCTION art_quick_alloc_object_resolved_rosalloc
+    END_FUNCTION VAR(c_name)
+END_MACRO
 
-// The common fast path code for art_quick_alloc_object_resolved_tlab
-// and art_quick_alloc_object_resolved_region_tlab.
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+
+// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
 //
 // EAX: type/return_value
 MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH, slowPathLabel)
@@ -1047,8 +1078,8 @@
     ret                                                 // Fast path succeeded.
 END_MACRO
 
-// The common slow path code for art_quick_alloc_object_resolved_tlab
-// and art_quick_alloc_object_resolved_region_tlab.
+// The common slow path code for art_quick_alloc_object_resolved/initialized_tlab
+// and art_quick_alloc_object_resolved/initialized_region_tlab.
 MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH, cxx_name)
     POP edi
     SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx                 // save ref containing registers for GC
@@ -1065,33 +1096,154 @@
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER             // return or deliver exception
 END_MACRO
 
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be called
-// for CC if the GC is not marking.
-DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab
+MACRO2(ART_QUICK_ALLOC_OBJECT_TLAB, c_name, cxx_name)
+    DEFINE_FUNCTION VAR(c_name)
     // Fast path tlab allocation.
     // EAX: type
     // EBX, ECX, EDX: free.
     PUSH edi
-    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path
-.Lart_quick_alloc_object_resolved_tlab_slow_path:
-    ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB
-END_FUNCTION art_quick_alloc_object_resolved_tlab
+    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\c_name
+.Lslow_path\c_name:
+    ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH RAW_VAR(cxx_name)
+    END_FUNCTION VAR(c_name)
+END_MACRO
 
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB).
-DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab
-    // Fast path region tlab allocation.
-    // EAX: type/return value
-    // EBX, ECX, EDX: free.
-#if !defined(USE_READ_BARRIER)
+ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_tlab, artAllocObjectFromCodeResolvedTLAB
+ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB
+ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_resolved_region_tlab, artAllocObjectFromCodeResolvedRegionTLAB
+ART_QUICK_ALLOC_OBJECT_TLAB art_quick_alloc_object_initialized_region_tlab, artAllocObjectFromCodeInitializedRegionTLAB
+
+// The fast path code for art_quick_alloc_array_resolved{8,16,32,64}{_region}_tlab.
+// Inputs: EAX: the class, ECX: int32_t component_count, EDX: total_size
+// Free temp: EBX
+// Output: EAX: return value.
+MACRO1(ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE, slowPathLabel)
+    mov %fs:THREAD_SELF_OFFSET, %ebx                          // ebx = thread
+    // Mask out the unaligned part to make sure we are 8 byte aligned.
+    andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %edx
+    movl THREAD_LOCAL_END_OFFSET(%ebx), %edi
+    subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi
+    cmpl %edi, %edx                                           // Check if it fits.
+    ja   RAW_VAR(slowPathLabel)
+    movl THREAD_LOCAL_POS_OFFSET(%ebx), %edi
+    addl %edi, %edx                                            // Add the object size.
+    movl %edx, THREAD_LOCAL_POS_OFFSET(%ebx)                   // Update thread_local_pos_
+    addl LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%ebx)         // Increase thread_local_objects.
+                                                               // Store the class pointer in the
+                                                               // header.
+                                                               // No fence needed for x86.
+    POISON_HEAP_REF eax
+    movl %eax, MIRROR_OBJECT_CLASS_OFFSET(%edi)
+    movl %ecx, MIRROR_ARRAY_LENGTH_OFFSET(%edi)
+    movl %edi, %eax
+    POP edi
+    ret                                                        // Fast path succeeded.
+END_MACRO
+
+MACRO1(COMPUTE_ARRAY_SIZE_UNKNOWN, slow_path)
+    // We should never enter here. Code is provided for reference.
     int3
-    int3
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8), %ecx
+    ja RAW_VAR(slow_path)
+    PUSH ecx
+    movl %ecx, %edx
+    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%eax), %ecx        // Load component type.
+    UNPOISON_HEAP_REF ecx
+    movl MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET(%ecx), %ecx // Load primitive type.
+    shr MACRO_LITERAL(PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT), %ecx        // Get component size shift.
+    sall %cl, %edx                                              // Calculate array count shifted.
+    // Add array header + alignment rounding.
+    add MACRO_LITERAL(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK), %edx
+    // Add 4 extra bytes if we are doing a long array.
+    add MACRO_LITERAL(1), %ecx
+    and MACRO_LITERAL(4), %ecx
+#if MIRROR_WIDE_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4
+#error Long array data offset must be 4 greater than int array data offset.
 #endif
-    PUSH edi
-    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
-.Lart_quick_alloc_object_resolved_region_tlab_slow_path:
-    ALLOC_OBJECT_RESOLVED_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedRegionTLAB
-END_FUNCTION art_quick_alloc_object_resolved_region_tlab
+    addl %ecx, %edx
+    POP ecx
+END_MACRO
 
+MACRO1(COMPUTE_ARRAY_SIZE_8, slow_path)
+    // EAX: mirror::Class* klass, ECX: int32_t component_count
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    cmpl LITERAL(MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET), %ecx
+    ja RAW_VAR(slow_path)
+    // Add array header + alignment rounding.
+    leal (MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)(%ecx), %edx
+END_MACRO
+
+MACRO1(COMPUTE_ARRAY_SIZE_16, slow_path)
+    // EAX: mirror::Class* klass, ECX: int32_t component_count
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 2), %ecx
+    ja RAW_VAR(slow_path)
+    // Add array header + alignment rounding.
+    leal ((MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 2)(%ecx), %edx
+    sall MACRO_LITERAL(1), %edx
+END_MACRO
+
+MACRO1(COMPUTE_ARRAY_SIZE_32, slow_path)
+    // EAX: mirror::Class* klass, ECX: int32_t component_count
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_INT_ARRAY_DATA_OFFSET) / 4), %ecx
+    ja RAW_VAR(slow_path)
+    // Add array header + alignment rounding.
+    leal ((MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 4)(%ecx), %edx
+    sall MACRO_LITERAL(2), %edx
+END_MACRO
+
+MACRO1(COMPUTE_ARRAY_SIZE_64, slow_path)
+    // EAX: mirror::Class* klass, ECX: int32_t component_count
+    // Possibly a large object, go slow.
+    // Also does negative array size check.
+    cmpl LITERAL((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8), %ecx
+    ja RAW_VAR(slow_path)
+    // Add array header + alignment rounding.
+    leal ((MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK) / 8)(%ecx), %edx
+    sall MACRO_LITERAL(3), %edx
+END_MACRO
+
+MACRO3(GENERATE_ALLOC_ARRAY_TLAB, c_entrypoint, cxx_name, size_setup)
+    DEFINE_FUNCTION VAR(c_entrypoint)
+    // EAX: mirror::Class* klass, ECX: int32_t component_count
+    PUSH edi
+    CALL_MACRO(size_setup) .Lslow_path\c_entrypoint
+    ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\c_entrypoint
+.Lslow_path\c_entrypoint:
+    POP edi
+    SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx                        // save ref containing registers for GC
+    // Outgoing argument set up
+    PUSH eax                                                   // alignment padding
+    pushl %fs:THREAD_SELF_OFFSET                               // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH ecx
+    PUSH eax
+    call CALLVAR(cxx_name)                                     // cxx_name(arg0, arg1, Thread*)
+    addl LITERAL(16), %esp                                     // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_REFS_ONLY_FRAME                               // restore frame up to return address
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER                    // return or deliver exception
+    END_FUNCTION VAR(c_entrypoint)
+END_MACRO
+
+
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_64
+
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_8
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_16
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved32_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_32
+GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved64_tlab, artAllocArrayFromCodeResolvedTLAB, COMPUTE_ARRAY_SIZE_64
 
 DEFINE_FUNCTION art_quick_resolve_string
     SETUP_SAVE_EVERYTHING_FRAME ebx, ebx
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 10f9047..a1ae858 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1006,7 +1006,8 @@
 
 
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc).
-DEFINE_FUNCTION art_quick_alloc_object_resolved_rosalloc
+MACRO2(ART_QUICK_ALLOC_OBJECT_ROSALLOC, c_name, cxx_name)
+    DEFINE_FUNCTION VAR(c_name)
     // Fast path rosalloc allocation.
     // RDI: mirror::Class*, RAX: return value
     // RSI, RDX, RCX, R8, R9: free.
@@ -1015,14 +1016,14 @@
     movq   %gs:THREAD_SELF_OFFSET, %r8                     // r8 = thread
     movq   THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET(%r8), %rcx  // rcx = alloc stack top.
     cmpq   THREAD_LOCAL_ALLOC_STACK_END_OFFSET(%r8), %rcx
-    jae    .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    jae    .Lslow_path\c_name
                                                            // Load the object size
     movl   MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET(%rdi), %eax
                                                            // Check if the size is for a thread
                                                            // local allocation. Also does the
                                                            // initialized and finalizable checks.
     cmpl   LITERAL(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), %eax
-    ja     .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    ja     .Lslow_path\c_name
                                                            // Compute the rosalloc bracket index
                                                            // from the size.
     shrq   LITERAL(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT), %rax
@@ -1036,7 +1037,7 @@
                                                            // will be the return val.
     movq   (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)(%r9), %rax
     testq  %rax, %rax
-    jz     .Lart_quick_alloc_object_resolved_rosalloc_slow_path
+    jz     .Lslow_path\c_name
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber rdi and rsi.
                                                            // Push the new object onto the thread
                                                            // local allocation stack and
@@ -1063,25 +1064,19 @@
     decl   (ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)(%r9)
                                                            // No fence necessary for x86.
     ret
-.Lart_quick_alloc_object_resolved_rosalloc_slow_path:
+.Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME                             // save ref containing registers for GC
     // Outgoing argument set up
     movq %gs:THREAD_SELF_OFFSET, %rsi                      // pass Thread::Current()
-    call SYMBOL(artAllocObjectFromCodeResolvedRosAlloc)    // cxx_name(arg0, Thread*)
+    call CALLVAR(cxx_name)                                 // cxx_name(arg0, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME                           // restore frame up to return address
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER                // return or deliver exception
-END_FUNCTION art_quick_alloc_object_rosalloc
-
-// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
-//
-// RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value.
-// RCX: scratch, r8: Thread::Current().
-MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
-    testl %edx, %edx                                       // Check null class
-    jz   RAW_VAR(slowPathLabel)
-    ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH(RAW_VAR(slowPathLabel))
+    END_FUNCTION VAR(c_name)
 END_MACRO
 
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_resolved_rosalloc, artAllocObjectFromCodeResolvedRosAlloc
+ART_QUICK_ALLOC_OBJECT_ROSALLOC art_quick_alloc_object_initialized_rosalloc, artAllocObjectFromCodeInitializedRosAlloc
+
 // The common fast path code for art_quick_alloc_object_resolved_region_tlab.
 // TODO: delete ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH since it is the same as
 // ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH.
@@ -1220,12 +1215,7 @@
     movq %rsi, %r9
     salq MACRO_LITERAL(3), %r9
     // Add array header + alignment rounding.
-    // Add 4 extra bytes for array data alignment
-    addq MACRO_LITERAL(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK + 4), %r9
-END_MACRO
-
-// The slow path code for art_quick_alloc_array_*tlab.
-MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name)
+    addq MACRO_LITERAL(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK), %r9
 END_MACRO
 
 MACRO3(GENERATE_ALLOC_ARRAY_TLAB, c_entrypoint, cxx_name, size_setup)