arm: Rewrite `art_quick_aput_obj`.

Test: run-gtests.sh
Test: testrunner.py --target --32 --optimizing
Test: testrunner.py --target --32 --optimizing --gcstress
Bug: 160737021
Change-Id: I32a51cee80dd33564481b9916967d6692c156d2e
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 5a97572..8612300 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -637,13 +637,33 @@
     .cfi_rel_offset \rReg, \offset
 .endm
 
-    /*
-     * Macro to insert read barrier, only used in art_quick_aput_obj.
-     * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
-     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
-     */
-.macro READ_BARRIER rDest, rObj, offset
+    // Helper macros for `art_quick_aput_obj`.
 #ifdef USE_READ_BARRIER
+#ifdef USE_BAKER_READ_BARRIER
+.macro BAKER_RB_CHECK_GRAY_BIT_AND_LOAD rDest, rObj, offset, gray_slow_path_label
+    ldr ip, [\rObj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]    @ ip = lock word of the holder object (clobbers ip).
+    tst ip, #LOCK_WORD_READ_BARRIER_STATE_MASK_SHIFTED  @ Test the read barrier state bits (clobbers flags).
+    bne \gray_slow_path_label                           @ State bits set: leave before loading the field.
+    // False dependency on the lock word load (ip, lsr #32 adds zero) to avoid needing load/load fence.
+    add \rObj, \rObj, ip, lsr #32
+    ldr \rDest, [\rObj, #\offset]                       @ Fast path: load the reference field.
+    UNPOISON_HEAP_REF \rDest
+.endm
+
+.macro BAKER_RB_LOAD_AND_MARK rDest, rObj, offset, mark_function
+    ldr \rDest, [\rObj, #\offset]  @ Load the reference field that needs marking.
+    UNPOISON_HEAP_REF \rDest
+    str lr, [sp, #-8]!             @ Save LR with correct stack alignment.
+    .cfi_adjust_cfa_offset 8       @ Adjust CFA first; the rel_offset below is resolved against it.
+    .cfi_rel_offset lr, 0          @ LR is saved at the new top of stack.
+    bl \mark_function              @ Call the mark entrypoint selected by the macro user.
+    ldr lr, [sp], #8               @ Restore LR.
+    .cfi_restore lr
+    .cfi_adjust_cfa_offset -8
+.endm
+#else  // USE_BAKER_READ_BARRIER
+    .extern artReadBarrierSlow
+.macro READ_BARRIER_SLOW rDest, rObj, offset
     push {r0-r3, ip, lr}            @ 6 words for saved registers (used in art_quick_aput_obj)
     .cfi_adjust_cfa_offset 24
     .cfi_rel_offset r0, 0
@@ -676,30 +696,26 @@
     pop {lr}                        @ restore lr
     .cfi_adjust_cfa_offset -4
     .cfi_restore lr
-#else
-    ldr \rDest, [\rObj, #\offset]
-    UNPOISON_HEAP_REF \rDest
+#endif // USE_BAKER_READ_BARRIER
 #endif  // USE_READ_BARRIER
-.endm
 
-#ifdef USE_READ_BARRIER
-    .extern artReadBarrierSlow
-#endif
     .hidden art_quick_aput_obj
 ENTRY art_quick_aput_obj
+    cbz r2, .Laput_obj_null
 #ifdef USE_READ_BARRIER
-    @ The offset to .Ldo_aput_null is too large to use cbz due to expansion from READ_BARRIER macro.
-    tst r2, r2
-    beq .Ldo_aput_null
-#else
-    cbz r2, .Ldo_aput_null
+    cmp rMR, #0
+    bne .Laput_obj_gc_marking
 #endif  // USE_READ_BARRIER
-    READ_BARRIER r3, r0, MIRROR_OBJECT_CLASS_OFFSET
-    READ_BARRIER ip, r2, MIRROR_OBJECT_CLASS_OFFSET
-    READ_BARRIER r3, r3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET
-    cmp r3, ip  @ value's type == array's component type - trivial assignability
-    bne .Lcheck_assignability
-.Ldo_aput:
+    ldr r3, [r0, #MIRROR_OBJECT_CLASS_OFFSET]
+    UNPOISON_HEAP_REF r3
+    // R4 is a scratch register in managed ARM ABI.
+    ldr r4, [r2, #MIRROR_OBJECT_CLASS_OFFSET]
+    UNPOISON_HEAP_REF r4
+    ldr r3, [r3, #MIRROR_CLASS_COMPONENT_TYPE_OFFSET]
+    UNPOISON_HEAP_REF r3
+    cmp r3, r4  @ value's type == array's component type - trivial assignability
+    bne .Laput_obj_check_assignability
+.Laput_obj_store:
     add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
     POISON_HEAP_REF r2
     str r2, [r3, r1, lsl #2]
@@ -707,26 +723,22 @@
     lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
-.Ldo_aput_null:
+
+.Laput_obj_null:
     add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
     str r2, [r3, r1, lsl #2]
     blx lr
-.Lcheck_assignability:
+
+.Laput_obj_check_assignability:
     push {r0-r2, lr}             @ save arguments
     .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset r1, 4
-    .cfi_rel_offset r2, 8
     .cfi_rel_offset lr, 12
-    mov r1, ip
+    mov r1, r4
     mov r0, r3
     bl artIsAssignableFromCode
     cbz r0, .Lthrow_array_store_exception
     .cfi_remember_state
     pop {r0-r2, lr}
-    .cfi_restore r0
-    .cfi_restore r1
-    .cfi_restore r2
     .cfi_restore lr
     .cfi_adjust_cfa_offset -16
     add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
@@ -736,19 +748,60 @@
     lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
+
 .Lthrow_array_store_exception:
     CFI_RESTORE_STATE_AND_DEF_CFA sp, 16
     pop {r0-r2, lr}
-    .cfi_restore r0
-    .cfi_restore r1
-    .cfi_restore r2
     .cfi_restore lr
     .cfi_adjust_cfa_offset -16
+#ifdef USE_READ_BARRIER
+    .cfi_remember_state
+#endif  // USE_READ_BARRIER
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r3
     mov r1, r2
-    mov r2, rSELF                  @ pass Thread::Current
+    mov r2, rSELF                  @ Pass Thread::Current.
     bl artThrowArrayStoreException @ (Class*, Class*, Thread*)
-    bkpt                           @ unreached
+    bkpt                           @ Unreachable.
+
+#ifdef USE_READ_BARRIER
+    CFI_RESTORE_STATE_AND_DEF_CFA sp, 0
+.Laput_obj_gc_marking:
+#ifdef USE_BAKER_READ_BARRIER
+    BAKER_RB_CHECK_GRAY_BIT_AND_LOAD \
+        r3, r0, MIRROR_OBJECT_CLASS_OFFSET, .Laput_obj_mark_array_class
+.Laput_obj_mark_array_class_continue:
+    BAKER_RB_CHECK_GRAY_BIT_AND_LOAD \
+        r3, r3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, .Laput_obj_mark_array_element
+.Laput_obj_mark_array_element_continue:
+    BAKER_RB_CHECK_GRAY_BIT_AND_LOAD \
+        r4, r2, MIRROR_OBJECT_CLASS_OFFSET, .Laput_obj_mark_object_class
+.Laput_obj_mark_object_class_continue:
+#else  // USE_BAKER_READ_BARRIER
+    READ_BARRIER_SLOW r3, r0, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER_SLOW r3, r3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET
+    READ_BARRIER_SLOW r4, r2, MIRROR_OBJECT_CLASS_OFFSET
+#endif  // USE_BAKER_READ_BARRIER
+
+    cmp r3, r4  @ value's type == array's component type - trivial assignability
+    // All registers are set up correctly for `.Laput_obj_check_assignability`.
+    bne .Laput_obj_check_assignability
+    b   .Laput_obj_store
+
+#ifdef USE_BAKER_READ_BARRIER
+.Laput_obj_mark_array_class:
+    BAKER_RB_LOAD_AND_MARK r3, r0, MIRROR_OBJECT_CLASS_OFFSET, art_quick_read_barrier_mark_reg03
+    b .Laput_obj_mark_array_class_continue
+
+.Laput_obj_mark_array_element:
+    BAKER_RB_LOAD_AND_MARK \
+        r3, r3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, art_quick_read_barrier_mark_reg03
+    b .Laput_obj_mark_array_element_continue
+
+.Laput_obj_mark_object_class:
+    BAKER_RB_LOAD_AND_MARK r4, r2, MIRROR_OBJECT_CLASS_OFFSET, art_quick_read_barrier_mark_reg04
+    b .Laput_obj_mark_object_class_continue
+#endif  // USE_BAKER_READ_BARRIER
+#endif  // USE_READ_BARRIER
 END art_quick_aput_obj
 
 // Macro to facilitate adding new allocation entrypoints.
@@ -1926,13 +1979,9 @@
      *
      * If `reg` is different from `r0`, the generated function follows a
      * non-standard runtime calling convention:
-     * - register `reg` is used to pass the (sole) argument of this
-     *   function (instead of R0);
-     * - register `reg` is used to return the result of this function
-     *   (instead of R0);
-     * - R0 is treated like a normal (non-argument) caller-save register;
-     * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. standard callee-save registers are preserved).
+     * - register `reg` (which may be different from R0) is used to pass the (sole) argument,
+     * - register `reg` (which may be different from R0) is used to return the result,
+     * - all other registers are callee-save (the values they hold are preserved).
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
diff --git a/tools/cpp-define-generator/lockword.def b/tools/cpp-define-generator/lockword.def
index a170c15..5494d59 100644
--- a/tools/cpp-define-generator/lockword.def
+++ b/tools/cpp-define-generator/lockword.def
@@ -30,10 +30,8 @@
            art::LockWord::kMarkBitStateMaskShifted)
 ASM_DEFINE(LOCK_WORD_MARK_BIT_SHIFT,
            art::LockWord::kMarkBitStateShift)
-ASM_DEFINE(LOCK_WORD_READ_BARRIER_STATE_MASK,
+ASM_DEFINE(LOCK_WORD_READ_BARRIER_STATE_MASK_SHIFTED,
            art::LockWord::kReadBarrierStateMaskShifted)
-ASM_DEFINE(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED,
-           art::LockWord::kReadBarrierStateMaskShiftedToggled)
 ASM_DEFINE(LOCK_WORD_READ_BARRIER_STATE_SHIFT,
            art::LockWord::kReadBarrierStateShift)
 ASM_DEFINE(LOCK_WORD_STATE_FORWARDING_ADDRESS,