x86/x86-64: Rewrite `art_quick_aput_obj`.

Check whether the GC is marking just once and use a dedicated
path when GC marking is on. Use `art_quick_read_barrier_mark_regNN`
for reference marking and remove the obsolete and slow
`READ_BARRIER` macros.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: testrunner.py --host --optimizing --gcstress
Bug: 160737021
Change-Id: I250e1bbeb5d93bc14055fc17d4eb0c6167c49e82
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 7f1311c..67ca2bb 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1254,126 +1254,102 @@
     .endif
 END_MACRO
 
-    /*
-     * Macro to insert read barrier, only used in art_quick_aput_obj.
-     * obj_reg and dest_reg are registers, offset is a defined literal such as
-     * MIRROR_OBJECT_CLASS_OFFSET.
-     * pop_eax is a boolean flag, indicating if eax is popped after the call.
-     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
-     */
-MACRO4(READ_BARRIER, obj_reg, offset, dest_reg, pop_eax)
-#ifdef USE_READ_BARRIER
-    PUSH eax                        // save registers used in art_quick_aput_obj
-    PUSH ebx
-    PUSH edx
-    PUSH ecx
-    // Outgoing argument set up
-    pushl MACRO_LITERAL((RAW_VAR(offset)))  // pass offset, double parentheses are necessary
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH RAW_VAR(obj_reg)           // pass obj_reg
-    PUSH eax                        // pass ref, just pass eax for now since parameter ref is unused
-    call SYMBOL(artReadBarrierSlow) // artReadBarrierSlow(ref, obj_reg, offset)
-    // No need to unpoison return value in eax, artReadBarrierSlow() would do the unpoisoning.
-    .ifnc RAW_VAR(dest_reg), eax
-      movl %eax, REG_VAR(dest_reg)  // save loaded ref in dest_reg
-    .endif
-    addl MACRO_LITERAL(12), %esp    // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-12)
-    POP_REG_NE ecx, RAW_VAR(dest_reg) // Restore args except dest_reg
-    POP_REG_NE edx, RAW_VAR(dest_reg)
-    POP_REG_NE ebx, RAW_VAR(dest_reg)
-    .ifc RAW_VAR(pop_eax), true
-      POP_REG_NE eax, RAW_VAR(dest_reg)
-    .endif
-#else
-    movl RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg)
-    UNPOISON_HEAP_REF RAW_VAR(dest_reg)
-#endif  // USE_READ_BARRIER
-END_MACRO
-
 DEFINE_FUNCTION art_quick_aput_obj
     test %edx, %edx              // store of null
-    jz .Ldo_aput_null
-    READ_BARRIER eax, MIRROR_OBJECT_CLASS_OFFSET, ebx, true
-    READ_BARRIER ebx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ebx, true
-    // value's type == array's component type - trivial assignability
-#if defined(USE_READ_BARRIER)
-    READ_BARRIER edx, MIRROR_OBJECT_CLASS_OFFSET, eax, false
-    cmpl %eax, %ebx
-    POP eax                      // restore eax from the push in the beginning of READ_BARRIER macro
-    // This asymmetric push/pop saves a push of eax and maintains stack alignment.
-#elif defined(USE_HEAP_POISONING)
-    PUSH eax                     // save eax
-    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax
-    UNPOISON_HEAP_REF eax
-    cmpl %eax, %ebx
-    POP eax                      // restore eax
-#else
-    cmpl MIRROR_OBJECT_CLASS_OFFSET(%edx), %ebx
-#endif
-    jne .Lcheck_assignability
-.Ldo_aput:
+    jz .Laput_obj_null
+    movl MIRROR_OBJECT_CLASS_OFFSET(%eax), %ebx  // EBX = array's class.
+    UNPOISON_HEAP_REF ebx
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
+    jnz .Laput_obj_gc_marking
+#endif  // USE_READ_BARRIER
+    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%ebx), %ebx  // EBX = array's component type.
+    cmpl MIRROR_OBJECT_CLASS_OFFSET(%edx), %ebx  // Both poisoned if heap poisoning is enabled.
+    jne .Laput_obj_check_assignability
+.Laput_obj_store:
     POISON_HEAP_REF edx
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)
     movl %fs:THREAD_CARD_TABLE_OFFSET, %edx
     shrl LITERAL(CARD_TABLE_CARD_SHIFT), %eax
     movb %dl, (%edx, %eax)
     ret
-.Ldo_aput_null:
+
+.Laput_obj_null:
     movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)
     ret
-.Lcheck_assignability:
-    PUSH eax                      // save arguments
-    PUSH ecx
-    PUSH edx
-#if defined(USE_READ_BARRIER)
-    subl LITERAL(4), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(4)
-    READ_BARRIER edx, MIRROR_OBJECT_CLASS_OFFSET, eax, true
-    subl LITERAL(4), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH eax                      // pass arg2 - type of the value to be stored
-#elif defined(USE_HEAP_POISONING)
-    subl LITERAL(8), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
+
+.Laput_obj_check_assignability:
+    UNPOISON_HEAP_REF ebx         // Unpoison array component type if poisoning is enabled.
+    PUSH_ARG eax                  // Save `art_quick_aput_obj()` arguments.
+    PUSH_ARG ecx
+    PUSH_ARG edx
+    INCREASE_FRAME 8              // Alignment padding.
+    // Pass arg2 - type of the value to be stored.
+#if defined(USE_HEAP_POISONING)
     movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax
     UNPOISON_HEAP_REF eax
-    PUSH eax                      // pass arg2 - type of the value to be stored
+    PUSH_ARG eax
 #else
-    subl LITERAL(8), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    pushl MIRROR_OBJECT_CLASS_OFFSET(%edx)  // pass arg2 - type of the value to be stored
+    pushl MIRROR_OBJECT_CLASS_OFFSET(%edx)
     CFI_ADJUST_CFA_OFFSET(4)
 #endif
-    PUSH ebx                      // pass arg1 - component type of the array
+.Laput_obj_check_assignability_call:
+    PUSH_ARG ebx                  // Pass arg1 - component type of the array.
     call SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
-    addl LITERAL(16), %esp        // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-16)
+    DECREASE_FRAME 16             // Pop `artIsAssignableFromCode()` arguments and padding.
     testl %eax, %eax
+    POP_ARG edx                   // Pop `art_quick_aput_obj()` arguments; flags unaffected.
+    POP_ARG ecx
+    POP_ARG eax
     jz   .Lthrow_array_store_exception
-    POP  edx
-    POP  ecx
-    POP  eax
     POISON_HEAP_REF edx
-    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)  // do the aput
+    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4)  // Do the aput.
     movl %fs:THREAD_CARD_TABLE_OFFSET, %edx
     shrl LITERAL(CARD_TABLE_CARD_SHIFT), %eax
     movb %dl, (%edx, %eax)
     ret
-    CFI_ADJUST_CFA_OFFSET(12)     // 3 POP after the jz for unwinding.
+
 .Lthrow_array_store_exception:
-    POP  edx
-    POP  ecx
-    POP  eax
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx // save all registers as basis for long jump context
-    // Outgoing argument set up
-    PUSH eax                      // alignment padding
-    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH edx                      // pass arg2 - value
-    PUSH eax                      // pass arg1 - array
+#ifdef USE_READ_BARRIER
+    CFI_REMEMBER_STATE
+#endif  // USE_READ_BARRIER
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx // Save all registers as basis for long jump context.
+    // Outgoing argument set up.
+    PUSH_ARG eax                  // Alignment padding.
+    PUSH_ARG fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
+    PUSH_ARG edx                  // Pass arg2 - value.
+    PUSH_ARG eax                  // Pass arg1 - array.
     call SYMBOL(artThrowArrayStoreException) // (array, value, Thread*)
     UNREACHABLE
+
+#ifdef USE_READ_BARRIER
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, 4
+.Laput_obj_gc_marking:
+    PUSH_ARG eax                  // Save `art_quick_aput_obj()` arguments.
+    PUSH_ARG ecx                  // We need to align stack for `art_quick_read_barrier_mark_regNN`
+    PUSH_ARG edx                  // and use a register (EAX) as a temporary for the object class.
+    call SYMBOL(art_quick_read_barrier_mark_reg03)  // Mark EBX (array's class).
+    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%ebx), %ebx
+    UNPOISON_HEAP_REF ebx
+    call SYMBOL(art_quick_read_barrier_mark_reg03)  // Mark EBX (array's component type).
+    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax
+    UNPOISON_HEAP_REF eax
+    call SYMBOL(art_quick_read_barrier_mark_reg00)  // Mark EAX (value's class).
+    cmpl %eax, %ebx
+    jne .Laput_obj_check_assignability_gc_marking
+    POP_ARG edx                   // Restore `art_quick_aput_obj()` arguments.
+    POP_ARG ecx
+    POP_ARG eax
+    jmp .Laput_obj_store
+
+.Laput_obj_check_assignability_gc_marking:
+    // Prepare arguments in line with `.Laput_obj_check_assignability_call` and jump there.
+    // (EAX, ECX and EDX were already saved in the right stack slots.)
+    INCREASE_FRAME 8              // Alignment padding.
+    PUSH_ARG eax                  // Pass arg2 - type of the value to be stored.
+    // The arg1 shall be pushed at `.Laput_obj_check_assignability_call`.
+    jmp .Laput_obj_check_assignability_call
+#endif  // USE_READ_BARRIER
 END_FUNCTION art_quick_aput_obj
 
 DEFINE_FUNCTION art_quick_memcpy
@@ -1993,15 +1969,10 @@
 // getting its argument and returning its result through register
 // `reg`, saving and restoring all caller-save registers.
 //
-// If `reg` is different from `eax`, the generated function follows a
-// non-standard runtime calling convention:
-// - register `reg` is used to pass the (sole) argument of this function
-//   (instead of EAX);
-// - register `reg` is used to return the result of this function
-//   (instead of EAX);
-// - EAX is treated like a normal (non-argument) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention (e.g. standard callee-save registers are preserved).
+// The generated function follows a non-standard runtime calling convention:
+// - register `reg` (which may differ from EAX) is used to pass the (sole) argument,
+// - register `reg` (which may differ from EAX) is used to return the result,
+// - all other registers are callee-save (the values they hold are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
     // Null check so that we can load the lock word.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 673696c..18207ae 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1163,134 +1163,89 @@
     .endif
 END_MACRO
 
-    /*
-     * Macro to insert read barrier, used in art_quick_aput_obj.
-     * obj_reg and dest_reg{32|64} are registers, offset is a defined literal such as
-     * MIRROR_OBJECT_CLASS_OFFSET. dest_reg needs two versions to handle the mismatch between
-     * 64b PUSH/POP and 32b argument.
-     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
-     *
-     * As with art_quick_aput_obj function, the 64b versions are in comments.
-     */
-MACRO4(READ_BARRIER, obj_reg, offset, dest_reg32, dest_reg64)
-#ifdef USE_READ_BARRIER
-    PUSH rax                            // save registers that might be used
-    PUSH rdi
-    PUSH rsi
-    PUSH rdx
-    PUSH rcx
-    SETUP_FP_CALLEE_SAVE_FRAME
-    // Outgoing argument set up
-    // movl REG_VAR(ref_reg32), %edi    // pass ref, no-op for now since parameter ref is unused
-    // // movq REG_VAR(ref_reg64), %rdi
-    movl REG_VAR(obj_reg), %esi         // pass obj_reg
-    // movq REG_VAR(obj_reg), %rsi
-    movl MACRO_LITERAL((RAW_VAR(offset))), %edx // pass offset, double parentheses are necessary
-    // movq MACRO_LITERAL((RAW_VAR(offset))), %rdx
-    call SYMBOL(artReadBarrierSlow)     // artReadBarrierSlow(ref, obj_reg, offset)
-    // No need to unpoison return value in rax, artReadBarrierSlow() would do the unpoisoning.
-    .ifnc RAW_VAR(dest_reg32), eax
-    // .ifnc RAW_VAR(dest_reg64), rax
-      movl %eax, REG_VAR(dest_reg32)    // save loaded ref in dest_reg
-      // movq %rax, REG_VAR(dest_reg64)
-    .endif
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    POP_REG_NE rcx, RAW_VAR(dest_reg64) // Restore registers except dest_reg
-    POP_REG_NE rdx, RAW_VAR(dest_reg64)
-    POP_REG_NE rsi, RAW_VAR(dest_reg64)
-    POP_REG_NE rdi, RAW_VAR(dest_reg64)
-    POP_REG_NE rax, RAW_VAR(dest_reg64)
-#else
-    movl RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg32)
-    // movq RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg64)
-    UNPOISON_HEAP_REF RAW_VAR(dest_reg32) // UNPOISON_HEAP_REF only takes a 32b register
-#endif  // USE_READ_BARRIER
-END_MACRO
-
 DEFINE_FUNCTION art_quick_aput_obj
-    testl %edx, %edx                // store of null
-//  test %rdx, %rdx
-    jz .Ldo_aput_null
-    READ_BARRIER edi, MIRROR_OBJECT_CLASS_OFFSET, ecx, rcx
-    // READ_BARRIER rdi, MIRROR_OBJECT_CLASS_OFFSET, ecx, rcx
-    READ_BARRIER ecx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ecx, rcx
-    // READ_BARRIER rcx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ecx, rcx
-#if defined(USE_HEAP_POISONING) || defined(USE_READ_BARRIER)
-    READ_BARRIER edx, MIRROR_OBJECT_CLASS_OFFSET, eax, rax  // rax is free.
-    // READ_BARRIER rdx, MIRROR_OBJECT_CLASS_OFFSET, eax, rax
-    cmpl %eax, %ecx  // value's type == array's component type - trivial assignability
-#else
-    cmpl MIRROR_OBJECT_CLASS_OFFSET(%edx), %ecx // value's type == array's component type - trivial assignability
-//  cmpq MIRROR_CLASS_OFFSET(%rdx), %rcx
-#endif
-    jne .Lcheck_assignability
-.Ldo_aput:
+    test %edx, %edx              // store of null
+    jz .Laput_obj_null
+    movl MIRROR_OBJECT_CLASS_OFFSET(%rdi), %ecx  // ECX = array's class.
+    UNPOISON_HEAP_REF ecx
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
+    jnz .Laput_obj_gc_marking
+#endif  // USE_READ_BARRIER
+    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%rcx), %ecx  // ECX = array's component type.
+    cmpl MIRROR_OBJECT_CLASS_OFFSET(%rdx), %ecx  // Both poisoned if heap poisoning is enabled.
+    jne .Laput_obj_check_assignability
+.Laput_obj_store:
     POISON_HEAP_REF edx
-    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
-//  movq %rdx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
     movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
     shrl LITERAL(CARD_TABLE_CARD_SHIFT), %edi
-//  shrl LITERAL(CARD_TABLE_CARD_SHIFT), %rdi
-    movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
+    movb %dl, (%rdx, %rdi)
     ret
-.Ldo_aput_null:
-    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
-//  movq %rdx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+
+.Laput_obj_null:
+    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
     ret
-.Lcheck_assignability:
-    // Save arguments.
-    PUSH rdi
-    PUSH rsi
-    PUSH rdx
+
+.Laput_obj_check_assignability:
+    UNPOISON_HEAP_REF ecx         // Unpoison array component type if poisoning is enabled.
+    PUSH_ARG rdi                  // Save arguments.
+    PUSH_ARG rsi
+    PUSH_ARG rdx
+    movl MIRROR_OBJECT_CLASS_OFFSET(%rdx), %esi  // Pass arg2 = value's class.
+    UNPOISON_HEAP_REF esi
+.Laput_obj_check_assignability_call:
+    movl %ecx, %edi               // Pass arg1 = array's component type.
     SETUP_FP_CALLEE_SAVE_FRAME
-
-#if defined(USE_HEAP_POISONING) || defined(USE_READ_BARRIER)
-    // The load of MIRROR_OBJECT_CLASS_OFFSET(%edx) is redundant, eax still holds the value.
-    movl %eax, %esi               // Pass arg2 = value's class.
-    // movq %rax, %rsi
-#else
-                                     // "Uncompress" = do nothing, as already zero-extended on load.
-    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %esi  // Pass arg2 = value's class.
-#endif
-    movq %rcx, %rdi               // Pass arg1 = array's component type.
-
     call SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
-
-    // Exception?
-    testq %rax, %rax
-    jz   .Lthrow_array_store_exception
-
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    // Restore arguments.
-    POP  rdx
-    POP  rsi
-    POP  rdi
-
+    RESTORE_FP_CALLEE_SAVE_FRAME  // Restore FP registers.
+    POP_ARG rdx                   // Restore arguments.
+    POP_ARG rsi
+    POP_ARG rdi
+    testq %rax, %rax              // Check for exception.
+    jz   .Laput_obj_throw_array_store_exception
     POISON_HEAP_REF edx
-    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
-//  movq %rdx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+    movl %edx, MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
     movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
     shrl LITERAL(CARD_TABLE_CARD_SHIFT), %edi
-//  shrl LITERAL(CARD_TABLE_CARD_SHIFT), %rdi
-    movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
-//  movb %dl, (%rdx, %rdi)
+    movb %dl, (%rdx, %rdi)
     ret
-    CFI_ADJUST_CFA_OFFSET(24 + 4 * 8)  // Reset unwind info so following code unwinds.
-.Lthrow_array_store_exception:
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    // Restore arguments.
-    POP  rdx
-    POP  rsi
-    POP  rdi
 
+.Laput_obj_throw_array_store_exception:
+#ifdef USE_READ_BARRIER
+    CFI_REMEMBER_STATE
+#endif  // USE_READ_BARRIER
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME  // Save all registers as basis for long jump context.
-
     // Outgoing argument set up.
     movq %rdx, %rsi                         // Pass arg 2 = value.
     movq %gs:THREAD_SELF_OFFSET, %rdx       // Pass arg 3 = Thread::Current().
                                             // Pass arg 1 = array.
     call SYMBOL(artThrowArrayStoreException) // (array, value, Thread*)
     UNREACHABLE
+
+#ifdef USE_READ_BARRIER
+    CFI_RESTORE_STATE_AND_DEF_CFA rsp, 8    // Only the 8-byte return address is on the stack.
+.Laput_obj_gc_marking:
+    // We need to align stack for `art_quick_read_barrier_mark_regNN`.
+    INCREASE_FRAME 8                        // Stack alignment.
+    call SYMBOL(art_quick_read_barrier_mark_reg01)  // Mark ECX (array's class).
+    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%rcx), %ecx
+    UNPOISON_HEAP_REF ecx
+    call SYMBOL(art_quick_read_barrier_mark_reg01)  // Mark ECX (array's component type).
+    movl MIRROR_OBJECT_CLASS_OFFSET(%rdx), %eax
+    UNPOISON_HEAP_REF eax
+    call SYMBOL(art_quick_read_barrier_mark_reg00)  // Mark EAX (value's class).
+    DECREASE_FRAME 8                        // Remove stack alignment.
+    cmpl %eax, %ecx
+    je .Laput_obj_store
+    // Prepare arguments in line with `.Laput_obj_check_assignability_call` and jump there.
+    PUSH_ARG rdi                  // Save arguments.
+    PUSH_ARG rsi
+    PUSH_ARG rdx
+    movl %eax, %esi               // Pass arg2 - type of the value to be stored.
+    // The arg1 shall be moved at `.Laput_obj_check_assignability_call`.
+    jmp .Laput_obj_check_assignability_call
+#endif  // USE_READ_BARRIER
 END_FUNCTION art_quick_aput_obj
 
 // TODO: This is quite silly on X86_64 now.
@@ -1855,16 +1810,9 @@
 //
 // The generated function follows a non-standard runtime calling
 // convention:
-// - register `reg` (which may be different from RDI) is used to pass
-//   the (sole) argument of this function;
-// - register `reg` (which may be different from RAX) is used to return
-//   the result of this function (instead of RAX);
-// - if `reg` is different from `rdi`, RDI is treated like a normal
-//   (non-argument) caller-save register;
-// - if `reg` is different from `rax`, RAX is treated like a normal
-//   (non-result) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention (e.g. standard callee-save registers are preserved).
+// - register `reg` (which may be different from RDI) is used to pass the (sole) argument,
+// - register `reg` (which may be different from RAX) is used to return the result,
+// - all other registers are callee-save (the values they hold are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
     // Null check so that we can load the lock word.