Move caller-save register saving/restoring to ReadBarrierMarkRegX.

Instead of saving/restoring live caller-save registers
before/after the call to read barrier mark entry points
ReadBarrierMarkRegX, have these entry points save/restore
all the caller-save registers themselves (except register
rX, which contains the return value).

Also refactor the assembly code of these entry points
using macros.

* Boot image code size variation on Nexus 5X
  (aosp_bullhead-userdebug build):
  - total ARM64 framework Oat files size change:
    119196792 bytes -> 115575920 bytes (-3.04%)
  - total ARM framework Oat files size change:
    100435212 bytes -> 97621188 bytes (-2.80%)

* Benchmarks (ARM64) score variations on Nexus 5X
  (aosp_bullhead-userdebug build):
  - RitzPerf (lower is better)
    - average score difference: -2.71%
  - CaffeineMark (higher is better)
    - no real difference for most tests
      (absolute variation lower than 1%)
    - better score on the "Method" benchmark:
      score variation 41253 -> 44891 (+8.82%)

Test: ART host and target (ARM, ARM64) tests.
Bug: 29506760
Bug: 12687968
Change-Id: I881bf73139a3f1c2bee9ffc6fc8c00f9a392afa6
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 47e6625..5e6e175 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -80,7 +80,11 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) = 0;
 
+  // Save live core and floating-point caller-save registers and
+  // update the stack mask in `locations` for registers holding object
+  // references.
   virtual void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
+  // Restore live core and floating-point caller-save registers.
   virtual void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
 
   bool IsCoreRegisterSaved(int reg) const {
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 1aa7b54..474e9d5 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -437,11 +437,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // R0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     DCHECK_NE(reg, SP);
@@ -469,8 +467,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 07b7823..cec641f 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -603,11 +603,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // W0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     DCHECK_NE(obj_.reg(), LR);
@@ -635,8 +633,6 @@
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ B(GetExitLabel());
   }
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 1cc6060..93bf022 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -472,11 +472,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // EAX (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     DCHECK_NE(reg, ESP);
@@ -502,8 +500,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index a015893..0d85bea 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -493,11 +493,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // RDI and/or RAX (if they are live), as they are clobbered by
-    // functions art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     DCHECK_NE(reg, RSP);
@@ -523,8 +521,6 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 8f18ff3..dfa592c 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses an non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -119,7 +121,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f9c34f5..34d3158 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -672,6 +672,12 @@
     .endif
 .endm
 
+// Save rReg's value to [sp, #offset].
+.macro PUSH_REG rReg, offset
+    str \rReg, [sp, #\offset]       @ save rReg
+    .cfi_rel_offset \rReg, \offset
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
@@ -1752,30 +1758,83 @@
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
      * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * `reg`, saving and restoring all caller-save registers.
+     *
+     * If `reg` is different from `r0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `reg` is used to pass the (sole) argument of this
+     *   function (instead of R0);
+     * - register `reg` is used to return the result of this function
      *   (instead of R0);
-     * - `reg` is used to return the result of this function (instead of R0);
      * - R0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
-    push  {lr}                          @ save return address
-    .cfi_adjust_cfa_offset 4
-    .cfi_rel_offset lr, 0
-    sub   sp, #4                        @ push padding (native calling convention 8-byte alignment)
-    .cfi_adjust_cfa_offset 4
-    mov   r0, \reg                      @ pass arg1 - obj from `reg`
-    bl    artReadBarrierMark            @ artReadBarrierMark(obj)
-    mov   \reg, r0                      @ return result into `reg`
-    add   sp, #4                        @ pop padding
-    .cfi_adjust_cfa_offset -4
-    pop   {pc}                          @ return
+    push  {r0-r4, r9, r12, lr}          @ save return address and core caller-save registers
+    .cfi_adjust_cfa_offset 32
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    .cfi_rel_offset r9, 20
+    .cfi_rel_offset r12, 24
+    .cfi_rel_offset lr, 28
+    vpush {s0-s15}                      @ save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
+
+    .ifnc \reg, r0
+      mov   r0, \reg                    @ pass arg1 - obj from `reg`
+    .endif
+    bl    artReadBarrierMark            @ r0 <- artReadBarrierMark(obj)
+
+    vpop {s0-s15}                       @ restore floating-point registers
+    .cfi_adjust_cfa_offset -64
+    @ If `reg` is a caller-save register, save the result to its
+    @ corresponding stack slot; it will be restored by the "pop"
+    @ instruction below. Otherwise, move result into `reg`.
+    @
+    @ (Note that saving `reg` to its stack slot will overwrite the value
+    @ previously stored by the "push" instruction above. That is
+    @ alright, as in that case we know that `reg` is not a live
+    @ register, as it is used to pass the argument and return the result
+    @ of this function.)
+    .ifc \reg, r0
+      PUSH_REG r0, 0                    @ copy result to r0's stack location
+    .else
+      .ifc \reg, r1
+        PUSH_REG r0, 4                  @ copy result to r1's stack location
+      .else
+        .ifc \reg, r2
+          PUSH_REG r0, 8                @ copy result to r2's stack location
+        .else
+          .ifc \reg, r3
+            PUSH_REG r0, 12             @ copy result to r3's stack location
+          .else
+            .ifc \reg, r4
+              PUSH_REG r0, 16           @ copy result to r4's stack location
+            .else
+              .ifc \reg, r9
+                PUSH_REG r0, 20         @ copy result to r9's stack location
+              .else
+                .ifc \reg, r12
+                  PUSH_REG r0, 24       @ copy result to r12's stack location
+                .else
+                  mov   \reg, r0        @ return result into `reg`
+                .endif
+              .endif
+            .endif
+          .endif
+        .endif
+      .endif
+    .endif
+    pop   {r0-r4, r9, r12, pc}          @ restore caller-save registers and return
 END \name
 .endm
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, r0
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, r1
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, r2
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, r3
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index c3188b6..e30a860 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses an non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -122,7 +124,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index c893e77..6173ae7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1253,6 +1253,22 @@
     .endif
 .endm
 
+// Restore xReg1's value from [sp, #offset] if xReg1 is not the same as xExclude.
+// Restore xReg2's value from [sp, #(offset + 8)] if xReg2 is not the same as xExclude.
+.macro POP_REGS_NE xReg1, xReg2, offset, xExclude
+    .ifc \xReg1, \xExclude
+        ldr \xReg2, [sp, #(\offset + 8)]        // restore xReg2
+    .else
+        .ifc \xReg2, \xExclude
+            ldr \xReg1, [sp, #\offset]          // restore xReg1
+        .else
+            ldp \xReg1, \xReg2, [sp, #\offset]  // restore xReg1 and xReg2
+        .endif
+    .endif
+    .cfi_restore \xReg1
+    .cfi_restore \xReg2
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * xDest, wDest and xObj are registers, offset is a defined literal such as
@@ -2222,56 +2238,148 @@
 
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
-     * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * getting its argument and returning its result through W register
+     * `wreg` (corresponding to X register `xreg`), saving and restoring
+     * all caller-save registers.
+     *
+     * If `wreg` is different from `w0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `wreg` is used to pass the (sole) argument of this
+     *   function (instead of W0);
+     * - register `wreg` is used to return the result of this function
      *   (instead of W0);
-     * - `reg` is used to return the result of this function (instead of W0);
      * - W0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
-.macro READ_BARRIER_MARK_REG name, reg
+.macro READ_BARRIER_MARK_REG name, wreg, xreg
 ENTRY \name
-    str   xLR, [sp, #-16]!              // Save return address and add padding (16B align stack).
-    .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset x30, 0
-    mov   w0, \reg                      // Pass arg1 - obj from `reg`
+    /*
+     * Allocate 46 stack slots * 8 = 368 bytes:
+     * - 20 slots for core registers X0-X19
+     * - 24 slots for floating-point registers D0-D7 and D16-D31
+     * -  1 slot for return address register XLR
+     * -  1 padding slot for 16-byte stack alignment
+     */
+    // Save all potentially live caller-save core registers.
+    stp   x0, x1,   [sp, #-368]!
+    .cfi_adjust_cfa_offset 368
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
+    stp   x2, x3,   [sp, #16]
+    .cfi_rel_offset x2, 16
+    .cfi_rel_offset x3, 24
+    stp   x4, x5,   [sp, #32]
+    .cfi_rel_offset x4, 32
+    .cfi_rel_offset x5, 40
+    stp   x6, x7,   [sp, #48]
+    .cfi_rel_offset x6, 48
+    .cfi_rel_offset x7, 56
+    stp   x8, x9,   [sp, #64]
+    .cfi_rel_offset x8, 64
+    .cfi_rel_offset x9, 72
+    stp   x10, x11, [sp, #80]
+    .cfi_rel_offset x10, 80
+    .cfi_rel_offset x11, 88
+    stp   x12, x13, [sp, #96]
+    .cfi_rel_offset x12, 96
+    .cfi_rel_offset x13, 104
+    stp   x14, x15, [sp, #112]
+    .cfi_rel_offset x14, 112
+    .cfi_rel_offset x15, 120
+    stp   x16, x17, [sp, #128]
+    .cfi_rel_offset x16, 128
+    .cfi_rel_offset x17, 136
+    stp   x18, x19, [sp, #144]
+    .cfi_rel_offset x18, 144
+    .cfi_rel_offset x19, 152
+    // Save all potentially live caller-save floating-point registers.
+    stp   d0, d1,   [sp, #160]
+    stp   d2, d3,   [sp, #176]
+    stp   d4, d5,   [sp, #192]
+    stp   d6, d7,   [sp, #208]
+    stp   d16, d17, [sp, #224]
+    stp   d18, d19, [sp, #240]
+    stp   d20, d21, [sp, #256]
+    stp   d22, d23, [sp, #272]
+    stp   d24, d25, [sp, #288]
+    stp   d26, d27, [sp, #304]
+    stp   d28, d29, [sp, #320]
+    stp   d30, d31, [sp, #336]
+    // Save return address.
+    str   xLR,      [sp, #352]
+    .cfi_rel_offset x30, 352
+    // (sp + #360 is a padding slot)
+
+    .ifnc \wreg, w0
+      mov   w0, \wreg                   // Pass arg1 - obj from `wreg`
+    .endif
     bl    artReadBarrierMark            // artReadBarrierMark(obj)
-    mov   \reg, w0                      // Return result into `reg`
-    ldr   xLR, [sp], #16                // Restore return address and remove padding.
+    .ifnc \wreg, w0
+      mov   \wreg, w0                   // Return result into `wreg`
+    .endif
+
+    // Restore core regs, except `xreg`, as `wreg` is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REGS_NE x0, x1,   0,   \xreg
+    POP_REGS_NE x2, x3,   16,  \xreg
+    POP_REGS_NE x4, x5,   32,  \xreg
+    POP_REGS_NE x6, x7,   48,  \xreg
+    POP_REGS_NE x8, x9,   64,  \xreg
+    POP_REGS_NE x10, x11, 80,  \xreg
+    POP_REGS_NE x12, x13, 96,  \xreg
+    POP_REGS_NE x14, x15, 112, \xreg
+    POP_REGS_NE x16, x17, 128, \xreg
+    POP_REGS_NE x18, x19, 144, \xreg
+    // Restore floating-point registers.
+    ldp   d0, d1,   [sp, #160]
+    ldp   d2, d3,   [sp, #176]
+    ldp   d4, d5,   [sp, #192]
+    ldp   d6, d7,   [sp, #208]
+    ldp   d16, d17, [sp, #224]
+    ldp   d18, d19, [sp, #240]
+    ldp   d20, d21, [sp, #256]
+    ldp   d22, d23, [sp, #272]
+    ldp   d24, d25, [sp, #288]
+    ldp   d26, d27, [sp, #304]
+    ldp   d28, d29, [sp, #320]
+    ldp   d30, d31, [sp, #336]
+    // Restore return address and release the whole 368-byte stack frame.
+    ldr   xLR,      [sp, #352]
     .cfi_restore x30
-    .cfi_adjust_cfa_offset -16
+    add sp, sp, #368
+    .cfi_adjust_cfa_offset -368
     ret
 END \name
 .endm
 
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, w0,  x0
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1,  x1
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2,  x2
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3,  x3
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4,  x4
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5,  x5
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6,  x6
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7,  x7
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8,  x8
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9,  x9
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10, x10
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11, x11
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12, x12
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13, x13
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14, x14
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15, x15
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16, x16
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17, x17
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18, x18
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19, x19
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20, x20
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21, x21
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22, x22
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23, x23
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24, x24
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25, x25
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26, x26
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27, x27
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28, x28
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29, x29
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 69c939e..1b9ab44 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -31,7 +31,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses an non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index e75fecb..77e04e7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1908,41 +1908,73 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrierMark entry point, getting input and returning
-// result through EAX (register 0), following the standard runtime
-// calling convention.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    PUSH eax                         // pass arg1 - obj
-    call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function
+// `reg`, saving and restoring all caller-save registers.
+//
+// If `reg` is different from `eax`, the generated function follows a
+// non-standard runtime calling convention:
+// - register `reg` is used to pass the (sole) argument of this function
 //   (instead of EAX);
-// - `reg` is used to return the result of this function (instead of EAX);
+// - register `reg` is used to return the result of this function
+//   (instead of EAX);
 // - EAX is treated like a normal (non-argument) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
+    // Save all potentially live caller-save core registers.
+    PUSH eax
+    PUSH ecx
+    PUSH edx
+    PUSH ebx
+    // 8-byte align the stack to improve (8-byte) XMM register saving and restoring,
+    // and create space for caller-save floating-point registers.
+    subl MACRO_LITERAL(4 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 + 8 * 8)
+    // Save all potentially live caller-save floating-point registers.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+    movsd %xmm4, 32(%esp)
+    movsd %xmm5, 40(%esp)
+    movsd %xmm6, 48(%esp)
+    movsd %xmm7, 56(%esp)
+
+    subl LITERAL(4), %esp            // alignment padding
+    CFI_ADJUST_CFA_OFFSET(4)
     PUSH RAW_VAR(reg)                // pass arg1 - obj from `reg`
     call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    movl %eax, REG_VAR(reg)          // return result into `reg`
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
+    .ifnc RAW_VAR(reg), eax
+      movl %eax, REG_VAR(reg)        // return result into `reg`
+    .endif
+    addl LITERAL(8), %esp            // pop argument and remove padding
+    CFI_ADJUST_CFA_OFFSET(-8)
+
+    // Restore floating-point registers.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+    movsd 32(%esp), %xmm4
+    movsd 40(%esp), %xmm5
+    movsd 48(%esp), %xmm6
+    movsd 56(%esp), %xmm7
+    // Remove floating-point registers and padding.
+    addl MACRO_LITERAL(8 * 8 + 4), %esp
+    CFI_ADJUST_CFA_OFFSET(-(8 * 8 + 4))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE ebx, RAW_VAR(reg)
+    POP_REG_NE edx, RAW_VAR(reg)
+    POP_REG_NE ecx, RAW_VAR(reg)
+    POP_REG_NE eax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, eax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, ecx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, edx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, ebx
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 2bea3db..e6566e1 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -34,7 +34,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses an non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 496e6a8..784ec39 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1815,73 +1815,93 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RAX (register 0), thus following a non-standard
-// runtime calling convention:
-// - RAX is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RAX is still used to return the result
-//   of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq %rax, %rdi                 // Pass arg1 - obj from RAX.
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RDI (register 7), thus following a non-standard
-// runtime calling convention:
-// - RDI is used to return the result of this function (instead of RAX);
-// - RAX is treated like a normal (non-result) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RDI is still used to pass the (sole)
-//   argument of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg07
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, %rdi                 // Return result into RDI.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg07
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - `reg` is used to return the result of this function (instead of RAX);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - RAX is treated like a normal (non-result) caller-save register;
+// `reg`, saving and restoring all caller-save registers.
+//
+// The generated function follows a non-standard runtime calling
+// convention:
+// - register `reg` (which may be different from RDI) is used to pass
+//   the (sole) argument of this function;
+// - register `reg` (which may be different from RAX) is used to return
+//   the result of this function;
+// - if `reg` is different from RDI, RDI is treated like a normal
+//   (non-argument) caller-save register;
+// - if `reg` is different from RAX, RAX is treated like a normal
+//   (non-result) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Save all potentially live caller-save core registers (popped in reverse order below).
+    PUSH rax
+    PUSH rcx
+    PUSH rdx
+    PUSH rsi
+    PUSH rdi
+    PUSH r8
+    PUSH r9
+    PUSH r10
+    PUSH r11
+    // Reserve stack space for the 12 caller-save floating-point registers (XMM0-XMM11).
+    subq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(12 * 8)
+    // Save the low 64 bits of each potentially live caller-save floating-point register.
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+    movq %xmm8, 64(%rsp)
+    movq %xmm9, 72(%rsp)
+    movq %xmm10, 80(%rsp)
+    movq %xmm11, 88(%rsp)
     SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq REG_VAR(reg), %rdi         // Pass arg1 - obj from `reg`.
+
+    .ifnc RAW_VAR(reg), rdi
+      movq REG_VAR(reg), %rdi       // Pass arg1 - obj from `reg`.
+    .endif
     call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, REG_VAR(reg)         // Return result into `reg`.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
+    .ifnc RAW_VAR(reg), rax
+      movq %rax, REG_VAR(reg)       // Return result into `reg`.
+    .endif
+
     RESTORE_FP_CALLEE_SAVE_FRAME
+    // Restore floating-point registers (low 64 bits, as saved above).
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    movq 64(%rsp), %xmm8
+    movq 72(%rsp), %xmm9
+    movq 80(%rsp), %xmm10
+    movq 88(%rsp), %xmm11
+    // Release the floating-point register save area.
+    addq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(12 * 8))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE r11, RAW_VAR(reg)
+    POP_REG_NE r10, RAW_VAR(reg)
+    POP_REG_NE r9, RAW_VAR(reg)
+    POP_REG_NE r8, RAW_VAR(reg)
+    POP_REG_NE rdi, RAW_VAR(reg)
+    POP_REG_NE rsi, RAW_VAR(reg)
+    POP_REG_NE rdx, RAW_VAR(reg)
+    POP_REG_NE rcx, RAW_VAR(reg)
+    POP_REG_NE rax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
-// Note: art_quick_read_barrier_mark_reg00 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, rax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, rcx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, rdx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, rbx
@@ -1889,7 +1909,7 @@
 // cannot be used to pass arguments.
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, rbp
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, rsi
-// Note: art_quick_read_barrier_mark_reg07 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, rdi
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, r8
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, r9
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10
diff --git a/runtime/oat.h b/runtime/oat.h
index e506e3c..9b8f545 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '8', '3', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '8', '4', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";