Introduce more compact ReadBarrierMark slow-paths.

Replace entry point ReadBarrierMark with 32
ReadBarrierMarkRegX entry points, using register
number X as input and output (instead of the standard
runtime calling convention) to save two moves in Baker's
read barrier mark slow-path code.

Test: ART host and target (ARM, ARM64) tests.
Bug: 29506760
Bug: 12687968
Change-Id: I73cfb82831cf040b8b018e984163c865cc44ed87
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 9364be3..b8540ba 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -350,6 +350,16 @@
   // accessing the String's `value` field in String intrinsics.
   static uint32_t GetArrayDataOffset(HArrayGet* array_get);
 
+  // Return the entry point offset for ReadBarrierMarkRegX, where X is `reg`.
+  template <size_t pointer_size>
+  static int32_t GetReadBarrierMarkEntryPointsOffset(size_t reg) {
+    DCHECK_LT(reg, 32u);
+    // The ReadBarrierMarkRegX entry points are ordered by increasing
+    // register number in Thread::tls_Ptr_.quick_entrypoints.
+    return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value()
+        + pointer_size * reg;
+  }
+
   void EmitParallelMoves(Location from1,
                          Location to1,
                          Primitive::Type type1,
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 5316d59..e9bc3b3 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -412,8 +412,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathARM : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location out, Location obj)
-      : SlowPathCode(instruction), out_(out), obj_(obj) {
+  ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location obj)
+      : SlowPathCode(instruction), obj_(obj) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -421,9 +421,9 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
-    Register reg_out = out_.AsRegister<Register>();
+    Register reg = obj_.AsRegister<Register>();
     DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg));
     DCHECK(instruction_->IsInstanceFieldGet() ||
            instruction_->IsStaticFieldGet() ||
            instruction_->IsArrayGet() ||
@@ -437,24 +437,44 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    // Save live registers before the runtime call, and in particular
+    // R0 (if it is live), as it is clobbered by functions
+    // art_quick_read_barrier_mark_regX.
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
-    arm_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_);
-    arm_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark),
+    DCHECK_NE(reg, SP);
+    DCHECK_NE(reg, LR);
+    DCHECK_NE(reg, PC);
+    DCHECK(0 <= reg && reg < kNumberOfCoreRegisters) << reg;
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in R0):
+    //
+    //   R0 <- obj
+    //   R0 <- ReadBarrierMark(R0)
+    //   obj <- R0
+    //
+    // we just use rX (the register holding `obj`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmWordSize>(reg);
+    // TODO: Do not emit a stack map for this runtime call.
+    arm_codegen->InvokeRuntime(entry_point_offset,
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-    CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>();
-    arm_codegen->Move32(out_, Location::RegisterLocation(R0));
 
     RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
 
  private:
-  const Location out_;
   const Location obj_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM);
@@ -6175,7 +6195,7 @@
 
       // Slow path used to mark the GC root `root`.
       SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root, root);
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root);
       codegen_->AddSlowPath(slow_path);
 
       // IP = Thread::Current()->GetIsGcMarking()
@@ -6324,7 +6344,7 @@
 
   // Slow path used to mark the object `ref` when it is gray.
   SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref, ref);
+      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref);
   AddSlowPath(slow_path);
 
   // if (rb_state == ReadBarrier::gray_ptr_)
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index fc2c2c3..e480f12 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -580,8 +580,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 {
  public:
-  ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location out, Location obj)
-      : SlowPathCodeARM64(instruction), out_(out), obj_(obj) {
+  ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location obj)
+      : SlowPathCodeARM64(instruction), obj_(obj) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -589,9 +589,8 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
-    Primitive::Type type = Primitive::kPrimNot;
     DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg()));
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(obj_.reg()));
     DCHECK(instruction_->IsInstanceFieldGet() ||
            instruction_->IsStaticFieldGet() ||
            instruction_->IsArrayGet() ||
@@ -605,24 +604,44 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    // Save live registers before the runtime call, and in particular
+    // W0 (if it is live), as it is clobbered by functions
+    // art_quick_read_barrier_mark_regX.
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
-    arm64_codegen->MoveLocation(LocationFrom(calling_convention.GetRegisterAt(0)), obj_, type);
-    arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark),
+    DCHECK_NE(obj_.reg(), LR);
+    DCHECK_NE(obj_.reg(), WSP);
+    DCHECK_NE(obj_.reg(), WZR);
+    DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg();
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in W0):
+    //
+    //   W0 <- obj
+    //   W0 <- ReadBarrierMark(W0)
+    //   obj <- W0
+    //
+    // we just use rX (the register holding `obj`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64WordSize>(obj_.reg());
+    // TODO: Do not emit a stack map for this runtime call.
+    arm64_codegen->InvokeRuntime(entry_point_offset,
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
-    CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>();
-    arm64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type);
 
     RestoreLiveRegisters(codegen, locations);
     __ B(GetExitLabel());
   }
 
  private:
-  const Location out_;
   const Location obj_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64);
@@ -5059,7 +5078,7 @@
 
       // Slow path used to mark the GC root `root`.
       SlowPathCodeARM64* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root, root);
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root);
       codegen_->AddSlowPath(slow_path);
 
       MacroAssembler* masm = GetVIXLAssembler();
@@ -5267,7 +5286,7 @@
 
   // Slow path used to mark the object `ref` when it is gray.
   SlowPathCodeARM64* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref, ref);
+      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref);
   AddSlowPath(slow_path);
 
   // if (rb_state == ReadBarrier::gray_ptr_)
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 1261619..b33cabb 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -430,8 +430,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location out, Location obj)
-      : SlowPathCode(instruction), out_(out), obj_(obj) {
+  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj)
+      : SlowPathCode(instruction), obj_(obj) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -439,9 +439,9 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
-    Register reg_out = out_.AsRegister<Register>();
+    Register reg = obj_.AsRegister<Register>();
     DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg));
     DCHECK(instruction_->IsInstanceFieldGet() ||
            instruction_->IsStaticFieldGet() ||
            instruction_->IsArrayGet() ||
@@ -455,24 +455,42 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    // Save live registers before the runtime call, and in particular
+    // EAX (if it is live), as it is clobbered by functions
+    // art_quick_read_barrier_mark_regX.
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
-    x86_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_);
-    x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark),
+    DCHECK_NE(reg, ESP);
+    DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg;
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in EAX):
+    //
+    //   EAX <- obj
+    //   EAX <- ReadBarrierMark(EAX)
+    //   obj <- EAX
+    //
+    // we just use rX (the register holding `obj`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86WordSize>(reg);
+    // TODO: Do not emit a stack map for this runtime call.
+    x86_codegen->InvokeRuntime(entry_point_offset,
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-    CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>();
-    x86_codegen->Move32(out_, Location::RegisterLocation(EAX));
 
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
  private:
-  const Location out_;
   const Location obj_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86);
@@ -6926,7 +6944,7 @@
 
       // Slow path used to mark the GC root `root`.
       SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root, root);
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root);
       codegen_->AddSlowPath(slow_path);
 
       __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86WordSize>().Int32Value()),
@@ -7056,7 +7074,7 @@
 
   // Slow path used to mark the object `ref` when it is gray.
   SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref, ref);
+      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref);
   AddSlowPath(slow_path);
 
   // if (rb_state == ReadBarrier::gray_ptr_)
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5e30203..a524057 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -451,8 +451,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location out, Location obj)
-      : SlowPathCode(instruction), out_(out), obj_(obj) {
+  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj)
+      : SlowPathCode(instruction), obj_(obj) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -460,9 +460,9 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
-    Register reg_out = out_.AsRegister<Register>();
+    Register reg = obj_.AsRegister<Register>();
     DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg));
     DCHECK(instruction_->IsInstanceFieldGet() ||
            instruction_->IsStaticFieldGet() ||
            instruction_->IsArrayGet() ||
@@ -476,24 +476,42 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    // Save live registers before the runtime call, and in particular
+    // RDI and/or RAX (if they are live), as they are clobbered by
+    // functions art_quick_read_barrier_mark_regX.
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
-    x86_64_codegen->Move(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), obj_);
-    x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pReadBarrierMark),
-                               instruction_,
-                               instruction_->GetDexPc(),
-                               this);
-    CheckEntrypointTypes<kQuickReadBarrierMark, mirror::Object*, mirror::Object*>();
-    x86_64_codegen->Move(out_, Location::RegisterLocation(RAX));
+    DCHECK_NE(reg, RSP);
+    DCHECK(0 <= reg && reg < kNumberOfCpuRegisters) << reg;
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in R0):
+    //
+    //   RDI <- obj
+    //   RAX <- ReadBarrierMark(RDI)
+    //   obj <- RAX
+    //
+    // we just use rX (the register holding `obj`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64WordSize>(reg);
+    // TODO: Do not emit a stack map for this runtime call.
+    x86_64_codegen->InvokeRuntime(entry_point_offset,
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
 
     RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
  private:
-  const Location out_;
   const Location obj_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86_64);
@@ -6378,7 +6396,7 @@
 
       // Slow path used to mark the GC root `root`.
       SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root, root);
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root);
       codegen_->AddSlowPath(slow_path);
 
       __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64WordSize>().Int32Value(),
@@ -6509,7 +6527,7 @@
 
   // Slow path used to mark the object `ref` when it is gray.
   SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref, ref);
+      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref);
   AddSlowPath(slow_path);
 
   // if (rb_state == ReadBarrier::gray_ptr_)