Merge "[optimizing] Tune some x86_64 moves"
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 9d2fc43..e633970 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -99,7 +99,7 @@
       if (is_div_) {
         __ negq(cpu_reg_);
       } else {
-        __ movq(cpu_reg_, Immediate(0));
+        __ xorl(cpu_reg_, cpu_reg_);
       }
     }
     __ jmp(GetExitLabel());
@@ -675,7 +675,7 @@
         DCHECK(constant->IsLongConstant());
         value = constant->AsLongConstant()->GetValue();
       }
-      __ movq(CpuRegister(TMP), Immediate(value));
+      Load64BitValue(CpuRegister(TMP), value);
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     } else {
       DCHECK(source.IsDoubleStackSlot());
@@ -708,9 +708,9 @@
     } else if (const_to_move->IsLongConstant()) {
       int64_t value = const_to_move->AsLongConstant()->GetValue();
       if (location.IsRegister()) {
-        __ movq(location.AsRegister<CpuRegister>(), Immediate(value));
+        Load64BitValue(location.AsRegister<CpuRegister>(), value);
       } else if (location.IsDoubleStackSlot()) {
-        __ movq(CpuRegister(TMP), Immediate(value));
+        Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
       } else {
         DCHECK(location.IsConstant());
@@ -959,7 +959,7 @@
     LocationSummary* locations = comp->GetLocations();
     CpuRegister reg = locations->Out().AsRegister<CpuRegister>();
     // Clear register: setcc only sets the low byte.
-    __ xorq(reg, reg);
+    __ xorl(reg, reg);
     Location lhs = locations->InAt(0);
     Location rhs = locations->InAt(1);
     if (rhs.IsRegister()) {
@@ -1422,8 +1422,8 @@
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
   // Set the hidden argument.
-  __ movq(invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>(),
-          Immediate(invoke->GetDexMethodIndex()));
+  CpuRegister hidden_reg = invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>();
+  codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
@@ -1859,7 +1859,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-float(output)
           __ cvtsi2ss(temp, output, true);
           // if input >= temp goto done
@@ -1872,7 +1872,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -1884,7 +1884,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-double(output)
           __ cvtsi2sd(temp, output, true);
           // if input >= temp goto done
@@ -1897,7 +1897,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -2486,7 +2486,7 @@
 
     case Primitive::kPrimLong: {
       if (instruction->IsRem()) {
-        __ xorq(output_register, output_register);
+        __ xorl(output_register, output_register);
       } else {
         __ movq(output_register, input_register);
         if (imm == -1) {
@@ -2530,7 +2530,7 @@
     DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong);
     CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>();
 
-    __ movq(rdx, Immediate(std::abs(imm) - 1));
+    codegen_->Load64BitValue(rdx, std::abs(imm) - 1);
     __ addq(rdx, numerator);
     __ testq(numerator, numerator);
     __ cmov(kGreaterEqual, rdx, numerator);
@@ -2627,7 +2627,7 @@
     __ movq(numerator, rax);
 
     // RAX = magic
-    __ movq(rax, Immediate(magic));
+    codegen_->Load64BitValue(rax, magic);
 
     // RDX:RAX = magic * numerator
     __ imulq(numerator);
@@ -2656,8 +2656,7 @@
       if (IsInt<32>(imm)) {
         __ imulq(rdx, Immediate(static_cast<int32_t>(imm)));
       } else {
-        __ movq(numerator, Immediate(imm));
-        __ imulq(rdx, numerator);
+        __ imulq(rdx, codegen_->LiteralInt64Address(imm));
       }
 
       __ subq(rax, rdx);
@@ -3023,8 +3022,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
-
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
 
@@ -3045,7 +3044,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(2)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
 
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
@@ -3952,45 +3952,42 @@
     } else if (constant->IsLongConstant()) {
       int64_t value = constant->AsLongConstant()->GetValue();
       if (destination.IsRegister()) {
-        __ movq(destination.AsRegister<CpuRegister>(), Immediate(value));
+        codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), Immediate(value));
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     } else if (constant->IsFloatConstant()) {
       float fp_value = constant->AsFloatConstant()->GetValue();
       int32_t value = bit_cast<int32_t, float>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           // easy FP 0.0.
           __ xorps(dest, dest);
         } else {
-          __ movl(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movss(dest, codegen_->LiteralFloatAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
+        Immediate imm(value);
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
       }
     } else {
       DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
       double fp_value =  constant->AsDoubleConstant()->GetValue();
       int64_t value = bit_cast<int64_t, double>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           __ xorpd(dest, dest);
         } else {
-          __ movq(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movsd(dest, codegen_->LiteralDoubleAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), imm);
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     }
@@ -4449,6 +4446,17 @@
   LOG(FATAL) << "Unreachable";
 }
 
+void CodeGeneratorX86_64::Load64BitValue(CpuRegister dest, int64_t value) {
+  if (value == 0) {
+    __ xorl(dest, dest);
+  } else if (value > 0 && IsInt<32>(value)) {
+    // We can use a 32 bit move, as it will zero-extend and has a shorter encoding.
+    __ movl(dest, Immediate(static_cast<int32_t>(value)));
+  } else {
+    __ movq(dest, Immediate(value));
+  }
+}
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 13f9c46..480ea6b 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -282,6 +282,9 @@
   Address LiteralInt32Address(int32_t v);
   Address LiteralInt64Address(int64_t v);
 
+  // Load a 64 bit value into a register in the most efficient manner.
+  void Load64BitValue(CpuRegister dest, int64_t value);
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 1fc5432..3dbcd4c 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -783,7 +783,7 @@
   __ Bind(&nan);
 
   //  output = 0
-  __ xorq(out, out);
+  __ xorl(out, out);
   __ Bind(&done);
 }
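
Note: the new CodeGeneratorX86_64::Load64BitValue picks the smallest encoding that still produces the intended 64-bit value, which is what the movq(reg, Immediate(...)) call sites above are rewritten to use. The snippet below is a minimal standalone sketch of that selection (illustrative only, not ART code; the quoted byte counts assume a low destination register such as RAX, so no REX.B prefix is needed).

#include <cstdint>
#include <cstdio>

// Returns the instruction form Load64BitValue would pick for 'value'.
const char* PickLoadForm(int64_t value) {
  if (value == 0) {
    // xorl dest, dest (2 bytes): 32-bit ops zero-extend, so the whole
    // 64-bit register is cleared without xorq's REX.W prefix.
    return "xorl dest, dest";
  } else if (value > 0 && value <= INT32_MAX) {
    // movl dest, imm32 (5 bytes): zero-extends into the upper half,
    // shorter than movq's 7-byte sign-extended-imm32 form.
    return "movl dest, imm32";
  } else {
    // movq dest, imm: 7 bytes when the value fits a sign-extended imm32
    // (e.g. negative 32-bit values), otherwise the 10-byte movabs form.
    return "movq dest, imm64";
  }
}

int main() {
  const int64_t samples[] = {0, 1, 0x7fffffff, -1, int64_t{1} << 40};
  for (int64_t v : samples) {
    std::printf("%#llx -> %s\n", static_cast<unsigned long long>(v),
                PickLoadForm(v));
  }
  return 0;
}

The floating-point hunks follow the same theme: non-zero float/double constants are now read straight from the constant area with movss/movsd (LiteralFloatAddress/LiteralDoubleAddress) instead of being materialized in TMP and copied over with movd, while xorps/xorpd still handle the 0.0 case.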