Add support for long-to-double in the optimizing compiler.

- Add support for the long-to-double Dex instruction in the
  optimizing compiler.
- Enable requests of temporary FPU (double) registers during
  code generation.
- Fix art::x86::X86Assembler::LoadLongConstant and extend
  it to int64_t values.
- Have art::x86_64::X86_64Assembler::cvtsi2sd work with
  64-bit operands.
- Generate x86, x86-64 and ARM (but not ARM64) code for
  long to double HTypeConversion nodes.
- Add related tests to test/422-type-conversion.

Change-Id: Ie73d9e5e25bd2e15f585c371e8fc2dcb83438ccd
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index b261460..35360fe 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -1034,6 +1034,11 @@
       break;
     }
 
+    case Instruction::LONG_TO_DOUBLE: {
+      Conversion_12x(instruction, Primitive::kPrimLong, Primitive::kPrimDouble);
+      break;
+    }
+
     case Instruction::INT_TO_BYTE: {
       Conversion_12x(instruction, Primitive::kPrimInt, Primitive::kPrimByte);
       break;
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 0b59327..6cacd4f 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -228,7 +228,8 @@
       DCHECK(!blocked_fpu_registers_[loc.reg()]);
       blocked_fpu_registers_[loc.reg()] = true;
     } else {
-      DCHECK_EQ(loc.GetPolicy(), Location::kRequiresRegister);
+      DCHECK(loc.GetPolicy() == Location::kRequiresRegister
+             || loc.GetPolicy() == Location::kRequiresFpuRegister);
     }
   }
 
@@ -259,10 +260,21 @@
   for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
     Location loc = locations->GetTemp(i);
     if (loc.IsUnallocated()) {
-      DCHECK_EQ(loc.GetPolicy(), Location::kRequiresRegister);
-      // TODO: Adjust handling of temps. We currently consider temps to use
-      // core registers. They may also use floating point registers at some point.
-      loc = AllocateFreeRegister(Primitive::kPrimInt);
+      switch (loc.GetPolicy()) {
+        case Location::kRequiresRegister:
+          // Allocate a core register (large enough to fit a 32-bit integer).
+          loc = AllocateFreeRegister(Primitive::kPrimInt);
+          break;
+
+        case Location::kRequiresFpuRegister:
+          // Allocate a core register (large enough to fit a 64-bit double).
+          loc = AllocateFreeRegister(Primitive::kPrimDouble);
+          break;
+
+        default:
+          LOG(FATAL) << "Unexpected policy for temporary location "
+                     << loc.GetPolicy();
+      }
       locations->SetTempAt(i, loc);
     }
   }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 890cfdd..22a8590 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1481,6 +1481,14 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1645,7 +1653,41 @@
           break;
         }
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-double' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          SRegister out_s = out.AsFpuRegisterPairLow<SRegister>();
+          DRegister out_d = FromLowSToD(out_s);
+          Register constant_low = locations->GetTemp(0).As<Register>();
+          Register constant_high = locations->GetTemp(1).As<Register>();
+          SRegister temp_s = locations->GetTemp(2).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp_d = FromLowSToD(temp_s);
+
+          // Binary encoding of 2^32 for type double.
+          const uint64_t c = UINT64_C(0x41F0000000000000);
+
+          // out_d = int-to-double(high)
+          __ vmovsr(out_s, high);
+          __ vcvtdi(out_d, out_s);
+          // Using vmovd to load the `c` constant as an immediate
+          // value into `temp_d` does not work, as this instruction
+          // only transfers 8 significant bits of its immediate
+          // operand.  Instead, use two 32-bit core registers to
+          // load `c` into `temp_d`.
+          __ LoadImmediate(constant_low, Low32Bits(c));
+          __ LoadImmediate(constant_high, High32Bits(c));
+          __ vmovdrr(temp_d, constant_low, constant_high);
+          // out_d = out_d * 2^32
+          __ vmuld(out_d, out_d, temp_d);
+          // temp_d = unsigned-to-double(low)
+          __ vmovsr(temp_s, low);
+          __ vcvtdu(temp_d, temp_s);
+          // out_d = out_d + temp_d
+          __ vaddd(out_d, out_d, temp_d);
+          break;
+        }
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 3689452..c482885 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1429,6 +1429,13 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1608,7 +1615,37 @@
           __ cvtsi2sd(out.As<XmmRegister>(), in.As<Register>());
           break;
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-double' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          XmmRegister result = out.As<XmmRegister>();
+          XmmRegister temp = locations->GetTemp(0).As<XmmRegister>();
+          XmmRegister constant = locations->GetTemp(1).As<XmmRegister>();
+
+          // Binary encoding of 2^32 for type double.
+          const int64_t c1 = INT64_C(0x41F0000000000000);
+          // Binary encoding of 2^31 for type double.
+          const int64_t c2 = INT64_C(0x41E0000000000000);
+
+          // low = low - 2^31 (to prevent bit 31 of `low` to be
+          // interpreted as a sign bit)
+          __ subl(low, Immediate(0x80000000));
+          // temp = int-to-double(high)
+          __ cvtsi2sd(temp, high);
+          // temp = temp * 2^32
+          __ LoadLongConstant(constant, c1);
+          __ mulsd(temp, constant);
+          // result = int-to-double(low)
+          __ cvtsi2sd(result, low);
+          // result = result + 2^31 (restore the original value of `low`)
+          __ LoadLongConstant(constant, c2);
+          __ addsd(result, constant);
+          // result = result + temp
+          __ addsd(result, temp);
+          break;
+        }
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 34fa1e7..63938f3 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1430,6 +1430,11 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1609,10 +1614,14 @@
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
-          __ cvtsi2sd(out.As<XmmRegister>(), in.As<CpuRegister>());
+          __ cvtsi2sd(out.As<XmmRegister>(), in.As<CpuRegister>(), false);
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          __ cvtsi2sd(out.As<XmmRegister>(), in.As<CpuRegister>(), true);
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";