Add support for long-to-float in the optimizing compiler.

- Add support for the long-to-float Dex instruction in the
  optimizing compiler.
- Have art::x86_64::X86_64Assembler::cvtsi2ss work with
  64-bit operands.
- Generate x86, x86-64 and ARM (but not ARM64) code for
  long to float HTypeConversion nodes.
- Add related tests to test/422-type-conversion.

Change-Id: Ic983cbeb1ae2051add40bc519a8f00a6196166c9
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 0967a01..7d6d827 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1468,6 +1468,15 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-float' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1638,7 +1647,48 @@
           break;
         }
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-float' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          SRegister output = out.AsFpuRegister<SRegister>();
+          Register constant_low = locations->GetTemp(0).AsRegister<Register>();
+          Register constant_high = locations->GetTemp(1).AsRegister<Register>();
+          SRegister temp1_s = locations->GetTemp(2).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp1_d = FromLowSToD(temp1_s);
+          SRegister temp2_s = locations->GetTemp(3).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp2_d = FromLowSToD(temp2_s);
+
+          // Operations use doubles for precision reasons (each 32-bit
+          // half of a long fits in the 53-bit mantissa of a double,
+          // but not in the 24-bit mantissa of a float).  This is
+          // especially important for the low bits.  The result is
+          // eventually converted to float.
+
+          // temp1_d = int-to-double(high)
+          __ vmovsr(temp1_s, high);
+          __ vcvtdi(temp1_d, temp1_s);
+          // Using vmovd to load the `k2Pow32EncodingForDouble` constant
+          // as an immediate value into `temp2_d` does not work, as
+          // this instruction only transfers 8 significant bits of its
+          // immediate operand.  Instead, use two 32-bit core
+          // registers to load `k2Pow32EncodingForDouble` into
+          // `temp2_d`.
+          __ LoadImmediate(constant_low, Low32Bits(k2Pow32EncodingForDouble));
+          __ LoadImmediate(constant_high, High32Bits(k2Pow32EncodingForDouble));
+          __ vmovdrr(temp2_d, constant_low, constant_high);
+          // temp1_d = temp1_d * 2^32
+          __ vmuld(temp1_d, temp1_d, temp2_d);
+          // temp2_d = unsigned-to-double(low)
+          __ vmovsr(temp2_s, low);
+          __ vcvtdu(temp2_d, temp2_s);
+          // temp1_d = temp1_d + temp2_d
+          __ vaddd(temp1_d, temp1_d, temp2_d);
+          // output = double-to-float(temp1_d);
+          __ vcvtsd(output, temp1_d);
+          break;
+        }
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1674,19 +1724,16 @@
           SRegister temp_s = locations->GetTemp(2).AsFpuRegisterPairLow<SRegister>();
           DRegister temp_d = FromLowSToD(temp_s);
 
-          // Binary encoding of 2^32 for type double.
-          const uint64_t c = UINT64_C(0x41F0000000000000);
-
           // out_d = int-to-double(high)
           __ vmovsr(out_s, high);
           __ vcvtdi(out_d, out_s);
-          // Using vmovd to load the `c` constant as an immediate
-          // value into `temp_d` does not work, as this instruction
-          // only transfers 8 significant bits of its immediate
-          // operand.  Instead, use two 32-bit core registers to
-          // load `c` into `temp_d`.
-          __ LoadImmediate(constant_low, Low32Bits(c));
-          __ LoadImmediate(constant_high, High32Bits(c));
+          // Using vmovd to load the `k2Pow32EncodingForDouble` constant
+          // as an immediate value into `temp_d` does not work, as
+          // this instruction only transfers 8 significant bits of its
+          // immediate operand.  Instead, use two 32-bit core
+          // registers to load `k2Pow32EncodingForDouble` into `temp_d`.
+          __ LoadImmediate(constant_low, Low32Bits(k2Pow32EncodingForDouble));
+          __ LoadImmediate(constant_high, High32Bits(k2Pow32EncodingForDouble));
           __ vmovdrr(temp_d, constant_low, constant_high);
           // out_d = out_d * 2^32
           __ vmuld(out_d, out_d, temp_d);