Implement double and float support for arm in register allocator.

The basic approach is:
- An instruction that needs two registers gets two intervals.
- When allocating the low part, we also allocate the high part.
- When splitting a low (or high) interval, we also split the high
  (or low) equivalent.
- Allocation follows the (S/D register) requirement that low
  registers are always even and the high equivalent is low + 1.

Change-Id: I06a5148e05a2ffc7e7555d08e871ed007b4c2797
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 4d8154e..ada0fb7 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -620,6 +620,14 @@
         break;
       }
 
+      case Location::kFpuRegisterPair : {
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInFpuRegister, location.low());
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInFpuRegister, location.high());
+        ++i;
+        DCHECK_LT(i, environment_size);
+        break;
+      }
+
       default:
         LOG(FATAL) << "Unexpected kind " << location.GetKind();
     }
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 4205ebe..9880239 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -142,6 +142,7 @@
     UNIMPLEMENTED(FATAL);
     UNREACHABLE();
   }
+  virtual bool NeedsTwoRegisters(Primitive::Type type) const = 0;
 
   void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 3b3fb64..63f5f94 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -373,6 +373,16 @@
   return kArmWordSize;
 }
 
+size_t CodeGeneratorARM::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ StoreSToOffset(static_cast<SRegister>(reg_id), SP, stack_index);
+  return kArmWordSize;
+}
+
+size_t CodeGeneratorARM::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ LoadSFromOffset(static_cast<SRegister>(reg_id), SP, stack_index);
+  return kArmWordSize;
+}
+
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph,
                                    const ArmInstructionSetFeatures* isa_features)
     : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters, kNumberOfRegisterPairs),
@@ -802,7 +812,8 @@
         __ LoadImmediate(IP, value);
         __ StoreToOffset(kStoreWord, IP, SP, location.GetStackIndex());
       }
-    } else if (const_to_move->IsLongConstant()) {
+    } else {
+      DCHECK(const_to_move->IsLongConstant()) << const_to_move;
       int64_t value = const_to_move->AsLongConstant()->GetValue();
       if (location.IsRegisterPair()) {
         __ LoadImmediate(location.AsRegisterPairLow<Register>(), Low32Bits(value));
@@ -2994,10 +3005,34 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      Location out = locations->Out();
+      DCHECK(out.IsFpuRegister());
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+        __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), obj, offset);
+      } else {
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
+        __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), IP, data_offset);
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      Location out = locations->Out();
+      DCHECK(out.IsFpuRegisterPair());
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        __ LoadDFromOffset(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()), obj, offset);
+      } else {
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8));
+        __ LoadDFromOffset(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()), IP, data_offset);
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
@@ -3114,12 +3149,36 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      Location value = locations->InAt(2);
+      DCHECK(value.IsFpuRegister());
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+        __ StoreSToOffset(value.AsFpuRegister<SRegister>(), obj, offset);
+      } else {
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
+        __ StoreSToOffset(value.AsFpuRegister<SRegister>(), IP, data_offset);
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      Location value = locations->InAt(2);
+      DCHECK(value.IsFpuRegisterPair());
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        __ StoreDToOffset(FromLowSToD(value.AsFpuRegisterPairLow<SRegister>()), obj, offset);
+      } else {
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8));
+        __ StoreDToOffset(FromLowSToD(value.AsFpuRegisterPairLow<SRegister>()), IP, data_offset);
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
-      LOG(FATAL) << "Unreachable type " << instruction->GetType();
+      LOG(FATAL) << "Unreachable type " << value_type;
       UNREACHABLE();
   }
 }
@@ -3247,21 +3306,62 @@
     if (destination.IsRegister()) {
       __ LoadFromOffset(kLoadWord, destination.AsRegister<Register>(),
                         SP, source.GetStackIndex());
+    } else if (destination.IsFpuRegister()) {
+      __ LoadSFromOffset(destination.AsFpuRegister<SRegister>(), SP, source.GetStackIndex());
     } else {
       DCHECK(destination.IsStackSlot());
       __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
       __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
     }
-  } else {
-    DCHECK(source.IsConstant());
-    DCHECK(source.GetConstant()->IsIntConstant());
-    int32_t value = source.GetConstant()->AsIntConstant()->GetValue();
-    if (destination.IsRegister()) {
-      __ LoadImmediate(destination.AsRegister<Register>(), value);
+  } else if (source.IsFpuRegister()) {
+    if (destination.IsFpuRegister()) {
+      __ vmovs(destination.AsFpuRegister<SRegister>(), source.AsFpuRegister<SRegister>());
     } else {
       DCHECK(destination.IsStackSlot());
-      __ LoadImmediate(IP, value);
+      __ StoreSToOffset(source.AsFpuRegister<SRegister>(), SP, destination.GetStackIndex());
+    }
+  } else if (source.IsFpuRegisterPair()) {
+    if (destination.IsFpuRegisterPair()) {
+      __ vmovd(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()));
+    } else {
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
+      __ StoreDToOffset(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
+                        SP, destination.GetStackIndex());
+    }
+  } else if (source.IsDoubleStackSlot()) {
+    if (destination.IsFpuRegisterPair()) {
+      __ LoadDFromOffset(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
+                         SP, source.GetStackIndex());
+    } else {
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
       __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
+      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+    }
+  } else {
+    DCHECK(source.IsConstant()) << source;
+    HInstruction* constant = source.GetConstant();
+    if (constant->IsIntConstant()) {
+      int32_t value = constant->AsIntConstant()->GetValue();
+      if (destination.IsRegister()) {
+        __ LoadImmediate(destination.AsRegister<Register>(), value);
+      } else {
+        DCHECK(destination.IsStackSlot());
+        __ LoadImmediate(IP, value);
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+      }
+    } else {
+      DCHECK(constant->IsFloatConstant());
+      float value = constant->AsFloatConstant()->GetValue();
+      if (destination.IsFpuRegister()) {
+        __ LoadSImmediate(destination.AsFpuRegister<SRegister>(), value);
+      } else {
+        DCHECK(destination.IsStackSlot());
+        __ LoadImmediate(IP, bit_cast<int32_t, float>(value));
+        __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+      }
     }
   }
 }
@@ -3300,6 +3400,20 @@
     Exchange(destination.AsRegister<Register>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange(source.GetStackIndex(), destination.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
+    __ vmovrs(IP, source.AsFpuRegister<SRegister>());
+    __ vmovs(source.AsFpuRegister<SRegister>(), destination.AsFpuRegister<SRegister>());
+    __ vmovsr(destination.AsFpuRegister<SRegister>(), IP);
+  } else if (source.IsFpuRegister() || destination.IsFpuRegister()) {
+    SRegister reg = source.IsFpuRegister() ? source.AsFpuRegister<SRegister>()
+                                           : destination.AsFpuRegister<SRegister>();
+    int mem = source.IsFpuRegister()
+        ? destination.GetStackIndex()
+        : source.GetStackIndex();
+
+    __ vmovrs(IP, reg);
+    __ LoadSFromOffset(reg, SP, mem);
+    __ StoreToOffset(kStoreWord, IP, SP, mem);
   } else {
     LOG(FATAL) << "Unimplemented";
   }
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 40f4edc..c1b4eda 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -168,6 +168,8 @@
   void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
   size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
   size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
 
   size_t GetWordSize() const OVERRIDE {
     return kArmWordSize;
@@ -237,6 +239,10 @@
     return isa_features_;
   }
 
+  bool NeedsTwoRegisters(Primitive::Type type) const OVERRIDE {
+    return type == Primitive::kPrimDouble || type == Primitive::kPrimLong;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 19488a4..e4da07b 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -267,6 +267,10 @@
 
   ParallelMoveResolverARM64* GetMoveResolver() { return &move_resolver_; }
 
+  bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
+    return false;
+  }
+
  private:
   // Labels for each block that will be compiled.
   vixl::Label* block_labels_;
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 636f884..acde122 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -222,6 +222,10 @@
     block_labels_.SetSize(GetGraph()->GetBlocks().Size());
   }
 
+  bool NeedsTwoRegisters(Primitive::Type type) const OVERRIDE {
+    return type == Primitive::kPrimLong;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 0708864..87f6b0f 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -218,6 +218,10 @@
     block_labels_.SetSize(GetGraph()->GetBlocks().Size());
   }
 
+  bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
+    return false;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 9ed1e45..8a7bfd6 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -30,10 +30,12 @@
   HGraphVisualizerPrinter(HGraph* graph,
                           std::ostream& output,
                           const char* pass_name,
+                          bool is_after_pass,
                           const CodeGenerator& codegen)
       : HGraphVisitor(graph),
         output_(output),
         pass_name_(pass_name),
+        is_after_pass_(is_after_pass),
         codegen_(codegen),
         indent_(0) {}
 
@@ -136,6 +138,10 @@
       output_ << "invalid";
     } else if (location.IsStackSlot()) {
       output_ << location.GetStackIndex() << "(sp)";
+    } else if (location.IsFpuRegisterPair()) {
+      codegen_.DumpFloatingPointRegister(output_, location.low());
+      output_ << " and ";
+      codegen_.DumpFloatingPointRegister(output_, location.high());
     } else {
       DCHECK(location.IsDoubleStackSlot());
       output_ << "2x" << location.GetStackIndex() << "(sp)";
@@ -224,7 +230,8 @@
 
   void Run() {
     StartTag("cfg");
-    PrintProperty("name", pass_name_);
+    std::string pass_desc = std::string(pass_name_) + (is_after_pass_ ? " (after)" : " (before)");
+    PrintProperty("name", pass_desc.c_str());
     VisitInsertionOrder();
     EndTag("cfg");
   }
@@ -275,6 +282,7 @@
  private:
   std::ostream& output_;
   const char* pass_name_;
+  const bool is_after_pass_;
   const CodeGenerator& codegen_;
   size_t indent_;
 
@@ -295,7 +303,7 @@
   }
 
   is_enabled_ = true;
-  HGraphVisualizerPrinter printer(graph_, *output_, "", codegen_);
+  HGraphVisualizerPrinter printer(graph_, *output_, "", true, codegen_);
   printer.StartTag("compilation");
   printer.PrintProperty("name", method_name);
   printer.PrintProperty("method", method_name);
@@ -305,8 +313,7 @@
 
 void HGraphVisualizer::DumpGraph(const char* pass_name, bool is_after_pass) const {
   if (is_enabled_) {
-    std::string pass_desc = std::string(pass_name) + (is_after_pass ? " (after)" : " (before)");
-    HGraphVisualizerPrinter printer(graph_, *output_, pass_desc.c_str(), codegen_);
+    HGraphVisualizerPrinter printer(graph_, *output_, pass_name, is_after_pass, codegen_);
     printer.Run();
   }
 }
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 1ff26d9..7df99d4 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -160,6 +160,16 @@
     return GetPayload();
   }
 
+  int low() const {
+    DCHECK(IsPair());
+    return GetPayload() >> 16;
+  }
+
+  int high() const {
+    DCHECK(IsPair());
+    return GetPayload() & 0xFFFF;
+  }
+
   template <typename T>
   T AsRegister() const {
     DCHECK(IsRegister());
@@ -175,25 +185,41 @@
   template <typename T>
   T AsRegisterPairLow() const {
     DCHECK(IsRegisterPair());
-    return static_cast<T>(GetPayload() >> 16);
+    return static_cast<T>(low());
   }
 
   template <typename T>
   T AsRegisterPairHigh() const {
     DCHECK(IsRegisterPair());
-    return static_cast<T>(GetPayload() & 0xFFFF);
+    return static_cast<T>(high());
   }
 
   template <typename T>
   T AsFpuRegisterPairLow() const {
     DCHECK(IsFpuRegisterPair());
-    return static_cast<T>(GetPayload() >> 16);
+    return static_cast<T>(low());
   }
 
   template <typename T>
   T AsFpuRegisterPairHigh() const {
     DCHECK(IsFpuRegisterPair());
-    return static_cast<T>(GetPayload() & 0xFFFF);
+    return static_cast<T>(high());
+  }
+
+  bool IsPair() const {
+    return IsRegisterPair() || IsFpuRegisterPair();
+  }
+
+  Location ToLow() const {
+    return IsRegisterPair()
+        ? Location::RegisterLocation(low())
+        : Location::FpuRegisterLocation(low());
+  }
+
+  Location ToHigh() const {
+    return IsRegisterPair()
+        ? Location::RegisterLocation(high())
+        : Location::FpuRegisterLocation(high());
   }
 
   static uintptr_t EncodeStackIndex(intptr_t stack_index) {
@@ -264,6 +290,18 @@
     return value_ == other.value_;
   }
 
+  // Returns whether this location contains `other`.
+  bool Contains(Location other) const {
+    if (Equals(other)) return true;
+    if (IsRegisterPair() && other.IsRegister()) {
+      return low() == other.reg() || high() == other.reg();
+    }
+    if (IsFpuRegisterPair() && other.IsFpuRegister()) {
+      return low() == other.reg() || high() == other.reg();
+    }
+    return false;
+  }
+
   const char* DebugString() const {
     switch (GetKind()) {
       case kInvalid: return "I";
@@ -525,7 +563,8 @@
         && (output_.GetPolicy() == Location::kSameAsFirstInput)) {
       return false;
     }
-    if (inputs_.Get(input_index).IsRegister() || inputs_.Get(input_index).IsFpuRegister()) {
+    Location input = inputs_.Get(input_index);
+    if (input.IsRegister() || input.IsFpuRegister() || input.IsPair()) {
       return false;
     }
     return true;
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 0fc1fd8..b98bc70 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2734,7 +2734,7 @@
 
   // True if this blocks a move from the given location.
   bool Blocks(Location loc) const {
-    return !IsEliminated() && source_.Equals(loc);
+    return !IsEliminated() && source_.Contains(loc);
   }
 
   // A move is redundant if it's been eliminated, if its source and
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index c1c805d..1efc52b 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -27,6 +27,12 @@
 static constexpr size_t kMaxLifetimePosition = -1;
 static constexpr size_t kDefaultNumberOfSpillSlots = 4;
 
+// For simplicity, we implement register pairs as (reg, reg + 1).
+// Note that this is a requirement for double registers on ARM, since we
+// allocate SRegister.
+static int GetHighForLowRegister(int reg) { return reg + 1; }
+static bool IsLowRegister(int reg) { return (reg & 1) == 0; }
+
 RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator,
                                      CodeGenerator* codegen,
                                      const SsaLivenessAnalysis& liveness)
@@ -72,10 +78,16 @@
          !it.Done();
          it.Advance()) {
       HInstruction* current = it.Current();
-      if (current->GetType() == Primitive::kPrimLong ||
-          current->GetType() == Primitive::kPrimFloat ||
-          current->GetType() == Primitive::kPrimDouble) {
-        return false;
+      if (instruction_set == kX86) {
+        if (current->GetType() == Primitive::kPrimLong ||
+            current->GetType() == Primitive::kPrimFloat ||
+            current->GetType() == Primitive::kPrimDouble) {
+          return false;
+        }
+      } else if (instruction_set == kArm || instruction_set == kThumb2) {
+        if (current->GetType() == Primitive::kPrimLong) {
+          return false;
+        }
       }
     }
   }
@@ -130,7 +142,7 @@
       : physical_fp_register_intervals_.Get(reg);
   Primitive::Type type = location.IsRegister()
       ? Primitive::kPrimInt
-      : Primitive::kPrimDouble;
+      : Primitive::kPrimFloat;
   if (interval == nullptr) {
     interval = LiveInterval::MakeFixedInterval(allocator_, reg, type);
     if (location.IsRegister()) {
@@ -226,6 +238,12 @@
               LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble);
           temp_intervals_.Add(interval);
           interval->AddRange(position, position + 1);
+          if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) {
+            interval->AddHighInterval(true);
+            LiveInterval* high = interval->GetHighInterval();
+            temp_intervals_.Add(high);
+            unhandled_fp_intervals_.Add(high);
+          }
           unhandled_fp_intervals_.Add(interval);
           break;
         }
@@ -279,6 +297,9 @@
     Location input = locations->InAt(i);
     if (input.IsRegister() || input.IsFpuRegister()) {
       BlockRegister(input, position, position + 1);
+    } else if (input.IsPair()) {
+      BlockRegister(input.ToLow(), position, position + 1);
+      BlockRegister(input.ToHigh(), position, position + 1);
     }
   }
 
@@ -291,6 +312,10 @@
 
   DCHECK(unhandled.IsEmpty() || current->StartsBeforeOrAt(unhandled.Peek()));
 
+  if (codegen_->NeedsTwoRegisters(current->GetType())) {
+    current->AddHighInterval();
+  }
+
   // Some instructions define their output in fixed register/stack slot. We need
   // to ensure we know these locations before doing register allocation. For a
   // given register, we create an interval that covers these locations. The register
@@ -304,14 +329,30 @@
     if (first.IsRegister() || first.IsFpuRegister()) {
       current->SetFrom(position + 1);
       current->SetRegister(first.reg());
+    } else if (first.IsPair()) {
+      current->SetFrom(position + 1);
+      current->SetRegister(first.low());
+      LiveInterval* high = current->GetHighInterval();
+      high->SetRegister(first.high());
+      high->SetFrom(position + 1);
     }
   } else if (output.IsRegister() || output.IsFpuRegister()) {
     // Shift the interval's start by one to account for the blocked register.
     current->SetFrom(position + 1);
     current->SetRegister(output.reg());
     BlockRegister(output, position, position + 1);
+  } else if (output.IsPair()) {
+    current->SetFrom(position + 1);
+    current->SetRegister(output.low());
+    LiveInterval* high = current->GetHighInterval();
+    high->SetRegister(output.high());
+    high->SetFrom(position + 1);
+    BlockRegister(output.ToLow(), position, position + 1);
+    BlockRegister(output.ToHigh(), position, position + 1);
   } else if (output.IsStackSlot() || output.IsDoubleStackSlot()) {
     current->SetSpillSlot(output.GetStackIndex());
+  } else {
+    DCHECK(output.IsUnallocated() || output.IsConstant());
   }
 
   // If needed, add interval to the list of unhandled intervals.
@@ -516,6 +557,7 @@
     LiveInterval* current = unhandled_->Pop();
     DCHECK(!current->IsFixed() && !current->HasSpillSlot());
     DCHECK(unhandled_->IsEmpty() || unhandled_->Peek()->GetStart() >= current->GetStart());
+    DCHECK(!current->IsLowInterval() || unhandled_->Peek()->IsHighInterval());
 
     size_t position = current->GetStart();
 
@@ -566,6 +608,13 @@
       continue;
     }
 
+    if (current->IsHighInterval() && !current->GetLowInterval()->HasRegister()) {
+      DCHECK(!current->HasRegister());
+      // Allocating the low part was unsucessful. The splitted interval for the high part
+      // will be handled next (it is in the `unhandled_` list).
+      continue;
+    }
+
     // (4) Try to find an available register.
     bool success = TryAllocateFreeReg(current);
 
@@ -578,6 +627,9 @@
     //     intervals.
     if (success) {
       active_.Add(current);
+      if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) {
+        current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister()));
+      }
     }
   }
 }
@@ -630,26 +682,31 @@
   if (current->HasRegister()) {
     // Some instructions have a fixed register output.
     reg = current->GetRegister();
-    DCHECK_NE(free_until[reg], 0u);
+    if (free_until[reg] == 0) {
+      DCHECK(current->IsHighInterval());
+      // AllocateBlockedReg will spill the holder of the register.
+      return false;
+    }
   } else {
+    DCHECK(!current->IsHighInterval());
     int hint = current->FindFirstRegisterHint(free_until);
     if (hint != kNoRegister) {
       DCHECK(!IsBlocked(hint));
       reg = hint;
+    } else if (current->IsLowInterval()) {
+      reg = FindAvailableRegisterPair(free_until);
     } else {
-      // Pick the register that is free the longest.
-      for (size_t i = 0; i < number_of_registers_; ++i) {
-        if (IsBlocked(i)) continue;
-        if (reg == -1 || free_until[i] > free_until[reg]) {
-          reg = i;
-          if (free_until[i] == kMaxLifetimePosition) break;
-        }
-      }
+      reg = FindAvailableRegister(free_until);
     }
   }
 
+  DCHECK_NE(reg, -1);
   // If we could not find a register, we need to spill.
-  if (reg == -1 || free_until[reg] == 0) {
+  if (free_until[reg] == 0) {
+    return false;
+  }
+
+  if (current->IsLowInterval() && free_until[GetHighForLowRegister(reg)] == 0) {
     return false;
   }
 
@@ -671,6 +728,40 @@
       : blocked_fp_registers_[reg];
 }
 
+int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use) const {
+  int reg = -1;
+  // Pick the register pair that is used the last.
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    if (IsBlocked(i)) continue;
+    if (!IsLowRegister(i)) continue;
+    int high_register = GetHighForLowRegister(i);
+    if (IsBlocked(high_register)) continue;
+    int existing_high_register = GetHighForLowRegister(reg);
+    if ((reg == -1) || (next_use[i] >= next_use[reg]
+                        && next_use[high_register] >= next_use[existing_high_register])) {
+      reg = i;
+      if (next_use[i] == kMaxLifetimePosition
+          && next_use[high_register] == kMaxLifetimePosition) {
+        break;
+      }
+    }
+  }
+  return reg;
+}
+
+int RegisterAllocator::FindAvailableRegister(size_t* next_use) const {
+  int reg = -1;
+  // Pick the register that is used the last.
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    if (IsBlocked(i)) continue;
+    if (reg == -1 || next_use[i] > next_use[reg]) {
+      reg = i;
+      if (next_use[i] == kMaxLifetimePosition) break;
+    }
+  }
+  return reg;
+}
+
 // Find the register that is used the last, and spill the interval
 // that holds it. If the first use of `current` is after that register
 // we spill `current` instead.
@@ -731,17 +822,20 @@
     }
   }
 
-  // Pick the register that is used the last.
   int reg = -1;
-  for (size_t i = 0; i < number_of_registers_; ++i) {
-    if (IsBlocked(i)) continue;
-    if (reg == -1 || next_use[i] > next_use[reg]) {
-      reg = i;
-      if (next_use[i] == kMaxLifetimePosition) break;
-    }
+  if (current->HasRegister()) {
+    DCHECK(current->IsHighInterval());
+    reg = current->GetRegister();
+  } else if (current->IsLowInterval()) {
+    reg = FindAvailableRegisterPair(next_use);
+  } else {
+    DCHECK(!current->IsHighInterval());
+    reg = FindAvailableRegister(next_use);
   }
 
-  if (first_register_use >= next_use[reg]) {
+  if ((first_register_use >= next_use[reg])
+      || (current->IsLowInterval() && first_register_use >= next_use[GetHighForLowRegister(reg)])) {
+    DCHECK(!current->IsHighInterval());
     // If the first use of that instruction is after the last use of the found
     // register, we split this interval just before its first register use.
     AllocateSpillSlotFor(current);
@@ -815,23 +909,49 @@
       break;
     }
   }
+
   array->InsertAt(insert_at, interval);
+  // Insert the high interval before the low, to ensure the low is processed before.
+  if (interval->HasHighInterval()) {
+    array->InsertAt(insert_at, interval->GetHighInterval());
+  } else if (interval->HasLowInterval()) {
+    array->InsertAt(insert_at + 1, interval->GetLowInterval());
+  }
 }
 
 LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) {
-  DCHECK(position >= interval->GetStart());
+  DCHECK_GE(position, interval->GetStart());
   DCHECK(!interval->IsDeadAt(position));
   if (position == interval->GetStart()) {
     // Spill slot will be allocated when handling `interval` again.
     interval->ClearRegister();
+    if (interval->HasHighInterval()) {
+      interval->GetHighInterval()->ClearRegister();
+    } else if (interval->HasLowInterval()) {
+      interval->GetLowInterval()->ClearRegister();
+    }
     return interval;
   } else {
     LiveInterval* new_interval = interval->SplitAt(position);
+    if (interval->HasHighInterval()) {
+      LiveInterval* high = interval->GetHighInterval()->SplitAt(position);
+      new_interval->SetHighInterval(high);
+      high->SetLowInterval(new_interval);
+    } else if (interval->HasLowInterval()) {
+      LiveInterval* low = interval->GetLowInterval()->SplitAt(position);
+      new_interval->SetLowInterval(low);
+      low->SetHighInterval(new_interval);
+    }
     return new_interval;
   }
 }
 
 void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) {
+  if (interval->IsHighInterval()) {
+    // The low interval will contain the spill slot.
+    return;
+  }
+
   LiveInterval* parent = interval->GetParent();
 
   // An instruction gets a spill slot for its entire lifetime. If the parent
@@ -898,6 +1018,7 @@
 static bool IsValidDestination(Location destination) {
   return destination.IsRegister()
       || destination.IsFpuRegister()
+      || destination.IsFpuRegisterPair()
       || destination.IsStackSlot()
       || destination.IsDoubleStackSlot();
 }
@@ -905,7 +1026,6 @@
 void RegisterAllocator::AddInputMoveFor(HInstruction* user,
                                         Location source,
                                         Location destination) const {
-  DCHECK(IsValidDestination(destination));
   if (source.Equals(destination)) return;
 
   DCHECK(!user->IsPhi());
@@ -1075,9 +1195,7 @@
   if (current->HasSpillSlot() && current->HasRegister()) {
     // We spill eagerly, so move must be at definition.
     InsertMoveAfter(interval->GetDefinedBy(),
-                    interval->IsFloatingPoint()
-                        ? Location::FpuRegisterLocation(interval->GetRegister())
-                        : Location::RegisterLocation(interval->GetRegister()),
+                    interval->ToLocation(),
                     interval->NeedsTwoSpillSlots()
                         ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot())
                         : Location::StackSlot(interval->GetParent()->GetSpillSlot()));
@@ -1148,6 +1266,11 @@
           locations->AddLiveRegister(source);
           break;
         }
+        case Location::kFpuRegisterPair: {
+          locations->AddLiveRegister(source.ToLow());
+          locations->AddLiveRegister(source.ToHigh());
+          break;
+        }
         case Location::kStackSlot:  // Fall-through
         case Location::kDoubleStackSlot:  // Fall-through
         case Location::kConstant: {
@@ -1307,6 +1430,10 @@
   size_t temp_index = 0;
   for (size_t i = 0; i < temp_intervals_.Size(); ++i) {
     LiveInterval* temp = temp_intervals_.Get(i);
+    if (temp->IsHighInterval()) {
+      // High intervals can be skipped, they are already handled by the low interval.
+      continue;
+    }
     HInstruction* at = liveness_.GetTempUser(temp);
     if (at != current) {
       temp_index = 0;
@@ -1320,14 +1447,14 @@
         break;
 
       case Primitive::kPrimDouble:
-        // TODO: Support the case of ARM, where a double value
-        // requires an FPU register pair (note that the ARM back end
-        // does not yet use this register allocator when a method uses
-        // floats or doubles).
-        DCHECK(codegen_->GetInstructionSet() != kArm
-               && codegen_->GetInstructionSet() != kThumb2);
-        locations->SetTempAt(
-            temp_index++, Location::FpuRegisterLocation(temp->GetRegister()));
+        if (codegen_->NeedsTwoRegisters(Primitive::kPrimDouble)) {
+          Location location = Location::FpuRegisterPairLocation(
+              temp->GetRegister(), temp->GetHighInterval()->GetRegister());
+          locations->SetTempAt(temp_index++, location);
+        } else {
+          locations->SetTempAt(
+              temp_index++, Location::FpuRegisterLocation(temp->GetRegister()));
+        }
         break;
 
       default:
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index cbe741c..c152a8b 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -128,6 +128,8 @@
   bool ValidateInternal(bool log_fatal_on_failure) const;
   void DumpInterval(std::ostream& stream, LiveInterval* interval) const;
   void DumpAllIntervals(std::ostream& stream) const;
+  int FindAvailableRegisterPair(size_t* next_use) const;
+  int FindAvailableRegister(size_t* next_use) const;
 
   ArenaAllocator* const allocator_;
   CodeGenerator* const codegen_;
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 660a5c5..d41157b 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -419,10 +419,21 @@
 }
 
 Location LiveInterval::ToLocation() const {
+  DCHECK(!IsHighInterval());
   if (HasRegister()) {
-    return IsFloatingPoint()
-        ? Location::FpuRegisterLocation(GetRegister())
-        : Location::RegisterLocation(GetRegister());
+    if (IsFloatingPoint()) {
+      if (HasHighInterval()) {
+        return Location::FpuRegisterPairLocation(GetRegister(), GetHighInterval()->GetRegister());
+      } else {
+        return Location::FpuRegisterLocation(GetRegister());
+      }
+    } else {
+      if (HasHighInterval()) {
+        return Location::RegisterPairLocation(GetRegister(), GetHighInterval()->GetRegister());
+      } else {
+        return Location::RegisterLocation(GetRegister());
+      }
+    }
   } else {
     HInstruction* defined_by = GetParent()->GetDefinedBy();
     if (defined_by->IsConstant()) {
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 2312389..74611e1 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -77,6 +77,15 @@
     stream << "[" << start_ << ", " << end_ << ")";
   }
 
+  LiveRange* Dup(ArenaAllocator* allocator) const {
+    return new (allocator) LiveRange(
+        start_, end_, next_ == nullptr ? nullptr : next_->Dup(allocator));
+  }
+
+  LiveRange* GetLastRange() {
+    return next_ == nullptr ? this : next_->GetLastRange();
+  }
+
  private:
   size_t start_;
   size_t end_;
@@ -123,6 +132,12 @@
     stream << position_;
   }
 
+  UsePosition* Dup(ArenaAllocator* allocator) const {
+    return new (allocator) UsePosition(
+        user_, input_index_, is_environment_, position_,
+        next_ == nullptr ? nullptr : next_->Dup(allocator));
+  }
+
  private:
   HInstruction* const user_;
   const size_t input_index_;
@@ -478,6 +493,8 @@
     }
     stream << "}";
     stream << " is_fixed: " << is_fixed_ << ", is_split: " << IsSplit();
+    stream << " is_high: " << IsHighInterval();
+    stream << " is_low: " << IsLowInterval();
   }
 
   LiveInterval* GetNextSibling() const { return next_sibling_; }
@@ -512,6 +529,58 @@
   // Returns whether `other` and `this` share the same kind of register.
   bool SameRegisterKind(Location other) const;
 
+  bool HasHighInterval() const {
+    return !IsHighInterval() && (GetParent()->high_or_low_interval_ != nullptr);
+  }
+
+  bool HasLowInterval() const {
+    return IsHighInterval();
+  }
+
+  LiveInterval* GetLowInterval() const {
+    DCHECK(HasLowInterval());
+    return high_or_low_interval_;
+  }
+
+  LiveInterval* GetHighInterval() const {
+    DCHECK(HasHighInterval());
+    return high_or_low_interval_;
+  }
+
+  bool IsHighInterval() const {
+    return GetParent()->is_high_interval_;
+  }
+
+  bool IsLowInterval() const {
+    return !IsHighInterval() && (GetParent()->high_or_low_interval_ != nullptr);
+  }
+
+  void SetLowInterval(LiveInterval* low) {
+    DCHECK(IsHighInterval());
+    high_or_low_interval_ = low;
+  }
+
+  void SetHighInterval(LiveInterval* high) {
+    DCHECK(IsLowInterval());
+    high_or_low_interval_ = high;
+  }
+
+  void AddHighInterval(bool is_temp = false) {
+    DCHECK_EQ(GetParent(), this);
+    DCHECK(!HasHighInterval());
+    DCHECK(!HasLowInterval());
+    high_or_low_interval_ = new (allocator_) LiveInterval(
+        allocator_, type_, defined_by_, false, kNoRegister, is_temp, false, true);
+    high_or_low_interval_->high_or_low_interval_ = this;
+    if (first_range_ != nullptr) {
+      high_or_low_interval_->first_range_ = first_range_->Dup(allocator_);
+      high_or_low_interval_->last_range_ = first_range_->GetLastRange();
+    }
+    if (first_use_ != nullptr) {
+      high_or_low_interval_->first_use_ = first_use_->Dup(allocator_);
+    }
+  }
+
  private:
   LiveInterval(ArenaAllocator* allocator,
                Primitive::Type type,
@@ -519,7 +588,8 @@
                bool is_fixed = false,
                int reg = kNoRegister,
                bool is_temp = false,
-               bool is_slow_path_safepoint = false)
+               bool is_slow_path_safepoint = false,
+               bool is_high_interval = false)
       : allocator_(allocator),
         first_range_(nullptr),
         last_range_(nullptr),
@@ -532,6 +602,8 @@
         is_fixed_(is_fixed),
         is_temp_(is_temp),
         is_slow_path_safepoint_(is_slow_path_safepoint),
+        is_high_interval_(is_high_interval),
+        high_or_low_interval_(nullptr),
         defined_by_(defined_by) {}
 
   ArenaAllocator* const allocator_;
@@ -568,6 +640,13 @@
   // Whether the interval is for a safepoint that calls on slow path.
   const bool is_slow_path_safepoint_;
 
+  // Whether this interval is a synthesized interval for register pair.
+  const bool is_high_interval_;
+
+  // If this interval needs a register pair, the high or low equivalent.
+  // `is_high_interval_` tells whether this holds the low or the high.
+  LiveInterval* high_or_low_interval_;
+
   // The instruction represented by this interval.
   HInstruction* const defined_by_;
 
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 87b3813..d912276 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -534,6 +534,13 @@
 
   // Load and Store. May clobber IP.
   virtual void LoadImmediate(Register rd, int32_t value, Condition cond = AL) = 0;
+  void LoadSImmediate(SRegister sd, float value, Condition cond = AL) {
+    if (!vmovs(sd, value, cond)) {
+      LoadImmediate(IP, bit_cast<int32_t, float>(value), cond);
+      vmovsr(sd, IP, cond);
+    }
+  }
+
   virtual void MarkExceptionHandler(Label* label) = 0;
   virtual void LoadFromOffset(LoadOperandType type,
                               Register reg,