Enable the register allocator on ARM.

- Also fixes a few bugs/wrong assumptions in code not exercised by x86.
- We need to differentiate between moves that connect siblings within a
  block and moves due to control flow resolution. Parallel moves are now
  tagged with a lifetime position so the register allocator can tell
  them apart (see the sketches below).
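
To make the tagging concrete, here is a minimal standalone sketch (not
part of this patch): ParallelMove and GetOrCreateMoveAt below are
simplified stand-ins for HParallelMove and the allocator's insertion
helpers, assuming only that each move records the lifetime position it
was created for.

    #include <cstddef>

    // Simplified stand-in for HParallelMove: every parallel move is
    // tagged with the lifetime position it was created for.
    struct ParallelMove {
      explicit ParallelMove(size_t position) : lifetime_position(position) {}
      size_t lifetime_position;
    };

    // Reuse an existing parallel move only if it was created for this
    // exact lifetime position. A move created for another purpose (an
    // input move, a sibling connection, or block resolution) carries a
    // different position and must not be merged into.
    ParallelMove* GetOrCreateMoveAt(ParallelMove* existing, size_t position) {
      if (existing == nullptr || existing->lifetime_position != position) {
        return new ParallelMove(position);  // caller inserts it at 'position'
      }
      return existing;
    }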

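The patch also reworks scratch register selection in the parallel move
resolver: callers now suggest an if_scratch register to fall back on. A
hedged sketch of the policy follows; the in_use array stands in for the
resolver's actual liveness checks, and the names are illustrative, not
the patch's API.

    // Pick a free register as scratch. If none is free, fall back to
    // the caller-suggested 'if_scratch' register and report that it
    // must be spilled around the move, so each backend controls which
    // register gets pushed/popped instead of an arbitrary fallback.
    int AllocateScratch(int blocked, int register_count, int if_scratch,
                        const bool* in_use, bool* spilled) {
      for (int reg = 0; reg < register_count; ++reg) {
        if (reg != blocked && !in_use[reg]) {
          *spilled = false;
          return reg;
        }
      }
      *spilled = true;
      return if_scratch;
    }
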
Change-Id: Idd05cf138a71c8f36f5531c473de613c0166fe38
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 82fa639..11b3a33 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -67,8 +67,7 @@
     // Note that this follows the current calling convention.
     return GetFrameSize()
         + kVRegSize  // Art method
-        + (parameter->GetIndex() - graph_->GetNumberOfVRegs() + graph_->GetNumberOfInVRegs())
-          * kVRegSize;
+        + parameter->GetIndex() * kVRegSize;
   }
 
   virtual void GenerateFrameEntry() = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d61df36..2aebf9a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -48,7 +48,8 @@
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
-      instruction_visitor_(graph, this) {}
+      instruction_visitor_(graph, this),
+      move_resolver_(graph->GetArena(), this) {}
 
 static bool* GetBlockedRegisterPairs(bool* blocked_registers) {
   return blocked_registers + kNumberOfAllocIds;
@@ -106,6 +107,9 @@
   // Reserve thread register.
   blocked_registers[TR] = true;
 
+  // Reserve temp register.
+  blocked_registers[IP] = true;
+
   // TODO: We currently don't use Quick's callee saved registers.
   blocked_registers[R5] = true;
   blocked_registers[R6] = true;
@@ -254,8 +258,8 @@
     if (source.IsRegister()) {
       __ str(source.AsArm().AsCoreRegister(), Address(SP, destination.GetStackIndex()));
     } else {
-      __ ldr(R0, Address(SP, source.GetStackIndex()));
-      __ str(R0, Address(SP, destination.GetStackIndex()));
+      __ ldr(IP, Address(SP, source.GetStackIndex()));
+      __ str(IP, Address(SP, destination.GetStackIndex()));
     }
   }
 }
@@ -295,8 +299,8 @@
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ ldr(calling_convention.GetRegisterAt(argument_index), Address(SP, source.GetStackIndex()));
-      __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
-      __ str(R0, Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
+      __ ldr(IP, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      __ str(IP, Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
     }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
@@ -313,15 +317,15 @@
       uint32_t argument_index = source.GetQuickParameterIndex();
       __ str(calling_convention.GetRegisterAt(argument_index),
              Address(SP, destination.GetStackIndex()));
-      __ ldr(R0,
+      __ ldr(IP,
              Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize) + GetFrameSize()));
-      __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+      __ str(IP, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ ldr(R0, Address(SP, source.GetStackIndex()));
-      __ str(R0, Address(SP, destination.GetStackIndex()));
-      __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
-      __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+      __ ldr(IP, Address(SP, source.GetStackIndex()));
+      __ str(IP, Address(SP, destination.GetStackIndex()));
+      __ ldr(IP, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      __ str(IP, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
     }
   }
 }
@@ -332,8 +336,8 @@
     if (location.IsRegister()) {
       __ LoadImmediate(location.AsArm().AsCoreRegister(), value);
     } else {
-      __ LoadImmediate(R0, value);
-      __ str(R0, Address(SP, location.GetStackIndex()));
+      __ LoadImmediate(IP, value);
+      __ str(IP, Address(SP, location.GetStackIndex()));
     }
   } else if (instruction->AsLongConstant() != nullptr) {
     int64_t value = instruction->AsLongConstant()->GetValue();
@@ -341,10 +345,10 @@
       __ LoadImmediate(location.AsArm().AsRegisterPairLow(), Low32Bits(value));
       __ LoadImmediate(location.AsArm().AsRegisterPairHigh(), High32Bits(value));
     } else {
-      __ LoadImmediate(R0, Low32Bits(value));
-      __ str(R0, Address(SP, location.GetStackIndex()));
-      __ LoadImmediate(R0, High32Bits(value));
-      __ str(R0, Address(SP, location.GetHighStackIndex(kArmWordSize)));
+      __ LoadImmediate(IP, Low32Bits(value));
+      __ str(IP, Address(SP, location.GetStackIndex()));
+      __ LoadImmediate(IP, High32Bits(value));
+      __ str(IP, Address(SP, location.GetHighStackIndex(kArmWordSize)));
     }
   } else if (instruction->AsLoadLocal() != nullptr) {
     uint32_t stack_slot = GetStackSlot(instruction->AsLoadLocal()->GetLocal());
@@ -493,7 +497,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitIntConstant(HIntConstant* constant) {
-  // Will be generated at use site.
+  codegen_->Move(constant, constant->GetLocations()->Out(), nullptr);
 }
 
 void LocationsBuilderARM::VisitLongConstant(HLongConstant* constant) {
@@ -564,7 +568,7 @@
 
 void LocationsBuilderARM::VisitInvokeStatic(HInvokeStatic* invoke) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(invoke);
-  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(ArmCoreLocation(R0));
 
   InvokeDexCallingConventionVisitor calling_convention_visitor;
   for (size_t i = 0; i < invoke->InputCount(); i++) {
@@ -811,15 +815,93 @@
 }
 
 void InstructionCodeGeneratorARM::VisitPhi(HPhi* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  LOG(FATAL) << "Unreachable";
 }
 
 void LocationsBuilderARM::VisitParallelMove(HParallelMove* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  LOG(FATAL) << "Unreachable";
 }
 
 void InstructionCodeGeneratorARM::VisitParallelMove(HParallelMove* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  codegen_->GetMoveResolver()->EmitNativeCode(instruction);
+}
+
+ArmAssembler* ParallelMoveResolverARM::GetAssembler() const {
+  return codegen_->GetAssembler();
+}
+
+void ParallelMoveResolverARM::EmitMove(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister()) {
+    if (destination.IsRegister()) {
+      __ Mov(destination.AsArm().AsCoreRegister(), source.AsArm().AsCoreRegister());
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ StoreToOffset(kStoreWord, source.AsArm().AsCoreRegister(),
+                       SP, destination.GetStackIndex());
+    }
+  } else if (source.IsStackSlot()) {
+    if (destination.IsRegister()) {
+      __ LoadFromOffset(kLoadWord, destination.AsArm().AsCoreRegister(),
+                        SP, source.GetStackIndex());
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
+      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+    }
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+void ParallelMoveResolverARM::Exchange(Register reg, int mem) {
+  __ Mov(IP, reg);
+  __ LoadFromOffset(kLoadWord, reg, SP, mem);
+  __ StoreToOffset(kStoreWord, IP, SP, mem);
+}
+
+void ParallelMoveResolverARM::Exchange(int mem1, int mem2) {
+  ScratchRegisterScope ensure_scratch(this, IP, R0, codegen_->GetNumberOfCoreRegisters());
+  int stack_offset = ensure_scratch.IsSpilled() ? kArmWordSize : 0;
+  __ LoadFromOffset(kLoadWord, static_cast<Register>(ensure_scratch.GetRegister()),
+                    SP, mem1 + stack_offset);
+  __ LoadFromOffset(kLoadWord, IP, SP, mem2 + stack_offset);
+  __ StoreToOffset(kStoreWord, static_cast<Register>(ensure_scratch.GetRegister()),
+                   SP, mem2 + stack_offset);
+  __ StoreToOffset(kStoreWord, IP, SP, mem1 + stack_offset);
+}
+
+void ParallelMoveResolverARM::EmitSwap(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister() && destination.IsRegister()) {
+    DCHECK_NE(source.AsArm().AsCoreRegister(), IP);
+    DCHECK_NE(destination.AsArm().AsCoreRegister(), IP);
+    __ Mov(IP, source.AsArm().AsCoreRegister());
+    __ Mov(source.AsArm().AsCoreRegister(), destination.AsArm().AsCoreRegister());
+    __ Mov(destination.AsArm().AsCoreRegister(), IP);
+  } else if (source.IsRegister() && destination.IsStackSlot()) {
+    Exchange(source.AsArm().AsCoreRegister(), destination.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsRegister()) {
+    Exchange(destination.AsArm().AsCoreRegister(), source.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsStackSlot()) {
+    Exchange(source.GetStackIndex(), destination.GetStackIndex());
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+void ParallelMoveResolverARM::SpillScratch(int reg) {
+  __ Push(static_cast<Register>(reg));
+}
+
+void ParallelMoveResolverARM::RestoreScratch(int reg) {
+  __ Pop(static_cast<Register>(reg));
 }
 
 }  // namespace arm
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index ac5ef21..712a24c 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "nodes.h"
+#include "parallel_move_resolver.h"
 #include "utils/arm/assembler_arm32.h"
 
 namespace art {
@@ -59,6 +60,27 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
 
+class ParallelMoveResolverARM : public ParallelMoveResolver {
+ public:
+  ParallelMoveResolverARM(ArenaAllocator* allocator, CodeGeneratorARM* codegen)
+      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+
+  virtual void EmitMove(size_t index) OVERRIDE;
+  virtual void EmitSwap(size_t index) OVERRIDE;
+  virtual void SpillScratch(int reg) OVERRIDE;
+  virtual void RestoreScratch(int reg) OVERRIDE;
+
+  ArmAssembler* GetAssembler() const;
+
+ private:
+  void Exchange(Register reg, int mem);
+  void Exchange(int mem1, int mem2);
+
+  CodeGeneratorARM* const codegen_;
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM);
+};
+
 class LocationsBuilderARM : public HGraphVisitor {
  public:
   explicit LocationsBuilderARM(HGraph* graph, CodeGeneratorARM* codegen)
@@ -145,6 +167,10 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
 
+  ParallelMoveResolverARM* GetMoveResolver() {
+    return &move_resolver_;
+  }
+
  private:
   // Helper method to move a 32bits value between two locations.
   void Move32(Location destination, Location source);
@@ -153,6 +179,7 @@
 
   LocationsBuilderARM location_builder_;
   InstructionCodeGeneratorARM instruction_visitor_;
+  ParallelMoveResolverARM move_resolver_;
   Arm32Assembler assembler_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c7dca86..f24af5b 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -847,7 +847,7 @@
 
 void ParallelMoveResolverX86::MoveMemoryToMemory(int dst, int src) {
   ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
   int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, src + stack_offset));
   __ movl(Address(ESP, dst + stack_offset), static_cast<Register>(ensure_scratch.GetRegister()));
@@ -879,7 +879,10 @@
 }
 
 void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
-  ScratchRegisterScope ensure_scratch(this, reg, codegen_->GetNumberOfCoreRegisters());
+  Register suggested_scratch = reg == EAX ? EBX : EAX;
+  ScratchRegisterScope ensure_scratch(
+      this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
   int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset));
   __ movl(Address(ESP, mem + stack_offset), reg);
@@ -889,9 +892,12 @@
 
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch1(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+
+  Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX;
   ScratchRegisterScope ensure_scratch2(
-      this, ensure_scratch1.GetRegister(), codegen_->GetNumberOfCoreRegisters());
+      this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
   int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
   stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index 4a1b6ce..cadd3c5 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -163,7 +163,11 @@
   return false;
 }
 
-int ParallelMoveResolver::AllocateScratchRegister(int blocked, int register_count, bool* spilled) {
+int ParallelMoveResolver::AllocateScratchRegister(int blocked,
+                                                  int register_count,
+                                                  int if_scratch,
+                                                  bool* spilled) {
+  DCHECK_NE(blocked, if_scratch);
   int scratch = -1;
   for (int reg = 0; reg < register_count; ++reg) {
     if ((blocked != reg) &&
@@ -175,11 +179,7 @@
 
   if (scratch == -1) {
     *spilled = true;
-    for (int reg = 0; reg < register_count; ++reg) {
-      if (blocked != reg) {
-        scratch = reg;
-      }
-    }
+    scratch = if_scratch;
   } else {
     *spilled = false;
   }
@@ -189,11 +189,11 @@
 
 
 ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
-    ParallelMoveResolver* resolver, int blocked, int number_of_registers)
+    ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
       reg_(kNoRegister),
       spilled_(false) {
-  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers, &spilled_);
+  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers, if_scratch, &spilled_);
 
   if (spilled_) {
     resolver->SpillScratch(reg_);
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index e1189d8..fcc1de6 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -42,7 +42,10 @@
  protected:
   class ScratchRegisterScope : public ValueObject {
    public:
-    ScratchRegisterScope(ParallelMoveResolver* resolver, int blocked, int number_of_registers);
+    ScratchRegisterScope(ParallelMoveResolver* resolver,
+                         int blocked,
+                         int if_scratch,
+                         int number_of_registers);
     ~ScratchRegisterScope();
 
     int GetRegister() const { return reg_; }
@@ -55,7 +58,7 @@
   };
 
   bool IsScratchLocation(Location loc);
-  int AllocateScratchRegister(int blocked, int register_count, bool* spilled);
+  int AllocateScratchRegister(int blocked, int register_count, int if_scratch, bool* spilled);
 
   // Emit a move.
   virtual void EmitMove(size_t index) = 0;
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index c2a4769..348e9d4 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -651,7 +651,9 @@
     // Move must happen after the instruction.
     DCHECK(!at->IsControlFlow());
     move = at->GetNext()->AsParallelMove();
-    if (move == nullptr || IsInputMove(move)) {
+    // This is a parallel move for connecting siblings in the same block. We
+    // need to differentiate it from moves for connecting blocks and input moves.
+    if (move == nullptr || move->GetLifetimePosition() != position) {
       move = new (allocator_) HParallelMove(allocator_);
       move->SetLifetimePosition(position);
       at->GetBlock()->InsertInstructionBefore(move, at->GetNext());
@@ -660,7 +662,9 @@
     // Move must happen before the instruction.
     HInstruction* previous = at->GetPrevious();
     if (previous != nullptr && previous->AsParallelMove() != nullptr) {
-      if (IsInputMove(previous)) {
+      // This is a parallel move for connecting siblings in the same block. We
+      // need to differentiate it from moves for connecting blocks and input moves.
+      if (previous->GetLifetimePosition() != position) {
         previous = previous->GetPrevious();
       }
     }
@@ -684,8 +688,12 @@
   HInstruction* last = block->GetLastInstruction();
   HInstruction* previous = last->GetPrevious();
   HParallelMove* move;
-  if (previous == nullptr || previous->AsParallelMove() == nullptr) {
+  // This is a parallel move for connecting blocks. We need to differentiate
+  // it from moves for connecting siblings in the same block and output moves.
+  if (previous == nullptr || previous->AsParallelMove() == nullptr
+      || previous->AsParallelMove()->GetLifetimePosition() != block->GetLifetimeEnd()) {
     move = new (allocator_) HParallelMove(allocator_);
+    move->SetLifetimePosition(block->GetLifetimeEnd());
     block->InsertInstructionBefore(move, last);
   } else {
     move = previous->AsParallelMove();
@@ -700,7 +708,9 @@
 
   HInstruction* first = block->GetFirstInstruction();
   HParallelMove* move = first->AsParallelMove();
-  if (move == nullptr || IsInputMove(move)) {
+  // This is a parallel move for connecting blocks. We need to differentiate
+  // it from moves for connecting siblings in the same block and input moves.
+  if (move == nullptr || move->GetLifetimePosition() != block->GetLifetimeStart()) {
     move = new (allocator_) HParallelMove(allocator_);
     move->SetLifetimePosition(block->GetLifetimeStart());
     block->InsertInstructionBefore(move, first);
@@ -718,9 +728,14 @@
     return;
   }
 
+  size_t position = instruction->GetLifetimePosition() + 1;
   HParallelMove* move = instruction->GetNext()->AsParallelMove();
-  if (move == nullptr || IsInputMove(move)) {
+  // This is a parallel move for moving the output of an instruction. We need
+  // to differentiate it from input moves, moves for connecting siblings in
+  // the same block, and moves for connecting blocks.
+  if (move == nullptr || move->GetLifetimePosition() != position) {
     move = new (allocator_) HParallelMove(allocator_);
+    move->SetLifetimePosition(position);
     instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext());
   }
   move->AddMove(new (allocator_) MoveOperands(source, destination));
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 1b5585f..8b7c4f1 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -65,7 +65,7 @@
 
   static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set);
   static bool Supports(InstructionSet instruction_set) {
-    return instruction_set == kX86;
+    return instruction_set == kX86 || instruction_set == kArm;
   }
 
   size_t GetNumberOfSpillSlots() const {