Opt compiler: Implement parallel move resolver without using swap.

The algorithm of ParallelMoveResolverNoSwap() is almost the same as
ParallelMoveResolverWithSwap(); the difference is how circular
dependencies are resolved. NoSwap() uses an additional scratch register
instead of a swap. For example, (0->1) (1->2) (2->0) will be performed
as (2->scratch) (1->2) (0->1) (scratch->0).

On architectures without swap support, NoSwap() reduces the number of
emitted moves for a circular dependency of N moves from 3x(N-1)
(N-1 swaps, each emulated with three moves) to N+1.

In addition, the NoSwap() algorithm does not depend on architecture
register layout information, so it supports register pairs on arm32 and
X/W, D/S registers on arm64 without additional modification.

Change-Id: Idf56bd5469bb78c0e339e43ab16387428a082318
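
To make the N+1 claim above concrete, here is a minimal standalone sketch of
the scratch-based cycle resolution (the Move struct and the ResolveCycle()
helper are invented for this illustration and are not part of the ART sources;
the input is assumed to be a single cycle listed in chain order):

#include <cstdio>
#include <vector>

// Each move copies the value held at `src` into `dst`.
struct Move { int src; int dst; };

// Resolve a single cycle such as (0->1) (1->2) (2->0) by parking one source
// in a scratch slot first: (2->scratch) (1->2) (0->1) (scratch->0),
// i.e. N+1 emitted moves for a cycle of N moves.
std::vector<Move> ResolveCycle(const std::vector<Move>& cycle, int scratch) {
  std::vector<Move> emitted;
  if (cycle.empty()) return emitted;
  // Park the source of the last move in the scratch location.
  emitted.push_back({cycle.back().src, scratch});
  // The remaining moves are now unblocked; emit them in reverse list order.
  for (size_t i = cycle.size() - 1; i > 0; --i) {
    emitted.push_back(cycle[i - 1]);
  }
  // Finally move the parked value into its real destination.
  emitted.push_back({scratch, cycle.back().dst});
  return emitted;
}

int main() {
  const int kScratch = 100;
  std::vector<Move> cycle = {{0, 1}, {1, 2}, {2, 0}};
  for (const Move& m : ResolveCycle(cycle, kScratch)) {
    std::printf("(%d -> %d) ", m.src, m.dst);
  }
  std::printf("\n");  // Prints: (2 -> 100) (1 -> 2) (0 -> 1) (100 -> 0)
  return 0;
}
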
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 06f425e..6009036 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -96,10 +96,10 @@
DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
};
-class ParallelMoveResolverARM : public ParallelMoveResolver {
+class ParallelMoveResolverARM : public ParallelMoveResolverWithSwap {
public:
ParallelMoveResolverARM(ArenaAllocator* allocator, CodeGeneratorARM* codegen)
- : ParallelMoveResolver(allocator), codegen_(codegen) {}
+ : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
void EmitMove(size_t index) OVERRIDE;
void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index efc41e7..7390005 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -425,24 +425,59 @@
CodeGenerator::Finalize(allocator);
}
+void ParallelMoveResolverARM64::PrepareForEmitNativeCode() {
+ // Note: There are 6 kinds of moves:
+ // 1. constant -> GPR/FPR (non-cycle)
+ // 2. constant -> stack (non-cycle)
+ // 3. GPR/FPR -> GPR/FPR
+ // 4. GPR/FPR -> stack
+ // 5. stack -> GPR/FPR
+ // 6. stack -> stack (non-cycle)
+ // Cases 1, 2 and 6 should never be included in a dependency cycle on ARM64. For cases 3, 4 and 5,
+ // VIXL uses at most 1 GPR. VIXL has 2 GPR and 1 FPR temps, and there should be no intersecting
+ // cycles on ARM64, so we always have 1 GPR and 1 FPR VIXL temp available to resolve the
+ // dependency.
+ vixl_temps_.Open(GetVIXLAssembler());
+}
+
+void ParallelMoveResolverARM64::FinishEmitNativeCode() {
+ vixl_temps_.Close();
+}
+
+Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind kind) {
+ DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister ||
+ kind == Location::kStackSlot || kind == Location::kDoubleStackSlot);
+ kind = (kind == Location::kFpuRegister) ? Location::kFpuRegister : Location::kRegister;
+ Location scratch = GetScratchLocation(kind);
+ if (!scratch.Equals(Location::NoLocation())) {
+ return scratch;
+ }
+ // Allocate from VIXL temp registers.
+ if (kind == Location::kRegister) {
+ scratch = LocationFrom(vixl_temps_.AcquireX());
+ } else {
+ DCHECK(kind == Location::kFpuRegister);
+ scratch = LocationFrom(vixl_temps_.AcquireD());
+ }
+ AddScratchLocation(scratch);
+ return scratch;
+}
+
+void ParallelMoveResolverARM64::FreeScratchLocation(Location loc) {
+ if (loc.IsRegister()) {
+ vixl_temps_.Release(XRegisterFrom(loc));
+ } else {
+ DCHECK(loc.IsFpuRegister());
+ vixl_temps_.Release(DRegisterFrom(loc));
+ }
+ RemoveScratchLocation(loc);
+}
+
void ParallelMoveResolverARM64::EmitMove(size_t index) {
MoveOperands* move = moves_.Get(index);
codegen_->MoveLocation(move->GetDestination(), move->GetSource());
}
-void ParallelMoveResolverARM64::EmitSwap(size_t index) {
- MoveOperands* move = moves_.Get(index);
- codegen_->SwapLocations(move->GetDestination(), move->GetSource());
-}
-
-void ParallelMoveResolverARM64::RestoreScratch(int reg) {
- __ Pop(Register(VIXLRegCodeFromART(reg), kXRegSize));
-}
-
-void ParallelMoveResolverARM64::SpillScratch(int reg) {
- __ Push(Register(VIXLRegCodeFromART(reg), kXRegSize));
-}
-
void CodeGeneratorARM64::GenerateFrameEntry() {
__ Bind(&frame_entry_label_);
@@ -726,10 +761,10 @@
if (destination.IsRegister()) {
__ Mov(Register(dst), RegisterFrom(source, type));
} else {
+ DCHECK(destination.IsFpuRegister());
__ Fmov(FPRegister(dst), FPRegisterFrom(source, type));
}
}
-
} else { // The destination is not a register. It must be a stack slot.
DCHECK(destination.IsStackSlot() || destination.IsDoubleStackSlot());
if (source.IsRegister() || source.IsFpuRegister()) {
@@ -772,67 +807,6 @@
}
}
-void CodeGeneratorARM64::SwapLocations(Location loc1, Location loc2) {
- DCHECK(!loc1.IsConstant());
- DCHECK(!loc2.IsConstant());
-
- if (loc1.Equals(loc2)) {
- return;
- }
-
- UseScratchRegisterScope temps(GetAssembler()->vixl_masm_);
-
- bool is_slot1 = loc1.IsStackSlot() || loc1.IsDoubleStackSlot();
- bool is_slot2 = loc2.IsStackSlot() || loc2.IsDoubleStackSlot();
- bool is_fp_reg1 = loc1.IsFpuRegister();
- bool is_fp_reg2 = loc2.IsFpuRegister();
-
- if (loc2.IsRegister() && loc1.IsRegister()) {
- Register r1 = XRegisterFrom(loc1);
- Register r2 = XRegisterFrom(loc2);
- Register tmp = temps.AcquireSameSizeAs(r1);
- __ Mov(tmp, r2);
- __ Mov(r2, r1);
- __ Mov(r1, tmp);
- } else if (is_fp_reg2 && is_fp_reg1) {
- FPRegister r1 = DRegisterFrom(loc1);
- FPRegister r2 = DRegisterFrom(loc2);
- FPRegister tmp = temps.AcquireSameSizeAs(r1);
- __ Fmov(tmp, r2);
- __ Fmov(r2, r1);
- __ Fmov(r1, tmp);
- } else if (is_slot1 != is_slot2) {
- MemOperand mem = StackOperandFrom(is_slot1 ? loc1 : loc2);
- Location reg_loc = is_slot1 ? loc2 : loc1;
- CPURegister reg, tmp;
- if (reg_loc.IsFpuRegister()) {
- reg = DRegisterFrom(reg_loc);
- tmp = temps.AcquireD();
- } else {
- reg = XRegisterFrom(reg_loc);
- tmp = temps.AcquireX();
- }
- __ Ldr(tmp, mem);
- __ Str(reg, mem);
- if (reg_loc.IsFpuRegister()) {
- __ Fmov(FPRegister(reg), FPRegister(tmp));
- } else {
- __ Mov(Register(reg), Register(tmp));
- }
- } else if (is_slot1 && is_slot2) {
- MemOperand mem1 = StackOperandFrom(loc1);
- MemOperand mem2 = StackOperandFrom(loc2);
- Register tmp1 = loc1.IsStackSlot() ? temps.AcquireW() : temps.AcquireX();
- Register tmp2 = temps.AcquireSameSizeAs(tmp1);
- __ Ldr(tmp1, mem1);
- __ Ldr(tmp2, mem2);
- __ Str(tmp1, mem2);
- __ Str(tmp2, mem1);
- } else {
- LOG(FATAL) << "Unimplemented";
- }
-}
-
void CodeGeneratorARM64::Load(Primitive::Type type,
CPURegister dst,
const MemOperand& src) {
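
A condensed view of the new scratch lifecycle on ARM64 (only calls that appear
in the code above; shown here as a comment summary, not additional code):

// PrepareForEmitNativeCode():   vixl_temps_.Open(GetVIXLAssembler());
// AllocateScratchLocationFor(): LocationFrom(vixl_temps_.AcquireX())    // or AcquireD() for FP
// FreeScratchLocation():        vixl_temps_.Release(XRegisterFrom(loc)) // or DRegisterFrom(loc)
// FinishEmitNativeCode():       vixl_temps_.Close();
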
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 07c6dd0..c862683 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -194,15 +194,17 @@
DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM64);
};
-class ParallelMoveResolverARM64 : public ParallelMoveResolver {
+class ParallelMoveResolverARM64 : public ParallelMoveResolverNoSwap {
public:
ParallelMoveResolverARM64(ArenaAllocator* allocator, CodeGeneratorARM64* codegen)
- : ParallelMoveResolver(allocator), codegen_(codegen) {}
+ : ParallelMoveResolverNoSwap(allocator), codegen_(codegen), vixl_temps_() {}
+ protected:
+ void PrepareForEmitNativeCode() OVERRIDE;
+ void FinishEmitNativeCode() OVERRIDE;
+ Location AllocateScratchLocationFor(Location::Kind kind) OVERRIDE;
+ void FreeScratchLocation(Location loc) OVERRIDE;
void EmitMove(size_t index) OVERRIDE;
- void EmitSwap(size_t index) OVERRIDE;
- void RestoreScratch(int reg) OVERRIDE;
- void SpillScratch(int reg) OVERRIDE;
private:
Arm64Assembler* GetAssembler() const;
@@ -211,6 +213,7 @@
}
CodeGeneratorARM64* const codegen_;
+ vixl::UseScratchRegisterScope vixl_temps_;
DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM64);
};
@@ -318,7 +321,6 @@
// locations, and is used for optimisation and debugging.
void MoveLocation(Location destination, Location source,
Primitive::Type type = Primitive::kPrimVoid);
- void SwapLocations(Location loc_1, Location loc_2);
void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src);
void Store(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
void LoadCurrentMethod(vixl::Register current_method);
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 368ae0f..07476c6 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -93,10 +93,10 @@
DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
};
-class ParallelMoveResolverX86 : public ParallelMoveResolver {
+class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap {
public:
ParallelMoveResolverX86(ArenaAllocator* allocator, CodeGeneratorX86* codegen)
- : ParallelMoveResolver(allocator), codegen_(codegen) {}
+ : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
void EmitMove(size_t index) OVERRIDE;
void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index b4876ef..6cdc822 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -102,10 +102,10 @@
DISALLOW_COPY_AND_ASSIGN(SlowPathCodeX86_64);
};
-class ParallelMoveResolverX86_64 : public ParallelMoveResolver {
+class ParallelMoveResolverX86_64 : public ParallelMoveResolverWithSwap {
public:
ParallelMoveResolverX86_64(ArenaAllocator* allocator, CodeGeneratorX86_64* codegen)
- : ParallelMoveResolver(allocator), codegen_(codegen) {}
+ : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
void EmitMove(size_t index) OVERRIDE;
void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index de876be..c3a9915 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -285,17 +285,26 @@
bool Contains(Location other) const {
if (Equals(other)) {
return true;
- } else if (IsFpuRegisterPair() && other.IsFpuRegister()) {
- return other.reg() == low() || other.reg() == high();
- } else if (IsRegisterPair() && other.IsRegister()) {
- return other.reg() == low() || other.reg() == high();
- } else if (IsDoubleStackSlot() && other.IsStackSlot()) {
- return (GetStackIndex() == other.GetStackIndex())
- || (GetStackIndex() + 4 == other.GetStackIndex());
+ } else if (IsPair() || IsDoubleStackSlot()) {
+ return ToLow().Equals(other) || ToHigh().Equals(other);
}
return false;
}
+ bool OverlapsWith(Location other) const {
+ // Only check the overlapping cases that can happen with our register allocation algorithm.
+ bool overlap = Contains(other) || other.Contains(*this);
+ if (kIsDebugBuild && !overlap) {
+ // Note: These are also overlapping cases, but we are not able to handle them in
+ // ParallelMoveResolverWithSwap. Make sure that we do not encounter such cases with our compiler.
+ if ((IsPair() && other.IsPair()) || (IsDoubleStackSlot() && other.IsDoubleStackSlot())) {
+ DCHECK(!Contains(other.ToLow()));
+ DCHECK(!Contains(other.ToHigh()));
+ }
+ }
+ return overlap;
+ }
+
const char* DebugString() const {
switch (GetKind()) {
case kInvalid: return "I";
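
A quick sketch of the intended OverlapsWith() behaviour for pairs (the test
name below is illustrative and not part of this patch; the expected values
follow from the Contains() logic above):

TEST(LocationTest, OverlapsWithPairs) {
  Location r0 = Location::RegisterLocation(0);
  Location r2 = Location::RegisterLocation(2);
  Location pair01 = Location::RegisterPairLocation(0, 1);
  // A pair overlaps with either of its halves, in both directions.
  EXPECT_TRUE(pair01.OverlapsWith(r0));
  EXPECT_TRUE(r0.OverlapsWith(pair01));
  // It does not overlap with an unrelated register.
  EXPECT_FALSE(pair01.OverlapsWith(r2));
  // Identical locations trivially overlap (Equals() implies Contains()).
  EXPECT_TRUE(r2.OverlapsWith(r2));
}
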
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 649038b..203bda9 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2105,7 +2105,7 @@
friend class HGraph;
ART_FRIEND_TEST(GraphTest, InsertInstructionBefore);
- ART_FRIEND_TEST(ParallelMoveTest, ConstantLast);
+ ART_FRIEND_TYPED_TEST(ParallelMoveTest, ConstantLast);
DISALLOW_COPY_AND_ASSIGN(HIntConstant);
};
@@ -3502,7 +3502,7 @@
// True if this blocks a move from the given location.
bool Blocks(Location loc) const {
- return !IsEliminated() && (source_.Contains(loc) || loc.Contains(source_));
+ return !IsEliminated() && source_.OverlapsWith(loc);
}
// A move is redundant if it's been eliminated, if its source and
@@ -3571,8 +3571,8 @@
}
}
for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
- DCHECK(!destination.Equals(moves_.Get(i).GetDestination()))
- << "Same destination for two moves in a parallel move.";
+ DCHECK(!destination.OverlapsWith(moves_.Get(i).GetDestination()))
+ << "Overlapping destinations for two moves in a parallel move.";
}
}
moves_.Add(MoveOperands(source, destination, type, instruction));
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index ad92ca5..54ea6f1 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -17,11 +17,23 @@
#include "parallel_move_resolver.h"
#include "nodes.h"
-#include "locations.h"
namespace art {
-void ParallelMoveResolver::EmitNativeCode(HParallelMove* parallel_move) {
+void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
+ // Perform a linear sweep of the moves to add them to the initial list of
+ // moves to perform, ignoring any move that is redundant (the source is
+ // the same as the destination, the destination is ignored and
+ // unallocated, or the move was already eliminated).
+ for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
+ MoveOperands* move = parallel_move->MoveOperandsAt(i);
+ if (!move->IsRedundant()) {
+ moves_.Add(move);
+ }
+ }
+}
+
+void ParallelMoveResolverWithSwap::EmitNativeCode(HParallelMove* parallel_move) {
DCHECK(moves_.IsEmpty());
// Build up a worklist of moves.
BuildInitialMoveList(parallel_move);
@@ -50,20 +62,6 @@
moves_.Reset();
}
-
-void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
- // Perform a linear sweep of the moves to add them to the initial list of
- // moves to perform, ignoring any move that is redundant (the source is
- // the same as the destination, the destination is ignored and
- // unallocated, or the move was already eliminated).
- for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
- MoveOperands* move = parallel_move->MoveOperandsAt(i);
- if (!move->IsRedundant()) {
- moves_.Add(move);
- }
- }
-}
-
Location LowOf(Location location) {
if (location.IsRegisterPair()) {
return Location::RegisterLocation(location.low());
@@ -103,7 +101,7 @@
}
}
-MoveOperands* ParallelMoveResolver::PerformMove(size_t index) {
+MoveOperands* ParallelMoveResolverWithSwap::PerformMove(size_t index) {
// Each call to this function performs a move and deletes it from the move
// graph. We first recursively perform any move blocking this one. We
// mark a move as "pending" on entry to PerformMove in order to detect
@@ -229,7 +227,7 @@
}
}
-bool ParallelMoveResolver::IsScratchLocation(Location loc) {
+bool ParallelMoveResolverWithSwap::IsScratchLocation(Location loc) {
for (size_t i = 0; i < moves_.Size(); ++i) {
if (moves_.Get(i)->Blocks(loc)) {
return false;
@@ -245,10 +243,10 @@
return false;
}
-int ParallelMoveResolver::AllocateScratchRegister(int blocked,
- int register_count,
- int if_scratch,
- bool* spilled) {
+int ParallelMoveResolverWithSwap::AllocateScratchRegister(int blocked,
+ int register_count,
+ int if_scratch,
+ bool* spilled) {
DCHECK_NE(blocked, if_scratch);
int scratch = -1;
for (int reg = 0; reg < register_count; ++reg) {
@@ -269,8 +267,8 @@
}
-ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
- ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
+ParallelMoveResolverWithSwap::ScratchRegisterScope::ScratchRegisterScope(
+ ParallelMoveResolverWithSwap* resolver, int blocked, int if_scratch, int number_of_registers)
: resolver_(resolver),
reg_(kNoRegister),
spilled_(false) {
@@ -282,10 +280,271 @@
}
-ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
+ParallelMoveResolverWithSwap::ScratchRegisterScope::~ScratchRegisterScope() {
if (spilled_) {
resolver_->RestoreScratch(reg_);
}
}
+void ParallelMoveResolverNoSwap::EmitNativeCode(HParallelMove* parallel_move) {
+ DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+ DCHECK(moves_.IsEmpty());
+ DCHECK(scratches_.IsEmpty());
+
+ // Backend dependent initialization.
+ PrepareForEmitNativeCode();
+
+ // Build up a worklist of moves.
+ BuildInitialMoveList(parallel_move);
+
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ const MoveOperands& move = *moves_.Get(i);
+ // Skip constants to perform them last. They don't block other moves and
+ // skipping such moves with register destinations keeps those registers
+ // free for the whole algorithm.
+ if (!move.IsEliminated() && !move.GetSource().IsConstant()) {
+ PerformMove(i);
+ }
+ }
+
+ // Perform the moves with constant sources and register destinations with UpdateMoveSource()
+ // to reduce the number of literal loads. Stack destinations are skipped since we won't benefit
+ // from changing the constant sources to stack locations.
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ MoveOperands* move = moves_.Get(i);
+ Location destination = move->GetDestination();
+ if (!move->IsEliminated() && !destination.IsStackSlot() && !destination.IsDoubleStackSlot()) {
+ Location source = move->GetSource();
+ EmitMove(i);
+ move->Eliminate();
+ // This may introduce an additional instruction dependency, but reduces the number
+ // of moves and possible literal loads. For example,
+ // Original moves:
+ // 1234.5678 -> D0
+ // 1234.5678 -> D1
+ // Updated moves:
+ // 1234.5678 -> D0
+ // D0 -> D1
+ UpdateMoveSource(source, destination);
+ }
+ }
+
+ // Perform the rest of the moves.
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ MoveOperands* move = moves_.Get(i);
+ if (!move->IsEliminated()) {
+ EmitMove(i);
+ move->Eliminate();
+ }
+ }
+
+ // All pending moves that we have added to resolve cycles should have been performed.
+ DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+
+ // Backend dependent cleanup.
+ FinishEmitNativeCode();
+
+ moves_.Reset();
+ scratches_.Reset();
+}
+
+Location ParallelMoveResolverNoSwap::GetScratchLocation(Location::Kind kind) {
+ for (size_t i = 0; i < scratches_.Size(); ++i) {
+ Location loc = scratches_.Get(i);
+ if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+ return loc;
+ }
+ }
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ Location loc = moves_.Get(i)->GetDestination();
+ if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+ return loc;
+ }
+ }
+ return Location::NoLocation();
+}
+
+void ParallelMoveResolverNoSwap::AddScratchLocation(Location loc) {
+ if (kIsDebugBuild) {
+ for (size_t i = 0; i < scratches_.Size(); ++i) {
+ DCHECK(!loc.Equals(scratches_.Get(i)));
+ }
+ }
+ scratches_.Add(loc);
+}
+
+void ParallelMoveResolverNoSwap::RemoveScratchLocation(Location loc) {
+ DCHECK(!IsBlockedByMoves(loc));
+ for (size_t i = 0; i < scratches_.Size(); ++i) {
+ if (loc.Equals(scratches_.Get(i))) {
+ scratches_.DeleteAt(i);
+ break;
+ }
+ }
+}
+
+void ParallelMoveResolverNoSwap::PerformMove(size_t index) {
+ // Each call to this function performs a move and deletes it from the move
+ // graph. We first recursively perform any move blocking this one. We mark
+ // a move as "pending" on entry to PerformMove in order to detect cycles
+ // in the move graph. We use a scratch location to resolve cycles, which may
+ // add additional pending moves. After a move has been performed, we update
+ // the source operands of the remaining moves to reduce dependencies in the graph.
+
+ MoveOperands* move = moves_.Get(index);
+ DCHECK(!move->IsPending());
+ DCHECK(!move->IsEliminated());
+ if (move->IsRedundant()) {
+ // Previous operations on the list of moves have caused this particular move
+ // to become a no-op, so we can safely eliminate it. Consider for example
+ // (0 -> 1) (1 -> 0) (1 -> 2). There is a cycle (0 -> 1) (1 -> 0), that we will
+ // resolve as (1 -> scratch) (0 -> 1) (scratch -> 0). If, by chance, '2' is
+ // used as the scratch location, the move (1 -> 2) will occur while resolving
+ // the cycle. When that move is emitted, the code will update moves with a '1'
+ // as their source to use '2' instead (see `UpdateMoveSource()`). In our example
+ // the initial move (1 -> 2) would then become the no-op (2 -> 2) that can be
+ // eliminated here.
+ move->Eliminate();
+ return;
+ }
+
+ // Clear this move's destination to indicate a pending move. The actual
+ // destination is saved in a stack-allocated local. Recursion may allow
+ // multiple moves to be pending.
+ DCHECK(!move->GetSource().IsInvalid());
+ Location destination = move->MarkPending();
+
+ // Perform a depth-first traversal of the move graph to resolve
+ // dependencies. Any unperformed, unpending move with a source the same
+ // as this one's destination blocks this one so recursively perform all
+ // such moves.
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ const MoveOperands& other_move = *moves_.Get(i);
+ if (other_move.Blocks(destination) && !other_move.IsPending()) {
+ PerformMove(i);
+ }
+ }
+
+ // We are about to resolve this move and don't need it marked as
+ // pending, so restore its destination.
+ move->ClearPending(destination);
+
+ // No one else should write to the move destination when it is pending.
+ DCHECK(!move->IsRedundant());
+
+ Location source = move->GetSource();
+ // The move may be blocked on several pending moves, in case we have a cycle.
+ if (IsBlockedByMoves(destination)) {
+ // For a cycle like: (A -> B) (B -> C) (C -> A), we change it to following
+ // sequence:
+ // (C -> scratch) # Emit right now.
+ // (A -> B) (B -> C) # Unblocked.
+ // (scratch -> A) # Add to pending_moves_, blocked by (A -> B).
+ Location::Kind kind = source.GetKind();
+ DCHECK_NE(kind, Location::kConstant);
+ Location scratch = AllocateScratchLocationFor(kind);
+ // We only care about the move size.
+ Primitive::Type type = move->Is64BitMove() ? Primitive::kPrimLong : Primitive::kPrimInt;
+ // Perform (C -> scratch)
+ move->SetDestination(scratch);
+ EmitMove(index);
+ move->Eliminate();
+ UpdateMoveSource(source, scratch);
+ // Add (scratch -> A).
+ AddPendingMove(scratch, destination, type);
+ } else {
+ // This move is not blocked.
+ EmitMove(index);
+ move->Eliminate();
+ UpdateMoveSource(source, destination);
+ }
+
+ // Moves in the pending list should not block any other moves. But performing
+ // unblocked moves in the pending list can free scratch registers, so we do this
+ // as early as possible.
+ MoveOperands* pending_move;
+ while ((pending_move = GetUnblockedPendingMove(source)) != nullptr) {
+ Location pending_source = pending_move->GetSource();
+ Location pending_destination = pending_move->GetDestination();
+ // We do not depend on the pending move index, so just delete the move instead
+ // of eliminating it to keep the pending list cleaner.
+ DeletePendingMove(pending_move);
+ move->SetSource(pending_source);
+ move->SetDestination(pending_destination);
+ EmitMove(index);
+ move->Eliminate();
+ UpdateMoveSource(pending_source, pending_destination);
+ // Free any unblocked locations in the scratch location list.
+ for (size_t i = 0; i < scratches_.Size(); ++i) {
+ Location scratch = scratches_.Get(i);
+ // Only a scratch location overlapping with the performed move's source can be unblocked.
+ if (scratch.OverlapsWith(pending_source) && !IsBlockedByMoves(scratch)) {
+ FreeScratchLocation(pending_source);
+ }
+ }
+ }
+}
+
+void ParallelMoveResolverNoSwap::UpdateMoveSource(Location from, Location to) {
+ // This function is used to reduce the dependencies in the graph after
+ // (from -> to) has been performed. Since we ensure there is no move with the same
+ // destination, (to -> X) cannot be blocked while (from -> X) might still be
+ // blocked. Consider for example the moves (0 -> 1) (1 -> 2) (1 -> 3). After
+ // (1 -> 2) has been performed, the moves left are (0 -> 1) and (1 -> 3). There is
+ // a dependency between the two. If we update the source location from 1 to 2, we
+ // will get (0 -> 1) and (2 -> 3). There is no dependency between the two.
+ //
+ // This is not something we must do, but we can use fewer scratch locations with
+ // this trick. For example, we can avoid using additional scratch locations for
+ // moves (0 -> 1), (1 -> 2), (1 -> 0).
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ MoveOperands* move = moves_.Get(i);
+ if (move->GetSource().Equals(from)) {
+ move->SetSource(to);
+ }
+ }
+}
+
+void ParallelMoveResolverNoSwap::AddPendingMove(Location source,
+ Location destination, Primitive::Type type) {
+ pending_moves_.Add(new (allocator_) MoveOperands(source, destination, type, nullptr));
+}
+
+void ParallelMoveResolverNoSwap::DeletePendingMove(MoveOperands* move) {
+ pending_moves_.Delete(move);
+}
+
+MoveOperands* ParallelMoveResolverNoSwap::GetUnblockedPendingMove(Location loc) {
+ for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+ MoveOperands* move = pending_moves_.Get(i);
+ Location destination = move->GetDestination();
+ // Only a move whose destination overlaps with the input loc can be unblocked.
+ if (destination.OverlapsWith(loc) && !IsBlockedByMoves(destination)) {
+ return move;
+ }
+ }
+ return nullptr;
+}
+
+bool ParallelMoveResolverNoSwap::IsBlockedByMoves(Location loc) {
+ for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+ if (pending_moves_.Get(i)->Blocks(loc)) {
+ return true;
+ }
+ }
+ for (size_t i = 0; i < moves_.Size(); ++i) {
+ if (moves_.Get(i)->Blocks(loc)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// So far it is only used for debugging purposes to make sure all pending moves
+// have been performed.
+size_t ParallelMoveResolverNoSwap::GetNumberOfPendingMoves() {
+ return pending_moves_.Size();
+}
+
} // namespace art
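
To make the role of UpdateMoveSource() concrete, here is a worked trace
(comments only; the emitted sequence matches the ParallelMoveTest.Cycle
expectation in parallel_move_test.cc below):

// Input parallel move: (0 -> 1) (1 -> 2) (1 -> 0)
//
// PerformMove(0) for (0 -> 1):
//   destination 1 is blocked by (1 -> 2), so recurse:
//     PerformMove(1) for (1 -> 2): unblocked -> emit (1 -> 2);
//       UpdateMoveSource(1, 2) then rewrites (1 -> 0) into (2 -> 0).
//   destination 1 is no longer blocked -> emit (0 -> 1).
// PerformMove(2) for the rewritten (2 -> 0): unblocked -> emit (2 -> 0).
//
// Emitted: (1 -> 2) (0 -> 1) (2 -> 0). The cycle dissolves without any
// scratch location, which is exactly the case the UpdateMoveSource()
// comment describes.
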
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 95f8ad5..e89417d 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -19,30 +19,47 @@
#include "base/value_object.h"
#include "utils/growable_array.h"
+#include "locations.h"
namespace art {
class HParallelMove;
-class Location;
class MoveOperands;
-/**
- * Helper class to resolve a set of parallel moves. Architecture dependent code
- * generator must have their own subclass that implements the `EmitMove` and `EmitSwap`
- * operations.
- */
+// Helper classes to resolve a set of parallel moves. Architecture-dependent code generators must
+// have their own subclass that implements the corresponding virtual functions.
class ParallelMoveResolver : public ValueObject {
public:
explicit ParallelMoveResolver(ArenaAllocator* allocator) : moves_(allocator, 32) {}
virtual ~ParallelMoveResolver() {}
// Resolve a set of parallel moves, emitting assembler instructions.
- void EmitNativeCode(HParallelMove* parallel_move);
+ virtual void EmitNativeCode(HParallelMove* parallel_move) = 0;
+
+ protected:
+ // Build the initial list of moves.
+ void BuildInitialMoveList(HParallelMove* parallel_move);
+
+ GrowableArray<MoveOperands*> moves_;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
+};
+
+// This helper class uses swaps to resolve dependencies and may emit swap operations.
+class ParallelMoveResolverWithSwap : public ParallelMoveResolver {
+ public:
+ explicit ParallelMoveResolverWithSwap(ArenaAllocator* allocator)
+ : ParallelMoveResolver(allocator) {}
+ virtual ~ParallelMoveResolverWithSwap() {}
+
+ // Resolve a set of parallel moves, emitting assembler instructions.
+ void EmitNativeCode(HParallelMove* parallel_move) OVERRIDE;
protected:
class ScratchRegisterScope : public ValueObject {
public:
- ScratchRegisterScope(ParallelMoveResolver* resolver,
+ ScratchRegisterScope(ParallelMoveResolverWithSwap* resolver,
int blocked,
int if_scratch,
int number_of_registers);
@@ -52,11 +69,12 @@
bool IsSpilled() const { return spilled_; }
private:
- ParallelMoveResolver* resolver_;
+ ParallelMoveResolverWithSwap* resolver_;
int reg_;
bool spilled_;
};
+ // Return true if the location can be scratched.
bool IsScratchLocation(Location loc);
// Allocate a scratch register for performing a move. The method will try to use
@@ -72,15 +90,9 @@
virtual void SpillScratch(int reg) = 0;
virtual void RestoreScratch(int reg) = 0;
- // List of moves not yet resolved.
- GrowableArray<MoveOperands*> moves_;
-
static constexpr int kNoRegister = -1;
private:
- // Build the initial list of moves.
- void BuildInitialMoveList(HParallelMove* parallel_move);
-
// Perform the move at the moves_ index in question (possibly requiring
// other moves to satisfy dependencies).
//
@@ -99,7 +111,83 @@
// the right value.
MoveOperands* PerformMove(size_t index);
- DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
+ DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverWithSwap);
+};
+
+// This helper class uses additional scratch registers to resolve dependencies. It supports all kinds
+// of dependency cycles and does not care about the register layout.
+class ParallelMoveResolverNoSwap : public ParallelMoveResolver {
+ public:
+ explicit ParallelMoveResolverNoSwap(ArenaAllocator* allocator)
+ : ParallelMoveResolver(allocator), scratches_(allocator, 32),
+ pending_moves_(allocator, 8), allocator_(allocator) {}
+ virtual ~ParallelMoveResolverNoSwap() {}
+
+ // Resolve a set of parallel moves, emitting assembler instructions.
+ void EmitNativeCode(HParallelMove* parallel_move) OVERRIDE;
+
+ protected:
+ // Called at the beginning of EmitNativeCode(). A subclass may put some architecture dependent
+ // initialization here.
+ virtual void PrepareForEmitNativeCode() = 0;
+
+ // Called at the end of EmitNativeCode(). A subclass may put some architecture dependent cleanup
+ // here. All scratch locations will be removed after this call.
+ virtual void FinishEmitNativeCode() = 0;
+
+ // Allocate a scratch location to perform a move from the given kind of location. A subclass
+ // should implement this to get the best-fitting location. If there is no suitable physical
+ // register, it can also return a stack slot.
+ virtual Location AllocateScratchLocationFor(Location::Kind kind) = 0;
+
+ // Called after a move which takes a scratch location as source. A subclass can defer the cleanup
+ // to FinishEmitNativeCode().
+ virtual void FreeScratchLocation(Location loc) = 0;
+
+ // Emit a move.
+ virtual void EmitMove(size_t index) = 0;
+
+ // Return a scratch location from the moves which exactly matches the kind.
+ // Return Location::NoLocation() if no matching scratch location can be found.
+ Location GetScratchLocation(Location::Kind kind);
+
+ // Add a location to the scratch list which can be returned from GetScratchLocation() to resolve
+ // dependency cycles.
+ void AddScratchLocation(Location loc);
+
+ // Remove a location from the scratch list.
+ void RemoveScratchLocation(Location loc);
+
+ // List of scratch locations.
+ GrowableArray<Location> scratches_;
+
+ private:
+ // Perform the move at the given index in `moves_` (possibly requiring other moves to satisfy
+ // dependencies).
+ void PerformMove(size_t index);
+
+ void UpdateMoveSource(Location from, Location to);
+
+ void AddPendingMove(Location source, Location destination, Primitive::Type type);
+
+ void DeletePendingMove(MoveOperands* move);
+
+ // Find a move that may be unblocked after (loc -> XXX) is performed.
+ MoveOperands* GetUnblockedPendingMove(Location loc);
+
+ // Return true if the location is blocked by outstanding moves.
+ bool IsBlockedByMoves(Location loc);
+
+ // Return the number of pending moves.
+ size_t GetNumberOfPendingMoves();
+
+ // Additional pending moves which might be added to resolve dependency cycles.
+ GrowableArray<MoveOperands*> pending_moves_;
+
+ // Used to allocate pending MoveOperands.
+ ArenaAllocator* const allocator_;
+
+ DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverNoSwap);
};
} // namespace art
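
For orientation, a backend only needs to override the virtuals declared above.
A minimal hypothetical subclass could look as follows (MyCodeGenerator,
AllocateBackendTemp() and ReleaseBackendTemp() are assumed placeholders; the
real implementations are ParallelMoveResolverARM64 above and the test
resolvers below):

class ParallelMoveResolverMyArch : public ParallelMoveResolverNoSwap {
 public:
  ParallelMoveResolverMyArch(ArenaAllocator* allocator, MyCodeGenerator* codegen)
      : ParallelMoveResolverNoSwap(allocator), codegen_(codegen) {}

 protected:
  void PrepareForEmitNativeCode() OVERRIDE {
    // Acquire whatever per-emission scratch state the backend needs.
  }
  void FinishEmitNativeCode() OVERRIDE {
    // Release that state; all scratch locations are removed after this call.
  }
  Location AllocateScratchLocationFor(Location::Kind kind) OVERRIDE {
    // Prefer reusing a free location via GetScratchLocation(kind);
    // otherwise hand out a backend temp and record it.
    Location scratch = GetScratchLocation(kind);
    if (scratch.Equals(Location::NoLocation())) {
      scratch = codegen_->AllocateBackendTemp(kind);  // Assumed helper.
      AddScratchLocation(scratch);
    }
    return scratch;
  }
  void FreeScratchLocation(Location loc) OVERRIDE {
    codegen_->ReleaseBackendTemp(loc);  // Assumed helper.
    RemoveScratchLocation(loc);
  }
  void EmitMove(size_t index) OVERRIDE {
    MoveOperands* move = moves_.Get(index);
    codegen_->MoveLocation(move->GetDestination(), move->GetSource());  // Assumed helper.
  }

 private:
  MyCodeGenerator* const codegen_;
  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverMyArch);
};
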
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
index 95cca51..f8f7010 100644
--- a/compiler/optimizing/parallel_move_test.cc
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -19,27 +19,41 @@
#include "parallel_move_resolver.h"
#include "gtest/gtest.h"
+#include "gtest/gtest-typed-test.h"
namespace art {
-class TestParallelMoveResolver : public ParallelMoveResolver {
- public:
- explicit TestParallelMoveResolver(ArenaAllocator* allocator) : ParallelMoveResolver(allocator) {}
+constexpr int kScratchRegisterStartIndexForTest = 100;
- void Dump(Location location) {
- if (location.IsConstant()) {
- message_ << "C";
- } else if (location.IsPair()) {
- message_ << location.low() << "," << location.high();
- } else if (location.IsRegister()) {
- message_ << location.reg();
- } else if (location.IsStackSlot()) {
- message_ << location.GetStackIndex() << "(sp)";
- } else {
- message_ << "2x" << location.GetStackIndex() << "(sp)";
- DCHECK(location.IsDoubleStackSlot()) << location;
- }
+static void DumpRegisterForTest(std::ostream& os, int reg) {
+ if (reg >= kScratchRegisterStartIndexForTest) {
+ os << "T" << reg - kScratchRegisterStartIndexForTest;
+ } else {
+ os << reg;
}
+}
+
+static void DumpLocationForTest(std::ostream& os, Location location) {
+ if (location.IsConstant()) {
+ os << "C";
+ } else if (location.IsPair()) {
+ DumpRegisterForTest(os, location.low());
+ os << ",";
+ DumpRegisterForTest(os, location.high());
+ } else if (location.IsRegister()) {
+ DumpRegisterForTest(os, location.reg());
+ } else if (location.IsStackSlot()) {
+ os << location.GetStackIndex() << "(sp)";
+ } else {
+ DCHECK(location.IsDoubleStackSlot()) << location;
+ os << "2x" << location.GetStackIndex() << "(sp)";
+ }
+}
+
+class TestParallelMoveResolverWithSwap : public ParallelMoveResolverWithSwap {
+ public:
+ explicit TestParallelMoveResolverWithSwap(ArenaAllocator* allocator)
+ : ParallelMoveResolverWithSwap(allocator) {}
void EmitMove(size_t index) OVERRIDE {
MoveOperands* move = moves_.Get(index);
@@ -47,9 +61,9 @@
message_ << " ";
}
message_ << "(";
- Dump(move->GetSource());
+ DumpLocationForTest(message_, move->GetSource());
message_ << " -> ";
- Dump(move->GetDestination());
+ DumpLocationForTest(message_, move->GetDestination());
message_ << ")";
}
@@ -59,9 +73,9 @@
message_ << " ";
}
message_ << "(";
- Dump(move->GetSource());
+ DumpLocationForTest(message_, move->GetSource());
message_ << " <-> ";
- Dump(move->GetDestination());
+ DumpLocationForTest(message_, move->GetDestination());
message_ << ")";
}
@@ -76,7 +90,64 @@
std::ostringstream message_;
- DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolver);
+ DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolverWithSwap);
+};
+
+class TestParallelMoveResolverNoSwap : public ParallelMoveResolverNoSwap {
+ public:
+ explicit TestParallelMoveResolverNoSwap(ArenaAllocator* allocator)
+ : ParallelMoveResolverNoSwap(allocator), scratch_index_(kScratchRegisterStartIndexForTest) {}
+
+ void PrepareForEmitNativeCode() OVERRIDE {
+ scratch_index_ = kScratchRegisterStartIndexForTest;
+ }
+
+ void FinishEmitNativeCode() OVERRIDE {}
+
+ Location AllocateScratchLocationFor(Location::Kind kind) OVERRIDE {
+ if (kind == Location::kStackSlot || kind == Location::kFpuRegister ||
+ kind == Location::kRegister) {
+ kind = Location::kRegister;
+ } else {
+ // Allocate a register pair for a double stack slot, which simulates a 32-bit backend's behavior.
+ kind = Location::kRegisterPair;
+ }
+ Location scratch = GetScratchLocation(kind);
+ if (scratch.Equals(Location::NoLocation())) {
+ AddScratchLocation(Location::RegisterLocation(scratch_index_));
+ AddScratchLocation(Location::RegisterLocation(scratch_index_ + 1));
+ AddScratchLocation(Location::RegisterPairLocation(scratch_index_, scratch_index_ + 1));
+ scratch = (kind == Location::kRegister) ? Location::RegisterLocation(scratch_index_)
+ : Location::RegisterPairLocation(scratch_index_, scratch_index_ + 1);
+ scratch_index_ += 2;
+ }
+ return scratch;
+ }
+
+ void FreeScratchLocation(Location loc ATTRIBUTE_UNUSED) OVERRIDE {}
+
+ void EmitMove(size_t index) OVERRIDE {
+ MoveOperands* move = moves_.Get(index);
+ if (!message_.str().empty()) {
+ message_ << " ";
+ }
+ message_ << "(";
+ DumpLocationForTest(message_, move->GetSource());
+ message_ << " -> ";
+ DumpLocationForTest(message_, move->GetDestination());
+ message_ << ")";
+ }
+
+ std::string GetMessage() const {
+ return message_.str();
+ }
+
+ private:
+ std::ostringstream message_;
+
+ int scratch_index_;
+
+ DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolverNoSwap);
};
static HParallelMove* BuildParallelMove(ArenaAllocator* allocator,
@@ -93,55 +164,102 @@
return moves;
}
-TEST(ParallelMoveTest, Dependency) {
+template <typename T>
+class ParallelMoveTest : public ::testing::Test {
+ public:
+ static const bool has_swap;
+};
+
+template<> const bool ParallelMoveTest<TestParallelMoveResolverWithSwap>::has_swap = true;
+template<> const bool ParallelMoveTest<TestParallelMoveResolverNoSwap>::has_swap = false;
+
+typedef ::testing::Types<TestParallelMoveResolverWithSwap, TestParallelMoveResolverNoSwap>
+ ParallelMoveResolverTestTypes;
+
+TYPED_TEST_CASE(ParallelMoveTest, ParallelMoveResolverTestTypes);
+
+
+TYPED_TEST(ParallelMoveTest, Dependency) {
ArenaPool pool;
ArenaAllocator allocator(&pool);
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
static constexpr size_t moves[][2] = {{0, 1}, {1, 2}};
resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
- ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {1, 4}};
resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
- ASSERT_STREQ("(2 -> 3) (1 -> 2) (1 -> 4) (0 -> 1)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(2 -> 3) (1 -> 2) (1 -> 4) (0 -> 1)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2 -> 3) (1 -> 2) (0 -> 1) (2 -> 4)", resolver.GetMessage().c_str());
+ }
}
}
-TEST(ParallelMoveTest, Swap) {
+TYPED_TEST(ParallelMoveTest, Cycle) {
ArenaPool pool;
ArenaAllocator allocator(&pool);
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
static constexpr size_t moves[][2] = {{0, 1}, {1, 0}};
resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
- ASSERT_STREQ("(1 <-> 0)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(1 <-> 0)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(1 -> T0) (0 -> 1) (T0 -> 0)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {1, 0}};
resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
- ASSERT_STREQ("(1 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(1 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(1 -> 2) (0 -> 1) (2 -> 0)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
+ static constexpr size_t moves[][2] = {{0, 1}, {1, 0}, {0, 2}};
+ resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(0 -> 2) (1 -> 0) (2 -> 1)", resolver.GetMessage().c_str());
+ }
+ }
+
+ {
+ TypeParam resolver(&allocator);
static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 0}};
resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
- ASSERT_STREQ("(4 <-> 0) (3 <-> 4) (2 <-> 3) (1 <-> 2)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(4 <-> 0) (3 <-> 4) (2 <-> 3) (1 <-> 2)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(4 -> T0) (3 -> 4) (2 -> 3) (1 -> 2) (0 -> 1) (T0 -> 0)",
+ resolver.GetMessage().c_str());
+ }
}
}
-TEST(ParallelMoveTest, ConstantLast) {
+TYPED_TEST(ParallelMoveTest, ConstantLast) {
ArenaPool pool;
ArenaAllocator allocator(&pool);
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::ConstantLocation(new (&allocator) HIntConstant(0)),
@@ -157,12 +275,12 @@
ASSERT_STREQ("(1 -> 2) (C -> 0)", resolver.GetMessage().c_str());
}
-TEST(ParallelMoveTest, Pairs) {
+TYPED_TEST(ParallelMoveTest, Pairs) {
ArenaPool pool;
ArenaAllocator allocator(&pool);
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterLocation(2),
@@ -179,7 +297,7 @@
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(0, 1),
@@ -196,7 +314,7 @@
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(0, 1),
@@ -209,10 +327,14 @@
Primitive::kPrimInt,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2 -> T0) (0,1 -> 2,3) (T0 -> 0)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterLocation(2),
@@ -230,10 +352,15 @@
Primitive::kPrimLong,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(0,1 -> T0,T1) (7 -> 1) (2 -> 7) (T0,T1 -> 2,3)",
+ resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterLocation(2),
@@ -251,10 +378,15 @@
Primitive::kPrimInt,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(0,1 -> T0,T1) (7 -> 1) (2 -> 7) (T0,T1 -> 2,3)",
+ resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(0, 1),
@@ -272,10 +404,14 @@
Primitive::kPrimInt,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(7 -> T0) (2 -> 7) (0,1 -> 2,3) (T0 -> 1)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(0, 1),
@@ -288,10 +424,14 @@
Primitive::kPrimLong,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(2,3 <-> 0,1)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(2,3 <-> 0,1)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2,3 -> T0,T1) (0,1 -> 2,3) (T0,T1 -> 0,1)", resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(2, 3),
@@ -304,12 +444,85 @@
Primitive::kPrimLong,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(0,1 -> T0,T1) (2,3 -> 0,1) (T0,T1 -> 2,3)", resolver.GetMessage().c_str());
+ }
+ }
+}
+
+TYPED_TEST(ParallelMoveTest, MultiCycles) {
+ ArenaPool pool;
+ ArenaAllocator allocator(&pool);
+
+ {
+ TypeParam resolver(&allocator);
+ static constexpr size_t moves[][2] = {{0, 1}, {1, 0}, {2, 3}, {3, 2}};
+ resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(1 <-> 0) (3 <-> 2)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(1 -> T0) (0 -> 1) (T0 -> 0) (3 -> T0) (2 -> 3) (T0 -> 2)",
+ resolver.GetMessage().c_str());
+ }
+ }
+ {
+ TypeParam resolver(&allocator);
+ HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+ moves->AddMove(
+ Location::RegisterPairLocation(0, 1),
+ Location::RegisterPairLocation(2, 3),
+ Primitive::kPrimLong,
+ nullptr);
+ moves->AddMove(
+ Location::RegisterLocation(2),
+ Location::RegisterLocation(0),
+ Primitive::kPrimInt,
+ nullptr);
+ moves->AddMove(
+ Location::RegisterLocation(3),
+ Location::RegisterLocation(1),
+ Primitive::kPrimInt,
+ nullptr);
+ resolver.EmitNativeCode(moves);
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2 -> T0) (3 -> T1) (0,1 -> 2,3) (T0 -> 0) (T1 -> 1)",
+ resolver.GetMessage().c_str());
+ }
+ }
+ {
+ TypeParam resolver(&allocator);
+ HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+ moves->AddMove(
+ Location::RegisterLocation(2),
+ Location::RegisterLocation(0),
+ Primitive::kPrimInt,
+ nullptr);
+ moves->AddMove(
+ Location::RegisterLocation(3),
+ Location::RegisterLocation(1),
+ Primitive::kPrimInt,
+ nullptr);
+ moves->AddMove(
+ Location::RegisterPairLocation(0, 1),
+ Location::RegisterPairLocation(2, 3),
+ Primitive::kPrimLong,
+ nullptr);
+ resolver.EmitNativeCode(moves);
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(3 -> T0) (0,1 -> T2,T3) (T0 -> 1) (2 -> 0) (T2,T3 -> 2,3)",
+ resolver.GetMessage().c_str());
+ }
}
{
// Test involving registers used in single context and pair context.
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterLocation(10),
@@ -327,17 +540,22 @@
Primitive::kPrimLong,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(2x32(sp) <-> 10,11) (4,5 <-> 2x32(sp)) (4 -> 5)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(2x32(sp) <-> 10,11) (4,5 <-> 2x32(sp)) (4 -> 5)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2x32(sp) -> T0,T1) (4,5 -> 2x32(sp)) (10 -> 5) (T0,T1 -> 10,11)",
+ resolver.GetMessage().c_str());
+ }
}
}
// Test that we do 64bits moves before 32bits moves.
-TEST(ParallelMoveTest, CyclesWith64BitsMoves) {
+TYPED_TEST(ParallelMoveTest, CyclesWith64BitsMoves) {
ArenaPool pool;
ArenaAllocator allocator(&pool);
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterLocation(0),
@@ -355,11 +573,16 @@
Primitive::kPrimInt,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(0 <-> 1) (48(sp) <-> 0)", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(0 <-> 1) (48(sp) <-> 0)", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(48(sp) -> T0) (1 -> 48(sp)) (0 -> 1) (T0 -> 0)",
+ resolver.GetMessage().c_str());
+ }
}
{
- TestParallelMoveResolver resolver(&allocator);
+ TypeParam resolver(&allocator);
HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
moves->AddMove(
Location::RegisterPairLocation(0, 1),
@@ -377,7 +600,12 @@
Primitive::kPrimLong,
nullptr);
resolver.EmitNativeCode(moves);
- ASSERT_STREQ("(2x32(sp) <-> 0,1) (2,3 <-> 2x32(sp))", resolver.GetMessage().c_str());
+ if (TestFixture::has_swap) {
+ ASSERT_STREQ("(2x32(sp) <-> 0,1) (2,3 <-> 2x32(sp))", resolver.GetMessage().c_str());
+ } else {
+ ASSERT_STREQ("(2x32(sp) -> T0,T1) (2,3 -> 2x32(sp)) (0,1 -> 2,3) (T0,T1 -> 0,1)",
+ resolver.GetMessage().c_str());
+ }
}
}
diff --git a/runtime/base/macros.h b/runtime/base/macros.h
index 6c33232..c00ae78 100644
--- a/runtime/base/macros.h
+++ b/runtime/base/macros.h
@@ -46,6 +46,11 @@
#define ART_FRIEND_TEST(test_set_name, individual_test)\
friend class test_set_name##_##individual_test##_Test
+// Declare a friend relationship in a class with a typed test.
+#define ART_FRIEND_TYPED_TEST(test_set_name, individual_test)\
+template<typename T> ART_FRIEND_TEST(test_set_name, individual_test)
+
+
// DISALLOW_COPY_AND_ASSIGN disallows the copy and operator= functions. It goes in the private:
// declarations in a class.
#if !defined(DISALLOW_COPY_AND_ASSIGN)
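
Why ART_FRIEND_TYPED_TEST needs the extra template header: gtest's TYPED_TEST
turns each test into a class template, so the friend declaration must be a
template as well. Expansion sketch, using a test name from this patch:

// ART_FRIEND_TEST(ParallelMoveTest, ConstantLast) expands to:
//   friend class ParallelMoveTest_ConstantLast_Test;
// ART_FRIEND_TYPED_TEST(ParallelMoveTest, ConstantLast) expands to:
//   template<typename T> friend class ParallelMoveTest_ConstantLast_Test;
// The template friend declaration befriends every instantiation of the
// class template that TYPED_TEST generates for the test.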