Finally implement Location::kNoOutputOverlap.

The [i, i + 1) interval scheme we chose for representing
lifetime positions is not optimal for doing this optimization.
It however doesn't prevent recognizing a non-split interval
during the TryAllocateFreeReg phase, and try to re-use
its inputs' registers.

Change-Id: I80a2823b0048d3310becfc5f5fb7b1230dfd8201
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index dc2446d..fd4e391 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -290,7 +290,7 @@
         result_location = locations->InAt(0);
         break;
     }
-    locations->SetOut(result_location);
+    locations->UpdateOut(result_location);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index b0cd7ba..78fd181 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1296,13 +1296,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(neg, LocationSummary::kNoCall);
   switch (neg->GetResultType()) {
-    case Primitive::kPrimInt:
-    case Primitive::kPrimLong: {
-      Location::OutputOverlap output_overlaps = (neg->GetResultType() == Primitive::kPrimLong)
-          ? Location::kOutputOverlap
-          : Location::kNoOutputOverlap;
+    case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), output_overlaps);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
 
@@ -1837,7 +1838,7 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
 
@@ -1914,7 +1915,7 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -2297,7 +2298,7 @@
     case Primitive::kPrimInt: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(op->InputAt(1)));
-      locations->SetOut(Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
     case Primitive::kPrimLong: {
@@ -2492,7 +2493,8 @@
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      // Output overlaps because it is written before doing the low comparison.
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
       break;
     }
     case Primitive::kPrimFloat:
@@ -2765,12 +2767,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 
-  bool generate_volatile = field_info.IsVolatile()
+  bool volatile_for_double = field_info.IsVolatile()
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
       && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
-  if (generate_volatile) {
+  bool overlap = field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong);
+  locations->SetOut(Location::RequiresRegister(),
+                    (overlap ? Location::kOutputOverlap : Location::kNoOutputOverlap));
+  if (volatile_for_double) {
     // Arm encoding have some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
     // - the first register should be even but not R14.
@@ -3614,7 +3618,8 @@
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister());
+  // The out register is used as a temporary, so it overlaps with the inputs.
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
@@ -3710,10 +3715,7 @@
          || instruction->GetResultType() == Primitive::kPrimLong);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  Location::OutputOverlap output_overlaps = (instruction->GetResultType() == Primitive::kPrimLong)
-      ? Location::kOutputOverlap
-      : Location::kNoOutputOverlap;
-  locations->SetOut(Location::RequiresRegister(), output_overlaps);
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 }
 
 void InstructionCodeGeneratorARM::VisitAnd(HAnd* instruction) {
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 8b06d60..68e3a30 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -482,16 +482,17 @@
   }
 
   void SetOut(Location location, Location::OutputOverlap overlaps = Location::kOutputOverlap) {
-    DCHECK(output_.IsUnallocated() || output_.IsInvalid());
+    DCHECK(output_.IsInvalid());
     output_overlaps_ = overlaps;
     output_ = location;
   }
 
   void UpdateOut(Location location) {
-    // The only reason for updating an output is for parameters where
-    // we only know the exact stack slot after doing full register
-    // allocation.
-    DCHECK(output_.IsStackSlot() || output_.IsDoubleStackSlot());
+    // There are two reasons for updating an output:
+    // 1) Parameters, where we only know the exact stack slot after
+    //    doing full register allocation.
+    // 2) Unallocated location.
+    DCHECK(output_.IsStackSlot() || output_.IsDoubleStackSlot() || output_.IsUnallocated());
     output_ = location;
   }
 
@@ -563,28 +564,22 @@
     return live_registers_.GetNumberOfRegisters();
   }
 
-  bool InputOverlapsWithOutputOrTemp(uint32_t input_index, bool is_environment) const {
-    if (is_environment) return true;
-    if ((input_index == 0)
+  bool OutputUsesSameAs(uint32_t input_index) const {
+    return (input_index == 0)
         && output_.IsUnallocated()
-        && (output_.GetPolicy() == Location::kSameAsFirstInput)) {
-      return false;
-    }
+        && (output_.GetPolicy() == Location::kSameAsFirstInput);
+  }
+
+  bool IsFixedInput(uint32_t input_index) const {
     Location input = inputs_.Get(input_index);
-    if (input.IsRegister()
+    return input.IsRegister()
         || input.IsFpuRegister()
         || input.IsPair()
         || input.IsStackSlot()
-        || input.IsDoubleStackSlot()) {
-      // For fixed locations, the register allocator requires to have inputs die before
-      // the instruction, so that input moves use the location of the input just
-      // before that instruction (and not potential moves due to splitting).
-      return false;
-    }
-    return true;
+        || input.IsDoubleStackSlot();
   }
 
-  bool OutputOverlapsWithInputs() const {
+  bool OutputCanOverlapWithInputs() const {
     return output_overlaps_ == Location::kOutputOverlap;
   }
 
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 0a3f24b..3809720 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -485,6 +485,9 @@
         BitVector* liveness_of_register = liveness_of_values.Get(current->GetRegister());
         for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
           if (liveness_of_register->IsBitSet(j)) {
+            if (current->IsUsingInputRegister() && current->CanUseInputRegister()) {
+              continue;
+            }
             if (log_fatal_on_failure) {
               std::ostringstream message;
               message << "Register conflict at " << j << " ";
@@ -639,6 +642,29 @@
   }
 }
 
+static void FreeIfNotCoverAt(LiveInterval* interval, size_t position, size_t* free_until) {
+  DCHECK(!interval->IsHighInterval());
+  // Note that the same instruction may occur multiple times in the input list,
+  // so `free_until` may have changed already.
+  if (interval->IsDeadAt(position)) {
+    // Set the register to be free. Note that inactive intervals might later
+    // update this.
+    free_until[interval->GetRegister()] = kMaxLifetimePosition;
+    if (interval->HasHighInterval()) {
+      DCHECK(interval->GetHighInterval()->IsDeadAt(position));
+      free_until[interval->GetHighInterval()->GetRegister()] = kMaxLifetimePosition;
+    }
+  } else if (!interval->Covers(position)) {
+    // The interval becomes inactive at `defined_by`. We make its register
+    // available only until the next use strictly after `defined_by`.
+    free_until[interval->GetRegister()] = interval->FirstUseAfter(position);
+    if (interval->HasHighInterval()) {
+      DCHECK(!interval->GetHighInterval()->Covers(position));
+      free_until[interval->GetHighInterval()->GetRegister()] = free_until[interval->GetRegister()];
+    }
+  }
+}
+
 // Find a free register. If multiple are found, pick the register that
 // is free the longest.
 bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) {
@@ -656,6 +682,32 @@
     free_until[interval->GetRegister()] = 0;
   }
 
+  // An interval that starts an instruction (that is, it is not split), may
+  // re-use the registers used by the inputs of that instruciton, based on the
+  // location summary.
+  HInstruction* defined_by = current->GetDefinedBy();
+  if (defined_by != nullptr && !current->IsSplit()) {
+    LocationSummary* locations = defined_by->GetLocations();
+    if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) {
+      for (HInputIterator it(defined_by); !it.Done(); it.Advance()) {
+        // Take the last interval of the input. It is the location of that interval
+        // that will be used at `defined_by`.
+        LiveInterval* interval = it.Current()->GetLiveInterval()->GetLastSibling();
+        // Note that interval may have not been processed yet.
+        // TODO: Handle non-split intervals last in the work list.
+        if (interval->HasRegister() && interval->SameRegisterKind(*current)) {
+          // The input must be live until the end of `defined_by`, to comply to
+          // the linear scan algorithm. So we use `defined_by`'s end lifetime
+          // position to check whether the input is dead or is inactive after
+          // `defined_by`.
+          DCHECK(interval->Covers(defined_by->GetLifetimePosition()));
+          size_t position = defined_by->GetLifetimePosition() + 1;
+          FreeIfNotCoverAt(interval, position, free_until);
+        }
+      }
+    }
+  }
+
   // For each inactive interval, set its register to be free until
   // the next intersection with `current`.
   for (size_t i = 0, e = inactive_.Size(); i < e; ++i) {
@@ -1497,7 +1549,7 @@
           DCHECK(locations->InAt(0).Equals(source));
         }
       }
-      locations->SetOut(source);
+      locations->UpdateOut(source);
     } else {
       DCHECK(source.Equals(location));
     }
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index b0d3853..0e68a61 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_SSA_LIVENESS_ANALYSIS_H_
 
 #include "nodes.h"
+#include <iostream>
 
 namespace art {
 
@@ -181,12 +182,21 @@
 
   void AddUse(HInstruction* instruction, size_t input_index, bool is_environment) {
     // Set the use within the instruction.
-    size_t position = instruction->GetLifetimePosition();
-    if (instruction->GetLocations()->InputOverlapsWithOutputOrTemp(input_index, is_environment)) {
-      // If it overlaps, we need to make sure the user will not try to allocate a temp
-      // or its output to the same register.
-      ++position;
+    size_t position = instruction->GetLifetimePosition() + 1;
+    LocationSummary* locations = instruction->GetLocations();
+    if (!is_environment) {
+      if (locations->IsFixedInput(input_index) || locations->OutputUsesSameAs(input_index)) {
+        // For fixed inputs and output same as input, the register allocator
+        // requires to have inputs die at the instruction, so that input moves use the
+        // location of the input just before that instruction (and not potential moves due
+        // to splitting).
+        position = instruction->GetLifetimePosition();
+      }
     }
+
+    DCHECK(position == instruction->GetLifetimePosition()
+           || position == instruction->GetLifetimePosition() + 1);
+
     if ((first_use_ != nullptr)
         && (first_use_->GetUser() == instruction)
         && (first_use_->GetPosition() < position)) {
@@ -301,6 +311,7 @@
   LiveInterval* GetParent() const { return parent_; }
 
   LiveRange* GetFirstRange() const { return first_range_; }
+  LiveRange* GetLastRange() const { return last_range_; }
 
   int GetRegister() const { return register_; }
   void SetRegister(int reg) { register_ = reg; }
@@ -403,6 +414,23 @@
     return FirstRegisterUseAfter(GetStart());
   }
 
+  size_t FirstUseAfter(size_t position) const {
+    if (is_temp_) {
+      return position == GetStart() ? position : kNoLifetime;
+    }
+
+    UsePosition* use = first_use_;
+    size_t end = GetEnd();
+    while (use != nullptr && use->GetPosition() <= end) {
+      size_t use_position = use->GetPosition();
+      if (use_position > position) {
+        return use_position;
+      }
+      use = use->GetNext();
+    }
+    return kNoLifetime;
+  }
+
   UsePosition* GetFirstUse() const {
     return first_use_;
   }
@@ -511,6 +539,13 @@
   }
 
   LiveInterval* GetNextSibling() const { return next_sibling_; }
+  LiveInterval* GetLastSibling() {
+    LiveInterval* result = this;
+    while (result->next_sibling_ != nullptr) {
+      result = result->next_sibling_;
+    }
+    return result;
+  }
 
   // Returns the first register hint that is at least free before
   // the value contained in `free_until`. If none is found, returns
@@ -541,6 +576,9 @@
 
   // Returns whether `other` and `this` share the same kind of register.
   bool SameRegisterKind(Location other) const;
+  bool SameRegisterKind(const LiveInterval& other) const {
+    return IsFloatingPoint() == other.IsFloatingPoint();
+  }
 
   bool HasHighInterval() const {
     return IsLowInterval();
@@ -594,6 +632,60 @@
     }
   }
 
+  // Returns whether an interval, when it is non-split, is using
+  // the same register of one of its input.
+  bool IsUsingInputRegister() const {
+    if (defined_by_ != nullptr && !IsSplit()) {
+      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
+        LiveInterval* interval = it.Current()->GetLiveInterval();
+
+        // Find the interval that covers `defined_by`_.
+        while (interval != nullptr && !interval->Covers(defined_by_->GetLifetimePosition())) {
+          interval = interval->GetNextSibling();
+        }
+
+        // Check if both intervals have the same register of the same kind.
+        if (interval != nullptr
+            && interval->SameRegisterKind(*this)
+            && interval->GetRegister() == GetRegister()) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Returns whether an interval, when it is non-split, can safely use
+  // the same register of one of its input. Note that this method requires
+  // IsUsingInputRegister() to be true.
+  bool CanUseInputRegister() const {
+    DCHECK(IsUsingInputRegister());
+    if (defined_by_ != nullptr && !IsSplit()) {
+      LocationSummary* locations = defined_by_->GetLocations();
+      if (locations->OutputCanOverlapWithInputs()) {
+        return false;
+      }
+      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
+        LiveInterval* interval = it.Current()->GetLiveInterval();
+
+        // Find the interval that covers `defined_by`_.
+        while (interval != nullptr && !interval->Covers(defined_by_->GetLifetimePosition())) {
+          interval = interval->GetNextSibling();
+        }
+
+        if (interval != nullptr
+            && interval->SameRegisterKind(*this)
+            && interval->GetRegister() == GetRegister()) {
+          // We found the input that has the same register. Check if it is live after
+          // `defined_by`_.
+          return !interval->Covers(defined_by_->GetLifetimePosition() + 1);
+        }
+      }
+    }
+    LOG(FATAL) << "Unreachable";
+    UNREACHABLE();
+  }
+
  private:
   LiveInterval(ArenaAllocator* allocator,
                Primitive::Type type,