ART: Change x86 long param ABI (Quick/JNI/Opt)

Ensure that we don't pass a long parameter across the last register
and the stack: skip the register and allocate it only on the stack.
This was requested to simplify the optimizing compiler code
generation for x86.

Optimizing (Baseline) compiler support for x86 longs:
- Remove QuickParameter from Location, as there are no longer any uses
  of it.

Bump oat.h version because we changed an ABI again.

I changed IsParamALong() to return false for argument 0 (this argument).
I am not sure why it differed from all other tests.

I have not tested on ARM.  I followed Nicolas's suggestions for setting
the value of kSplitPairAcrossRegisterAndStack for different
architectures.

Change-Id: I2f16b33c1dac58dd4f4f503e9c2309d845f5fb7a
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 0337096..c04ae12 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -2469,11 +2469,17 @@
       return m2l_->TargetReg(fpArgMappingToPhysicalReg[cur_fp_reg_++],
                              arg.IsWide() ? kWide : kNotWide);
     }
-  } else {
-    if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
-                               arg.IsRef() ? kRef : kNotWide);
-      if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+  } else if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+    result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
+                             arg.IsRef() ? kRef : kNotWide);
+    if (arg.IsWide()) {
+      // This must be a long, as double is handled above.
+      // Ensure that we don't split a long across the last register and the stack.
+      if (cur_core_reg_ == coreArgMappingToPhysicalRegSize) {
+        // Leave the last core register unused and force the whole long to the stack.
+        cur_core_reg_++;
+        result = RegStorage::InvalidReg();
+      } else if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
         result = RegStorage::MakeRegPair(
             result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
       }
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 6db0c3b..0c64a36 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -141,7 +141,7 @@
     if (IsStatic()) {
       param++;  // 0th argument must skip return value at start of the shorty
     } else if (param == 0) {
-      return true;  // this argument
+      return false;  // this argument
     }
     return shorty_[param] == 'J';
   }
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index fc72e88..8a45f0c 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -85,9 +85,19 @@
   ManagedRegister res = ManagedRegister::NoRegister();
   if (!IsCurrentParamAFloatOrDouble()) {
     switch (gpr_arg_count_) {
-      case 0: res = X86ManagedRegister::FromCpuRegister(ECX); break;
-      case 1: res = X86ManagedRegister::FromCpuRegister(EDX); break;
-      case 2: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+      case 0:
+        res = X86ManagedRegister::FromCpuRegister(ECX);
+        break;
+      case 1:
+        res = X86ManagedRegister::FromCpuRegister(EDX);
+        break;
+      case 2:
+        // Don't split a long between the last register and the stack.
+        if (IsCurrentParamALong()) {
+          return ManagedRegister::NoRegister();
+        }
+        res = X86ManagedRegister::FromCpuRegister(EBX);
+        break;
     }
   } else if (itr_float_and_doubles_ < 4) {
     // First four float parameters are passed via XMM0..XMM3
@@ -120,27 +130,34 @@
     ResetIterator(FrameOffset(0));
     while (HasNext()) {
       ManagedRegister in_reg = CurrentParamRegister();
+      bool is_long = IsCurrentParamALong();
       if (!in_reg.IsNoRegister()) {
         int32_t size = IsParamADouble(itr_args_) ? 8 : 4;
         int32_t spill_offset = CurrentParamStackOffset().Uint32Value();
         ManagedRegisterSpill spill(in_reg, size, spill_offset);
         entry_spills_.push_back(spill);
-        if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
-          // special case, as we may need a second register here.
+        if (is_long) {
+          // special case, as we need a second register here.
           in_reg = CurrentParamHighLongRegister();
-          if (!in_reg.IsNoRegister()) {
-            // We have to spill the second half of the long.
-            ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
-            entry_spills_.push_back(spill2);
-            // Long was allocated in 2 registers.
-            gpr_arg_count_++;
-          }
+          DCHECK(!in_reg.IsNoRegister());
+          // We have to spill the second half of the long.
+          ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
+          entry_spills_.push_back(spill2);
         }
 
         // Keep track of the number of GPRs allocated.
         if (!IsCurrentParamAFloatOrDouble()) {
-          gpr_arg_count_++;
+          if (is_long) {
+            // Long was allocated in 2 registers.
+            gpr_arg_count_ += 2;
+          } else {
+            gpr_arg_count_++;
+          }
         }
+      } else if (is_long) {
+        // We need to skip the unused last register, which is empty.
+        // If we are already out of registers, this is harmless.
+        gpr_arg_count_ += 2;
       }
       Next();
     }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 063550b..d880eef 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -534,9 +534,6 @@
         X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
-      } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        // stack_index_ is the right offset for the memory.
-        return Location::QuickParameter(index, stack_index_ - 2);
       } else {
         return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
       }
@@ -628,16 +625,6 @@
           Location::RegisterLocation(destination.AsRegisterPairLow<Register>()));
     } else if (source.IsFpuRegister()) {
       LOG(FATAL) << "Unimplemented";
-    } else if (source.IsQuickParameter()) {
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      InvokeDexCallingConvention calling_convention;
-      EmitParallelMoves(
-          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
-          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()),
-          Location::StackSlot(
-              calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()),
-          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()));
     } else {
       // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
@@ -645,23 +632,6 @@
       __ movl(destination.AsRegisterPairHigh<Register>(),
               Address(ESP, source.GetHighStackIndex(kX86WordSize)));
     }
-  } else if (destination.IsQuickParameter()) {
-    InvokeDexCallingConvention calling_convention;
-    uint16_t register_index = destination.GetQuickParameterRegisterIndex();
-    uint16_t stack_index = destination.GetQuickParameterStackIndex();
-    if (source.IsRegisterPair()) {
-      LOG(FATAL) << "Unimplemented";
-    } else if (source.IsFpuRegister()) {
-      LOG(FATAL) << "Unimplemented";
-    } else {
-      DCHECK(source.IsDoubleStackSlot());
-      EmitParallelMoves(
-          Location::StackSlot(source.GetStackIndex()),
-          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index)),
-          Location::StackSlot(source.GetHighStackIndex(kX86WordSize)),
-          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
-      __ movl(calling_convention.GetRegisterAt(register_index), Address(ESP, source.GetStackIndex()));
-    }
   } else if (destination.IsFpuRegister()) {
     if (source.IsFpuRegister()) {
       __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
@@ -677,18 +647,6 @@
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
       __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
               source.AsRegisterPairHigh<Register>());
-    } else if (source.IsQuickParameter()) {
-      // No conflict possible, so just do the move.
-      InvokeDexCallingConvention calling_convention;
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      // Just move the low part. The only time a source is a quick parameter is
-      // when moving the parameter to its stack locations. And the (Java) caller
-      // of this method has already done that.
-      __ movl(Address(ESP, destination.GetStackIndex()),
-              calling_convention.GetRegisterAt(register_index));
-      DCHECK_EQ(calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize(),
-                static_cast<size_t>(destination.GetHighStackIndex(kX86WordSize)));
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else {
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 8b06d60..8eef986 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -62,17 +62,11 @@
     // We do not use the value 9 because it conflicts with kLocationConstantMask.
     kDoNotUse9 = 9,
 
-    // On 32bits architectures, quick can pass a long where the
-    // low bits are in the last parameter register, and the high
-    // bits are in a stack slot. The kQuickParameter kind is for
-    // handling this special case.
-    kQuickParameter = 10,
-
     // Unallocated location represents a location that is not fixed and can be
     // allocated by a register allocator.  Each unallocated location has
     // a policy that specifies what kind of location is suitable. Payload
     // contains register allocation policy.
-    kUnallocated = 11,
+    kUnallocated = 10,
   };
 
   Location() : value_(kInvalid) {
@@ -82,7 +76,6 @@
     static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError");
-    static_assert((kQuickParameter & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError");
     static_assert((kFpuRegisterPair & kLocationConstantMask) != kConstant, "TagError");
@@ -267,24 +260,6 @@
     return GetPayload() - kStackIndexBias + word_size;
   }
 
-  static Location QuickParameter(uint16_t register_index, uint16_t stack_index) {
-    return Location(kQuickParameter, register_index << 16 | stack_index);
-  }
-
-  uint32_t GetQuickParameterRegisterIndex() const {
-    DCHECK(IsQuickParameter());
-    return GetPayload() >> 16;
-  }
-
-  uint32_t GetQuickParameterStackIndex() const {
-    DCHECK(IsQuickParameter());
-    return GetPayload() & 0xFFFF;
-  }
-
-  bool IsQuickParameter() const {
-    return GetKind() == kQuickParameter;
-  }
-
   Kind GetKind() const {
     return IsConstant() ? kConstant : KindField::Decode(value_);
   }
@@ -299,7 +274,6 @@
       case kRegister: return "R";
       case kStackSlot: return "S";
       case kDoubleStackSlot: return "DS";
-      case kQuickParameter: return "Q";
       case kUnallocated: return "U";
       case kConstant: return "C";
       case kFpuRegister: return "F";
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index fd3a1cf..beacd49 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -467,7 +467,8 @@
 
     // Now check ebx
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished
-    // Must be first word of a long, or an integer.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyways, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished
 .LfirstLong:
@@ -569,7 +570,8 @@
 
     // Is there anything for ebx?
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
-    // First word of long or integer.  Load into EBX.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyways, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished2
 .LSecondLong2:
@@ -585,7 +587,8 @@
 
     // Anything for EBX?
     SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
-    // First word of long or integer.  Load into EBX.
+    // Must be first word of a long, or an integer. First word of long doesn't
+    // go into EBX, but can be loaded there anyways, as it is harmless.
     movl (%edi), %ebx
     jmp .Lgpr_setup_finished2
     // Nothing left to load.
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index a67ebca..98f1684 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -59,6 +59,7 @@
   // | S0         |
   // |            |    4x2 bytes padding
   // | Method*    |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = kArm32QuickCodeUseSoftFloat;
   static constexpr bool kAlignPairRegister = !kArm32QuickCodeUseSoftFloat;
   static constexpr bool kQuickSoftFloatAbi = kArm32QuickCodeUseSoftFloat;
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = !kArm32QuickCodeUseSoftFloat;
@@ -95,6 +96,7 @@
   // | D0         |
   // |            |    padding
   // | Method*    |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -125,6 +127,7 @@
   // | A2         |    arg2
   // | A1         |    arg1
   // | A0/Method* |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = true;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -203,6 +206,7 @@
   // | XMM1        |    float arg 2
   // | XMM0        |    float arg 1
   // | EAX/Method* |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -243,6 +247,7 @@
   // | XMM0            |    float arg 1
   // | Padding         |
   // | RDI/Method*     |  <- sp
+  static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
@@ -452,6 +457,11 @@
             }
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
                 ((gpr_index_ + 1) == kNumQuickGprArgs);
+            if (!kSplitPairAcrossRegisterAndStack && is_split_long_or_double_) {
+              // We don't want to split this. Pass over this register.
+              gpr_index_++;
+              is_split_long_or_double_ = false;
+            }
             Visit();
             if (kBytesStackArgLocation == 4) {
               stack_index_+= 2;
diff --git a/runtime/oat.h b/runtime/oat.h
index 3e28606..7faf33b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '5', '4', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '5', '5', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";