From 2a83e8f06031948741ae3dda3633433ddd669693 Mon Sep 17 00:00:00 2001
From: buzbee
Date: Fri, 13 Jul 2012 16:42:30 -0700
Subject: Quick compiler, fix wide bug

In Dalvik, 64-bit data items are represented as a pair of 32-bit
registers.  The Art compiler maintained this notation, while llvm
expects properly typed data.  During the conversion to bitcode, we
must drop the high word of each pair while correctly typing the low
word.  This CL fixes several bugs related to this.  "Placeholder"
llvm Values are created only for the low words of pairs, and we now
skip Phi node generation for high words.  Doing this required a bit
of tightening up of the size & type inference code (which previously
was able to get away with ignoring high words).

Also, I've moved shift operations into intrinsics because Dalvik and
llvm have different ideas about what a shift means.

Bitcode generation is only supported for the Arm target at the
moment.  With this CL, all target tests pass and the phone boots.

Some caveats:
 o Performance data is not yet meaningful, for either compile or run
   times.
 o When configured for Quick, we run single-threaded.
 o In a small percentage of methods, we generate invalid llvm bitcode
   (missing exception edges).  As checked in, llvm function
   verification is turned off to avoid missing-edge complaints (and to
   enable testing of the Quick backend).

Change-Id: I66932ffb44d299fcaf0a112e0d1c217c49341ccf
---
 src/compiler/Dataflow.cc              |  27 +++++++++++---
 src/compiler/Frontend.cc              |   6 +-
 src/compiler/Ralloc.cc                |  92 +++++++++++-----
 src/compiler/SSATransformation.cc     |   2 +-
 src/compiler/codegen/GenCommon.cc     |   3 +
 src/compiler/codegen/MethodBitcode.cc | 196 ++++++++++++++++++----------
 src/compiler/codegen/RallocUtil.cc    |   2 -
 src/greenland/intrinsic_func_list.def |  46 +++++++-
 8 files changed, 231 insertions(+), 143 deletions(-)
(limited to 'src')
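Background for the fix described above: a Dalvik long or double occupies a pair of adjacent 32-bit virtual registers (vN holds the low word, vN+1 the high word), while llvm wants one properly typed 64-bit value. The following standalone C++ sketch illustrates that pairing convention; the struct and helper names are illustrative assumptions, not code from this patch.

#include <cstdint>

// Hypothetical illustration of the Dalvik pair convention: only the low
// word carries a typed llvm Value; the high word becomes a placeholder.
struct VRegPair {
  int32_t low;   // vN   - low word of the 64-bit value
  int32_t high;  // vN+1 - high word, never independently referenced
};

inline int64_t ToWide(VRegPair p) {
  // Combine in unsigned arithmetic so the left shift is well defined.
  return static_cast<int64_t>(
      (static_cast<uint64_t>(static_cast<uint32_t>(p.high)) << 32) |
      static_cast<uint32_t>(p.low));
}

inline VRegPair FromWide(int64_t v) {
  return { static_cast<int32_t>(v), static_cast<int32_t>(v >> 32) };
}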
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index e7998d106f..33ef0addad 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -69,7 +69,7 @@ const int oatDataFlowAttributes[kMirOpLast] = {
   DF_DA | DF_REF_A,
 
   // 0D MOVE_EXCEPTION vAA
-  DF_DA | DF_CORE_A,
+  DF_DA | DF_REF_A,
 
   // 0E RETURN_VOID
   DF_NOP,
@@ -180,41 +180,40 @@ const int oatDataFlowAttributes[kMirOpLast] = {
   DF_DA | DF_UB | DF_B_WIDE | DF_UC | DF_C_WIDE | DF_CORE_A | DF_CORE_B |
   DF_CORE_C,
 
   // 32 IF_EQ vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 33 IF_NE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 34 IF_LT vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 35 IF_GE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 36 IF_GT vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
+  DF_UA | DF_UB,
 
   // 37 IF_LE vA, vB, +CCCC
-  DF_UA | DF_UB | DF_CORE_A | DF_CORE_B,
-
+  DF_UA | DF_UB,
 
   // 38 IF_EQZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 39 IF_NEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3A IF_LTZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3B IF_GEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3C IF_GTZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3D IF_LEZ vAA, +BBBB
-  DF_UA | DF_CORE_A,
+  DF_UA,
 
   // 3E UNUSED_3E
   DF_NOP,
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index bcaba10ea8..3924f45ec8 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -763,7 +763,9 @@ CompiledMethod* oatCompileMethod(Compiler& compiler,
   cUnit->numRegs = code_item->registers_size_ - cUnit->numIns;
   cUnit->numOuts = code_item->outs_size_;
 #if defined(ART_USE_QUICK_COMPILER)
+#if defined(TARGET_ARM)
   cUnit->genBitcode = true;
+#endif
 #endif
   /* Adjust this value accordingly once inlining is performed */
   cUnit->numDalvikRegisters = code_item->registers_size_;
@@ -781,8 +783,8 @@ CompiledMethod* oatCompileMethod(Compiler& compiler,
   }
 #if defined(ART_USE_QUICK_COMPILER)
   if (cUnit->genBitcode) {
-    cUnit->printMe = true;
-    cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
+    //cUnit->printMe = true;
+    //cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
     // Disable non-safe optimizations for now
    cUnit->disableOpt |= ~(1 << kSafeOptimizations);
   }
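The disableOpt change above relies on a bitmask idiom: OR-ing in the complement of the kSafeOptimizations bit disables every optimization pass except the safe ones. A small self-contained sketch of the idiom follows; only kSafeOptimizations mirrors the real code, the other flag name is a placeholder.

#include <cassert>
#include <cstdint>

enum OptControl { kSafeOptimizations = 0, kSomeOtherPass = 1 };

int main() {
  uint32_t disableOpt = 0;
  disableOpt |= ~(1u << kSafeOptimizations);  // same idiom as Frontend.cc
  assert((disableOpt & (1u << kSafeOptimizations)) == 0);  // still enabled
  assert((disableOpt & (1u << kSomeOtherPass)) != 0);      // disabled
  return 0;
}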
diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc
index ea4d6c109d..500b1b2d1b 100644
--- a/src/compiler/Ralloc.cc
+++ b/src/compiler/Ralloc.cc
@@ -23,9 +23,6 @@ namespace art {
 
 bool setFp(CompilationUnit* cUnit, int index, bool isFP) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isFP && !cUnit->regLocation[index].fp) {
     cUnit->regLocation[index].fp = true;
     cUnit->regLocation[index].defined = true;
@@ -36,9 +33,6 @@ bool setFp(CompilationUnit* cUnit, int index, bool isFP) {
 
 bool setCore(CompilationUnit* cUnit, int index, bool isCore) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isCore && !cUnit->regLocation[index].defined) {
     cUnit->regLocation[index].core = true;
     cUnit->regLocation[index].defined = true;
@@ -49,9 +43,6 @@ bool setCore(CompilationUnit* cUnit, int index, bool isCore) {
 
 bool setRef(CompilationUnit* cUnit, int index, bool isRef) {
   bool change = false;
-  if (cUnit->regLocation[index].highWord) {
-    return change;
-  }
   if (isRef && !cUnit->regLocation[index].defined) {
     cUnit->regLocation[index].ref = true;
     cUnit->regLocation[index].defined = true;
@@ -60,6 +51,24 @@ bool setRef(CompilationUnit* cUnit, int index, bool isRef) {
   return change;
 }
 
+bool setWide(CompilationUnit* cUnit, int index, bool isWide) {
+  bool change = false;
+  if (isWide && !cUnit->regLocation[index].wide) {
+    cUnit->regLocation[index].wide = true;
+    change = true;
+  }
+  return change;
+}
+
+bool setHigh(CompilationUnit* cUnit, int index, bool isHigh) {
+  bool change = false;
+  if (isHigh && !cUnit->regLocation[index].highWord) {
+    cUnit->regLocation[index].highWord = true;
+    change = true;
+  }
+  return change;
+}
+
 bool remapNames(CompilationUnit* cUnit, BasicBlock* bb)
 {
   if (bb->blockType != kDalvikByteCode &&
       bb->blockType != kEntryBlock &&
@@ -123,6 +132,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
       }
       if (attrs & DF_A_WIDE) {
         cUnit->regLocation[ssaRep->defs[0]].wide = true;
+        cUnit->regLocation[ssaRep->defs[1]].wide = true;
         cUnit->regLocation[ssaRep->defs[1]].highWord = true;
         DCHECK_EQ(SRegToVReg(cUnit, ssaRep->defs[0])+1,
                   SRegToVReg(cUnit, ssaRep->defs[1]));
@@ -140,6 +150,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
       }
       if (attrs & DF_A_WIDE) {
         cUnit->regLocation[ssaRep->uses[next]].wide = true;
+        cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
         cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
         DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
                   SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -157,6 +168,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
       }
       if (attrs & DF_B_WIDE) {
         cUnit->regLocation[ssaRep->uses[next]].wide = true;
+        cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
         cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
         DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
                   SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -174,6 +186,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
       }
       if (attrs & DF_C_WIDE) {
         cUnit->regLocation[ssaRep->uses[next]].wide = true;
+        cUnit->regLocation[ssaRep->uses[next + 1]].wide = true;
         cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true;
         DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1,
                   SRegToVReg(cUnit, ssaRep->uses[next + 1]));
@@ -192,6 +205,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
           changed |= setCore(cUnit, ssaRep->uses[0], true);
           changed |= setCore(cUnit, ssaRep->uses[1], true);
           cUnit->regLocation[ssaRep->uses[0]].wide = true;
+          cUnit->regLocation[ssaRep->uses[1]].wide = true;
          cUnit->regLocation[ssaRep->uses[1]].highWord = true;
           break;
         case 'F':
@@ -201,6 +215,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
           changed |= setFp(cUnit, ssaRep->uses[0], true);
           changed |= setFp(cUnit, ssaRep->uses[1], true);
           cUnit->regLocation[ssaRep->uses[0]].wide = true;
+          cUnit->regLocation[ssaRep->uses[1]].wide = true;
           cUnit->regLocation[ssaRep->uses[1]].highWord = true;
           break;
         case 'L':
@@ -254,6 +269,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
               ssaRep->fpUse[i] = true;
               ssaRep->fpUse[i+1] = true;
               cUnit->regLocation[ssaRep->uses[i]].wide = true;
+              cUnit->regLocation[ssaRep->uses[i+1]].wide = true;
               cUnit->regLocation[ssaRep->uses[i+1]].highWord = true;
               DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
                         SRegToVReg(cUnit, ssaRep->uses[i+1]));
@@ -261,6 +277,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
               break;
             case 'J':
               cUnit->regLocation[ssaRep->uses[i]].wide = true;
+              cUnit->regLocation[ssaRep->uses[i+1]].wide = true;
               cUnit->regLocation[ssaRep->uses[i+1]].highWord = true;
               DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1,
                         SRegToVReg(cUnit, ssaRep->uses[i+1]));
@@ -292,23 +309,27 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
       }
       // Special-case handling for moves & Phi
       if (attrs & (DF_IS_MOVE | DF_NULL_TRANSFER_N)) {
-        // If any of our inputs or outputs is defined, set all
-        bool definedFP = false;
-        bool definedCore = false;
-        bool definedRef = false;
-        definedFP |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                      cUnit->regLocation[ssaRep->defs[0]].fp);
-        definedCore |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                        cUnit->regLocation[ssaRep->defs[0]].core);
-        definedRef |= (cUnit->regLocation[ssaRep->defs[0]].defined &&
-                       cUnit->regLocation[ssaRep->defs[0]].ref);
-        for (int i = 0; i < ssaRep->numUses; i++) {
-          definedFP |= (cUnit->regLocation[ssaRep->uses[i]].defined &&
-                        cUnit->regLocation[ssaRep->uses[i]].fp);
-          definedCore |= (cUnit->regLocation[ssaRep->uses[i]].defined
-                          && cUnit->regLocation[ssaRep->uses[i]].core);
-          definedRef |= (cUnit->regLocation[ssaRep->uses[i]].defined
-                         && cUnit->regLocation[ssaRep->uses[i]].ref);
+        /*
+         * If any of our inputs or outputs is defined, set all.
+         * Some ugliness related to Phi nodes and wide values.
+         * The Phi set will include all low words or all high
+         * words, so we have to treat them specially.
+         */
+        bool isPhi = (static_cast<int>(mir->dalvikInsn.opcode) ==
+                      kMirOpPhi);
+        RegLocation rlTemp = cUnit->regLocation[ssaRep->defs[0]];
+        bool definedFP = rlTemp.defined && rlTemp.fp;
+        bool definedCore = rlTemp.defined && rlTemp.core;
+        bool definedRef = rlTemp.defined && rlTemp.ref;
+        bool isWide = rlTemp.wide || ((attrs & DF_A_WIDE) != 0);
+        bool isHigh = isPhi && rlTemp.wide && rlTemp.highWord;
+        for (int i = 0; i < ssaRep->numUses; i++) {
+          rlTemp = cUnit->regLocation[ssaRep->uses[i]];
+          definedFP |= rlTemp.defined && rlTemp.fp;
+          definedCore |= rlTemp.defined && rlTemp.core;
+          definedRef |= rlTemp.defined && rlTemp.ref;
+          isWide |= rlTemp.wide;
+          isHigh |= isPhi && rlTemp.wide && rlTemp.highWord;
         }
         /*
          * TODO: cleaner fix
@@ -334,10 +355,23 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb)
         changed |= setFp(cUnit, ssaRep->defs[0], definedFP);
         changed |= setCore(cUnit, ssaRep->defs[0], definedCore);
         changed |= setRef(cUnit, ssaRep->defs[0], definedRef);
+        changed |= setWide(cUnit, ssaRep->defs[0], isWide);
+        changed |= setHigh(cUnit, ssaRep->defs[0], isHigh);
+        if (attrs & DF_A_WIDE) {
+          changed |= setWide(cUnit, ssaRep->defs[1], true);
+          changed |= setHigh(cUnit, ssaRep->defs[1], true);
+        }
         for (int i = 0; i < ssaRep->numUses; i++) {
-          changed |= setFp(cUnit, ssaRep->uses[i], definedFP);
-          changed |= setCore(cUnit, ssaRep->uses[i], definedCore);
-          changed |= setRef(cUnit, ssaRep->uses[i], definedRef);
+          changed |= setFp(cUnit, ssaRep->uses[i], definedFP);
+          changed |= setCore(cUnit, ssaRep->uses[i], definedCore);
+          changed |= setRef(cUnit, ssaRep->uses[i], definedRef);
+          changed |= setWide(cUnit, ssaRep->uses[i], isWide);
+          changed |= setHigh(cUnit, ssaRep->uses[i], isHigh);
+        }
+        if (attrs & DF_A_WIDE) {
+          DCHECK_EQ(ssaRep->numUses, 2);
+          changed |= setWide(cUnit, ssaRep->uses[1], true);
+          changed |= setHigh(cUnit, ssaRep->uses[1], true);
         }
       }
     }
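The new setWide()/setHigh() helpers follow the same monotonic pattern as setFp()/setCore()/setRef(): a flag only ever flips from false to true, and the return value reports whether anything changed. Below is a minimal sketch of the fixed-point driver such "changed" flags support; the driver's shape is an assumption for illustration, since the actual iterative dataflow walker is outside this patch.

#include <utility>
#include <vector>

// Assumed shape: RegLoc mirrors just the wide bit of RegLocation.
struct RegLoc { bool wide = false; };

// Monotonic setter in the style of setWide() above: returns true only
// when an attribute actually flips, so iteration is guaranteed to stop.
bool SetWide(std::vector<RegLoc>& regs, int index) {
  if (!regs[index].wide) {
    regs[index].wide = true;
    return true;
  }
  return false;
}

void InferUntilStable(std::vector<RegLoc>& regs,
                      const std::vector<std::pair<int, int>>& widePairs) {
  bool changed = true;
  while (changed) {  // re-run until no attribute moves: the fixed point
    changed = false;
    for (const auto& pair : widePairs) {
      changed |= SetWide(regs, pair.first);   // low word
      changed |= SetWide(regs, pair.second);  // high word
    }
  }
}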
diff --git a/src/compiler/SSATransformation.cc b/src/compiler/SSATransformation.cc
index 7d6a733277..10957b2517 100644
--- a/src/compiler/SSATransformation.cc
+++ b/src/compiler/SSATransformation.cc
@@ -747,7 +747,7 @@ void insertPhiNodes(CompilationUnit* cUnit)
                            kPostOrderDFSTraversal, true /* isIterative */);
 
   /* Iterate through each Dalvik register */
-  for (dalvikReg = 0; dalvikReg < cUnit->numDalvikRegisters; dalvikReg++) {
+  for (dalvikReg = cUnit->numDalvikRegisters - 1; dalvikReg >= 0; dalvikReg--) {
     bool change;
     ArenaBitVectorIterator iterator;
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 9082a49ad3..b4b0f6a9e0 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -2062,16 +2062,19 @@ bool genArithOpIntLit(CompilationUnit* cUnit, Instruction::Code opcode,
       op = kOpXor;
       break;
     case Instruction::SHL_INT_LIT8:
+    case Instruction::SHL_INT:
      lit &= 31;
       shiftOp = true;
       op = kOpLsl;
       break;
     case Instruction::SHR_INT_LIT8:
+    case Instruction::SHR_INT:
       lit &= 31;
       shiftOp = true;
       op = kOpAsr;
       break;
     case Instruction::USHR_INT_LIT8:
+    case Instruction::USHR_INT:
       lit &= 31;
       shiftOp = true;
       op = kOpLsr;
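Worked example for the new SHL_INT/SHR_INT/USHR_INT cases above: Dalvik honors only the low five bits of a 32-bit shift count, so the "lit &= 31" masking makes a literal shift by 35 behave as a shift by 3. A standalone illustration (not patch code):

#include <cassert>
#include <cstdint>

int32_t ShlIntLit(int32_t value, int32_t lit) {
  lit &= 31;  // the masking genArithOpIntLit applies for SHL_INT and _LIT8
  return value << lit;
}

int main() {
  assert(ShlIntLit(1, 35) == 8);   // 35 & 31 == 3, so 1 << 3
  assert(ShlIntLit(3, 33) == 6);   // 33 & 31 == 1, so 3 << 1
  return 0;
}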
diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc
index 83ebf9bfce..b7c4331d7d 100644
--- a/src/compiler/codegen/MethodBitcode.cc
+++ b/src/compiler/codegen/MethodBitcode.cc
@@ -464,24 +464,27 @@ void convertFPArithOp(CompilationUnit* cUnit, OpKind op, RegLocation rlDest,
   defineValue(cUnit, res, rlDest.origSReg);
 }
 
-void convertShift(CompilationUnit* cUnit, OpKind op, RegLocation rlDest,
-                  RegLocation rlSrc1, RegLocation rlSrc2)
+void convertShift(CompilationUnit* cUnit,
+                  greenland::IntrinsicHelper::IntrinsicId id,
+                  RegLocation rlDest, RegLocation rlSrc1, RegLocation rlSrc2)
 {
-  llvm::Value* src1 = getLLVMValue(cUnit, rlSrc1.origSReg);
-  llvm::Value* src2 = getLLVMValue(cUnit, rlSrc2.origSReg);
-  /*
-   * TODO: Figure out how best to handle constraining the shift
-   * amount to 31 for int and 63 for long.  We take care of this
-   * inline for int and in the out-of-line handler for longs, so
-   * it's a bit of a waste to generate llvm bitcode for this.
-   * Yet more intrinsics?
-   */
-  UNIMPLEMENTED(WARNING) << "llvm shift mismatch";
-  if (rlDest.wide) {
-    // llvm thinks the shift could should be in 64 bits.
-    src2 = cUnit->irb->CreateZExt(src2, cUnit->irb->getInt64Ty());
-  }
-  llvm::Value* res = genArithOp(cUnit, op, rlDest.wide, src1, src2);
+  llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id);
+  llvm::SmallVector<llvm::Value*, 2> args;
+  args.push_back(getLLVMValue(cUnit, rlSrc1.origSReg));
+  args.push_back(getLLVMValue(cUnit, rlSrc2.origSReg));
+  llvm::Value* res = cUnit->irb->CreateCall(intr, args);
+  defineValue(cUnit, res, rlDest.origSReg);
+}
+
+void convertShiftLit(CompilationUnit* cUnit,
+                     greenland::IntrinsicHelper::IntrinsicId id,
+                     RegLocation rlDest, RegLocation rlSrc, int shiftAmount)
+{
+  llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id);
+  llvm::SmallVector<llvm::Value*, 2> args;
+  args.push_back(getLLVMValue(cUnit, rlSrc.origSReg));
+  args.push_back(cUnit->irb->getInt32(shiftAmount));
+  llvm::Value* res = cUnit->irb->CreateCall(intr, args);
   defineValue(cUnit, res, rlDest.origSReg);
 }
 
@@ -1099,27 +1102,33 @@ bool convertMIRNode(CompilationUnit* cUnit, MIR* mir, BasicBlock* bb,
       break;
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      convertShift(cUnit, kOpLsl, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHLLong,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHL_INT:
     case Instruction::SHL_INT_2ADDR:
-      convertShift(cUnit, kOpLsl, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHLInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHRLong,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::SHR_INT:
     case Instruction::SHR_INT_2ADDR:
-      convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::SHRInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::USHRLong,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
     case Instruction::USHR_INT:
     case Instruction::USHR_INT_2ADDR:
-      convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]);
+      convertShift(cUnit, greenland::IntrinsicHelper::USHRInt,
+                   rlDest, rlSrc[0], rlSrc[1]);
       break;
 
     case Instruction::ADD_INT_LIT16:
@@ -1155,13 +1164,16 @@ bool convertMIRNode(CompilationUnit* cUnit, MIR* mir, BasicBlock* bb,
       convertArithOpLit(cUnit, kOpXor, rlDest, rlSrc[0], vC);
       break;
     case Instruction::SHL_INT_LIT8:
-      convertArithOpLit(cUnit, kOpLsl, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::SHLInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
     case Instruction::SHR_INT_LIT8:
-      convertArithOpLit(cUnit, kOpAsr, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::SHRInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
     case Instruction::USHR_INT_LIT8:
-      convertArithOpLit(cUnit, kOpLsr, rlDest, rlSrc[0], vC & 0x1f);
+      convertShiftLit(cUnit, greenland::IntrinsicHelper::USHRInt,
+                      rlDest, rlSrc[0], vC & 0x1f);
       break;
 
     case Instruction::ADD_FLOAT:
@@ -1589,19 +1601,30 @@ void convertExtendedMIR(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir,
   switch ((ExtendedMIROpcode)mir->dalvikInsn.opcode) {
     case kMirOpPhi: {
-      int* incoming = (int*)mir->dalvikInsn.vB;
       RegLocation rlDest = cUnit->regLocation[mir->ssaRep->defs[0]];
+      /*
+       * The Art compiler's Phi nodes only handle 32-bit operands,
+       * representing wide values using a matched set of Phi nodes
+       * for the lower and upper halves.  In the llvm world, we only
+       * want a single Phi for wides.  Here we will simply discard
+       * the Phi node representing the high word.
+       */
+      if (rlDest.highWord) {
+        return;  // No Phi node - handled via low word
+      }
+      int* incoming = (int*)mir->dalvikInsn.vB;
       llvm::Type* phiType =
           llvmTypeFromLocRec(cUnit, rlDest);
       llvm::PHINode* phi = cUnit->irb->CreatePHI(phiType, mir->ssaRep->numUses);
       for (int i = 0; i < mir->ssaRep->numUses; i++) {
         RegLocation loc;
-        if (rlDest.wide) {
-          loc = oatGetSrcWide(cUnit, mir, i);
-          i++;
-        } else {
-          loc = oatGetSrc(cUnit, mir, i);
-        }
+        // Don't check width here.
+        loc = oatGetRawSrc(cUnit, mir, i);
+        DCHECK_EQ(rlDest.wide, loc.wide);
+        DCHECK_EQ(rlDest.wide & rlDest.highWord, loc.wide & loc.highWord);
+        DCHECK_EQ(rlDest.fp, loc.fp);
+        DCHECK_EQ(rlDest.core, loc.core);
+        DCHECK_EQ(rlDest.ref, loc.ref);
         phi->addIncoming(getLLVMValue(cUnit, loc.origSReg),
                          getLLVMBlock(cUnit, incoming[i]));
       }
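To make the discarded high-word Phi concrete: a long flowing through a join point carries two matched 32-bit MIR Phis, but only the low word's Phi becomes a (64-bit-typed) bitcode Phi. A hypothetical sketch of that counting rule follows; the real RegLocation layout lives in the compiler headers, not here.

#include <cassert>
#include <vector>

struct PhiNode { int vreg; bool wide; bool highWord; };

int CountMaterializedPhis(const std::vector<PhiNode>& phis) {
  int materialized = 0;
  for (const PhiNode& phi : phis) {
    if (phi.wide && phi.highWord) continue;  // dropped: folded into low word
    materialized++;  // narrow Phi, or the single i64 Phi for a wide pair
  }
  return materialized;
}

int main() {
  // A long in (v4, v5): two MIR Phis, one bitcode Phi.
  std::vector<PhiNode> phis = {{4, true, false}, {5, true, true}};
  assert(CountMaterializedPhis(phis) == 1);
  return 0;
}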
@@ -1895,30 +1918,18 @@ void oatMethodMIR2Bitcode(CompilationUnit* cUnit)
   arg_iter++;  /* Skip path method */
   for (int i = 0; i < cUnit->numSSARegs; i++) {
     llvm::Value* val;
-    if ((i < cUnit->numRegs) || (i >= (cUnit->numRegs + cUnit->numIns))) {
-      // Handle SSA defs, skipping Method* and compiler temps
-      if (SRegToVReg(cUnit, i) < 0) {
-        val = NULL;
-      } else {
-        llvm::Constant* immValue = cUnit->irb->GetJInt(0);
-        val = emitConst(cUnit, immValue, cUnit->regLocation[i]);
-        val->setName(llvmSSAName(cUnit, i));
-      }
+    if ((SRegToVReg(cUnit, i) < 0) || cUnit->regLocation[i].highWord) {
+      oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
+    } else if ((i < cUnit->numRegs) ||
+               (i >= (cUnit->numRegs + cUnit->numIns))) {
+      llvm::Constant* immValue = cUnit->irb->GetJInt(0);
+      val = emitConst(cUnit, immValue, cUnit->regLocation[i]);
+      val->setName(llvmSSAName(cUnit, i));
       oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)val);
-      if (cUnit->regLocation[i].wide) {
-        // Skip high half of wide values
-        oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
-        i++;
-      }
     } else {
       // Recover previously-created argument values
       llvm::Value* argVal = arg_iter++;
       oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)argVal);
-      if (cUnit->regLocation[i].wide) {
-        // Skip high half of wide values.
-        oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0);
-        i++;
-      }
     }
   }
@@ -1959,7 +1970,7 @@ void oatMethodMIR2Bitcode(CompilationUnit* cUnit)
   cUnit->irb->SetInsertPoint(cUnit->entryBB);
   cUnit->irb->CreateBr(cUnit->entryTargetBB);
 
-  llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction);
+  //llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction);
 
   if (cUnit->enableDebug & (1 << kDebugDumpBitcodeFile)) {
     // Write bitcode to file
@@ -2258,43 +2269,23 @@ void cvtBinOp(CompilationUnit* cUnit, OpKind op, llvm::Instruction* inst)
   }
 }
 
-void cvtShiftOp(CompilationUnit* cUnit, OpKind op, llvm::Instruction* inst)
+void cvtShiftOp(CompilationUnit* cUnit, Instruction::Code opcode,
+                llvm::CallInst* callInst)
 {
-  if (inst->getType() == cUnit->irb->getInt64Ty()) {
-    /*
-     * llvm wants the shift amount to be 64 bits, whereas we've constained
-     * it to be in 6 bits.  It should always be held as an unnamed temp
-     * at this point that was the result of a previous UExt.  We'll backtrack
-     * to find the pre-extension value and use that.
-     * TODO: probably better to handle this in cvtIntExt() or just intrinsify
-     */
-    RegLocation rlDest = getLoc(cUnit, inst);
-    RegLocation rlSrc = getLoc(cUnit, inst->getOperand(0));
-    RegLocation rlShift = getLoc(cUnit, inst->getOperand(1));
-    DCHECK(rlShift.wide);
-    DCHECK_EQ(rlShift.sRegLow, INVALID_SREG);
-    // Now, free the temp registers - we won't need them.
-    // TODO: kill the dead extend ops
-    oatFreeTemp(cUnit, rlShift.lowReg);
-    oatFreeTemp(cUnit, rlShift.highReg);
-    // Get the pre-extend operand
-    llvm::Instruction* extInst =
-        llvm::dyn_cast<llvm::Instruction>(inst->getOperand(1));
-    DCHECK(extInst != NULL);
-    rlShift = getLoc(cUnit, extInst->getOperand(0));
-    DCHECK(!rlShift.wide);
-    Instruction::Code opcode;
-    if (op == kOpLsl)
-      opcode = Instruction::SHL_LONG;
-    else if (op == kOpAsr)
-      opcode = Instruction::SHR_LONG;
-    else {
-      DCHECK_EQ(op, kOpLsr);
-      opcode = Instruction::USHR_LONG;
-    }
-    genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift);
+  DCHECK_EQ(callInst->getNumArgOperands(), 2U);
+  RegLocation rlDest = getLoc(cUnit, callInst);
+  RegLocation rlSrc = getLoc(cUnit, callInst->getArgOperand(0));
+  llvm::Value* rhs = callInst->getArgOperand(1);
+  if (llvm::ConstantInt* src2 = llvm::dyn_cast<llvm::ConstantInt>(rhs)) {
+    DCHECK(!rlDest.wide);
+    genArithOpIntLit(cUnit, opcode, rlDest, rlSrc, src2->getSExtValue());
   } else {
-    cvtBinOp(cUnit, op, inst);
+    RegLocation rlShift = getLoc(cUnit, rhs);
+    if (callInst->getType() == cUnit->irb->getInt64Ty()) {
+      genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift);
+    } else {
+      genArithOpInt(cUnit, opcode, rlDest, rlSrc, rlShift);
+    }
   }
 }
 
@@ -3098,9 +3089,25 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb)
         cvtLongCompare(cUnit, callInst);
         break;
 
-      case greenland::IntrinsicHelper::UnknownId:
-        cvtCall(cUnit, callInst, callee);
+      case greenland::IntrinsicHelper::SHLLong:
+        cvtShiftOp(cUnit, Instruction::SHL_LONG, callInst);
+        break;
+      case greenland::IntrinsicHelper::SHRLong:
+        cvtShiftOp(cUnit, Instruction::SHR_LONG, callInst);
+        break;
+      case greenland::IntrinsicHelper::USHRLong:
+        cvtShiftOp(cUnit, Instruction::USHR_LONG, callInst);
         break;
+      case greenland::IntrinsicHelper::SHLInt:
+        cvtShiftOp(cUnit, Instruction::SHL_INT, callInst);
+        break;
+      case greenland::IntrinsicHelper::SHRInt:
+        cvtShiftOp(cUnit, Instruction::SHR_INT, callInst);
+        break;
+      case greenland::IntrinsicHelper::USHRInt:
+        cvtShiftOp(cUnit, Instruction::USHR_INT, callInst);
+        break;
+
       default:
        LOG(FATAL) << "Unexpected intrinsic " << (int)id << ", "
                   << cUnit->intrinsic_helper->GetName(id);
@@ -3117,9 +3124,6 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb)
       case llvm::Instruction::And: cvtBinOp(cUnit, kOpAnd, inst); break;
       case llvm::Instruction::Or: cvtBinOp(cUnit, kOpOr, inst); break;
       case llvm::Instruction::Xor: cvtBinOp(cUnit, kOpXor, inst); break;
-      case llvm::Instruction::Shl: cvtShiftOp(cUnit, kOpLsl, inst); break;
-      case llvm::Instruction::LShr: cvtShiftOp(cUnit, kOpLsr, inst); break;
-      case llvm::Instruction::AShr: cvtShiftOp(cUnit, kOpAsr, inst); break;
       case llvm::Instruction::PHI: cvtPhi(cUnit, inst); break;
       case llvm::Instruction::Ret: cvtRet(cUnit, inst); break;
       case llvm::Instruction::FAdd: cvtBinFPOp(cUnit, kOpAdd, inst); break;
@@ -3143,6 +3147,9 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb)
       case llvm::Instruction::Unreachable:
        break;  // FIXME: can we really ignore these?
 
+      case llvm::Instruction::Shl:
+      case llvm::Instruction::LShr:
+      case llvm::Instruction::AShr:
       case llvm::Instruction::Invoke:
       case llvm::Instruction::FPToUI:
       case llvm::Instruction::UIToFP:
@@ -3174,7 +3181,8 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb)
         LOG(FATAL) << "Unexpected llvm opcode: " << opcode; break;
 
       default:
-        LOG(FATAL) << "Unknown llvm opcode: " << opcode; break;
+        LOG(FATAL) << "Unknown llvm opcode: " << inst->getOpcodeName();
+        break;
     }
   }
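The oatMethodMIR2Bitcode() loop above establishes the llvmValues table convention: one slot per SSA register, with a null placeholder (the 0 passed to oatInsertGrowableList) for negative vregs (Method* and compiler temps) and for high words of wide pairs. A self-contained sketch of that convention, using assumed stand-in types rather than the real growable list:

#include <cassert>
#include <cstdint>
#include <vector>

struct SsaReg { int vreg; bool highWord; };

std::vector<intptr_t> BuildValueTable(const std::vector<SsaReg>& regs) {
  std::vector<intptr_t> values;
  for (size_t i = 0; i < regs.size(); ++i) {
    if (regs[i].vreg < 0 || regs[i].highWord) {
      values.push_back(0);  // placeholder slot, must never be read
    } else {
      values.push_back(static_cast<intptr_t>(i + 1));  // stand-in for an llvm::Value*
    }
  }
  return values;
}

int main() {
  // A long in (slot 0 = low, slot 1 = high); slot 2 is a compiler temp.
  std::vector<SsaReg> regs = {{10, false}, {11, true}, {-1, false}};
  std::vector<intptr_t> table = BuildValueTable(regs);
  assert(table[0] != 0 && table[1] == 0 && table[2] == 0);
  return 0;
}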
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index 2088cdc360..9d1878a02b 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -998,14 +998,12 @@ extern RegLocation oatGetRawSrc(CompilationUnit* cUnit, MIR* mir, int num)
 {
   DCHECK(num < mir->ssaRep->numUses);
   RegLocation res = cUnit->regLocation[mir->ssaRep->uses[num]];
-  DCHECK(!res.wide || num < (mir->ssaRep->numUses - 1));
   return res;
 }
 
 extern RegLocation oatGetRawDest(CompilationUnit* cUnit, MIR* mir)
 {
   DCHECK_GT(mir->ssaRep->numDefs, 0);
   RegLocation res = cUnit->regLocation[mir->ssaRep->defs[0]];
-  DCHECK(!res.wide || mir->ssaRep->numDefs == 2);
   return res;
 }
 
 extern RegLocation oatGetDest(CompilationUnit* cUnit, MIR* mir)
diff --git a/src/greenland/intrinsic_func_list.def b/src/greenland/intrinsic_func_list.def
index 608e760c75..0ebebb25a3 100644
--- a/src/greenland/intrinsic_func_list.def
+++ b/src/greenland/intrinsic_func_list.def
@@ -1228,7 +1228,7 @@ _EVAL_DEF_INTRINSICS_FUNC(CopyObj,
                           kJavaObjectTy,
                           _EXPAND_ARG1(kJavaObjectTy))
 
-// int copy_long(long)
+// long copy_long(long)
 _EVAL_DEF_INTRINSICS_FUNC(CopyLong,
                           dex_lang_copy_long,
                           kAttrReadOnly | kAttrNoThrow,
@@ -1250,6 +1250,50 @@ _EVAL_DEF_INTRINSICS_FUNC(CopyDouble,
                           _EXPAND_ARG1(kDoubleTy))
 
 //----------------------------------------------------------------------------
+// Shift intrinsics.  Shift semantics for Dalvik are a bit different from
+// the llvm shift operators.  For 32-bit shifts, the shift count is
+// constrained to the range 0..31, while for 64-bit shifts we limit it to
+// 0..63.  Further, the shift count for long shifts in Dalvik is 32 bits,
+// while llvm requires a 64-bit shift count.  For GBC, we represent shifts
+// as intrinsics to allow the most efficient target-dependent lowering.
+//----------------------------------------------------------------------------
+// long shl_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHLLong,
+                          dex_lang_shl_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// long shr_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHRLong,
+                          dex_lang_shr_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// long ushr_long(long,int)
+_EVAL_DEF_INTRINSICS_FUNC(USHRLong,
+                          dex_lang_ushl_long,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt64Ty,
+                          _EXPAND_ARG2(kInt64Ty,kInt32Ty))
+// int shl_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHLInt,
+                          dex_lang_shl_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+// int shr_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(SHRInt,
+                          dex_lang_shr_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+// int ushr_int(int,int)
+_EVAL_DEF_INTRINSICS_FUNC(USHRInt,
+                          dex_lang_ushl_int,
+                          kAttrReadOnly | kAttrNoThrow,
+                          kInt32Ty,
+                          _EXPAND_ARG2(kInt32Ty,kInt32Ty))
+//----------------------------------------------------------------------------
 // Conversion instrinsics.  Note: these should eventually be removed.  We
 // can express these directly in bitcode, but by using intrinsics the
 // Quick compiler can be more efficient.  Some extra optimization infrastructure
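Reference semantics for the shift intrinsics defined in intrinsic_func_list.def above, written as plain C++: Dalvik masks the shift count (0..31 for int, 0..63 for long), whereas llvm's shl/lshr/ashr leave out-of-range counts undefined, which is why a target can lower these intrinsics with an explicit mask. This is an assumed reference model for illustration, not the ART lowering itself.

#include <cassert>
#include <cstdint>

int64_t ShlLong(int64_t value, int32_t count) {
  return value << (count & 63);  // 64-bit shifts honor only the low 6 bits
}

int32_t UshrInt(int32_t value, int32_t count) {
  // Logical shift right: operate on the unsigned representation.
  return static_cast<int32_t>(static_cast<uint32_t>(value) >> (count & 31));
}

int main() {
  assert(ShlLong(1, 64) == 1);     // 64 & 63 == 0: no shift, not undefined
  assert(UshrInt(-1, 28) == 0xF);  // zero-fill, unlike an arithmetic shift
  return 0;
}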
--
cgit v1.2.3-59-g8ed1b