diff options
| -rw-r--r-- | src/compiler/Dataflow.cc | 27 | ||||
| -rw-r--r-- | src/compiler/Frontend.cc | 6 | ||||
| -rw-r--r-- | src/compiler/Ralloc.cc | 92 | ||||
| -rw-r--r-- | src/compiler/SSATransformation.cc | 2 | ||||
| -rw-r--r-- | src/compiler/codegen/GenCommon.cc | 3 | ||||
| -rw-r--r-- | src/compiler/codegen/MethodBitcode.cc | 196 | ||||
| -rw-r--r-- | src/compiler/codegen/RallocUtil.cc | 2 | ||||
| -rw-r--r-- | src/greenland/intrinsic_func_list.def | 46 |
8 files changed, 231 insertions, 143 deletions
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc index e7998d106f..33ef0addad 100644 --- a/src/compiler/Dataflow.cc +++ b/src/compiler/Dataflow.cc @@ -69,7 +69,7 @@ const int oatDataFlowAttributes[kMirOpLast] = { DF_DA | DF_REF_A, // 0D MOVE_EXCEPTION vAA - DF_DA | DF_CORE_A, + DF_DA | DF_REF_A, // 0E RETURN_VOID DF_NOP, @@ -180,41 +180,40 @@ const int oatDataFlowAttributes[kMirOpLast] = { DF_DA | DF_UB | DF_B_WIDE | DF_UC | DF_C_WIDE | DF_CORE_A | DF_CORE_B | DF_CORE_C, // 32 IF_EQ vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, + DF_UA | DF_UB, // 33 IF_NE vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, + DF_UA | DF_UB, // 34 IF_LT vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, + DF_UA | DF_UB, // 35 IF_GE vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, + DF_UA | DF_UB, // 36 IF_GT vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, + DF_UA | DF_UB, // 37 IF_LE vA, vB, +CCCC - DF_UA | DF_UB | DF_CORE_A | DF_CORE_B, - + DF_UA | DF_UB, // 38 IF_EQZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 39 IF_NEZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 3A IF_LTZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 3B IF_GEZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 3C IF_GTZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 3D IF_LEZ vAA, +BBBB - DF_UA | DF_CORE_A, + DF_UA, // 3E UNUSED_3E DF_NOP, diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc index bcaba10ea8..3924f45ec8 100644 --- a/src/compiler/Frontend.cc +++ b/src/compiler/Frontend.cc @@ -763,8 +763,10 @@ CompiledMethod* oatCompileMethod(Compiler& compiler, cUnit->numRegs = code_item->registers_size_ - cUnit->numIns; cUnit->numOuts = code_item->outs_size_; #if defined(ART_USE_QUICK_COMPILER) +#if defined(TARGET_ARM) cUnit->genBitcode = true; #endif +#endif /* Adjust this value accordingly once inlining is performed */ cUnit->numDalvikRegisters = code_item->registers_size_; // TODO: set this from command line @@ -781,8 +783,8 @@ CompiledMethod* 
oatCompileMethod(Compiler& compiler, } #if defined(ART_USE_QUICK_COMPILER) if (cUnit->genBitcode) { - cUnit->printMe = true; - cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile); + //cUnit->printMe = true; + //cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile); // Disable non-safe optimizations for now cUnit->disableOpt |= ~(1 << kSafeOptimizations); } diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc index ea4d6c109d..500b1b2d1b 100644 --- a/src/compiler/Ralloc.cc +++ b/src/compiler/Ralloc.cc @@ -23,9 +23,6 @@ namespace art { bool setFp(CompilationUnit* cUnit, int index, bool isFP) { bool change = false; - if (cUnit->regLocation[index].highWord) { - return change; - } if (isFP && !cUnit->regLocation[index].fp) { cUnit->regLocation[index].fp = true; cUnit->regLocation[index].defined = true; @@ -36,9 +33,6 @@ bool setFp(CompilationUnit* cUnit, int index, bool isFP) { bool setCore(CompilationUnit* cUnit, int index, bool isCore) { bool change = false; - if (cUnit->regLocation[index].highWord) { - return change; - } if (isCore && !cUnit->regLocation[index].defined) { cUnit->regLocation[index].core = true; cUnit->regLocation[index].defined = true; @@ -49,9 +43,6 @@ bool setCore(CompilationUnit* cUnit, int index, bool isCore) { bool setRef(CompilationUnit* cUnit, int index, bool isRef) { bool change = false; - if (cUnit->regLocation[index].highWord) { - return change; - } if (isRef && !cUnit->regLocation[index].defined) { cUnit->regLocation[index].ref = true; cUnit->regLocation[index].defined = true; @@ -60,6 +51,24 @@ bool setRef(CompilationUnit* cUnit, int index, bool isRef) { return change; } +bool setWide(CompilationUnit* cUnit, int index, bool isWide) { + bool change = false; + if (isWide && !cUnit->regLocation[index].wide) { + cUnit->regLocation[index].wide = true; + change = true; + } + return change; +} + +bool setHigh(CompilationUnit* cUnit, int index, bool isHigh) { + bool change = false; + if (isHigh && !cUnit->regLocation[index].highWord) { + 
cUnit->regLocation[index].highWord = true; + change = true; + } + return change; +} + bool remapNames(CompilationUnit* cUnit, BasicBlock* bb) { if (bb->blockType != kDalvikByteCode && bb->blockType != kEntryBlock && @@ -123,6 +132,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) } if (attrs & DF_A_WIDE) { cUnit->regLocation[ssaRep->defs[0]].wide = true; + cUnit->regLocation[ssaRep->defs[1]].wide = true; cUnit->regLocation[ssaRep->defs[1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->defs[0])+1, SRegToVReg(cUnit, ssaRep->defs[1])); @@ -140,6 +150,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) } if (attrs & DF_A_WIDE) { cUnit->regLocation[ssaRep->uses[next]].wide = true; + cUnit->regLocation[ssaRep->uses[next + 1]].wide = true; cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1, SRegToVReg(cUnit, ssaRep->uses[next + 1])); @@ -157,6 +168,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) } if (attrs & DF_B_WIDE) { cUnit->regLocation[ssaRep->uses[next]].wide = true; + cUnit->regLocation[ssaRep->uses[next + 1]].wide = true; cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1, SRegToVReg(cUnit, ssaRep->uses[next + 1])); @@ -174,6 +186,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) } if (attrs & DF_C_WIDE) { cUnit->regLocation[ssaRep->uses[next]].wide = true; + cUnit->regLocation[ssaRep->uses[next + 1]].wide = true; cUnit->regLocation[ssaRep->uses[next + 1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[next])+1, SRegToVReg(cUnit, ssaRep->uses[next + 1])); @@ -192,6 +205,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) changed |= setCore(cUnit, ssaRep->uses[0], true); changed |= setCore(cUnit, ssaRep->uses[1], true); cUnit->regLocation[ssaRep->uses[0]].wide = true; + cUnit->regLocation[ssaRep->uses[1]].wide = true; 
cUnit->regLocation[ssaRep->uses[1]].highWord = true; break; case 'F': @@ -201,6 +215,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) changed |= setFp(cUnit, ssaRep->uses[0], true); changed |= setFp(cUnit, ssaRep->uses[1], true); cUnit->regLocation[ssaRep->uses[0]].wide = true; + cUnit->regLocation[ssaRep->uses[1]].wide = true; cUnit->regLocation[ssaRep->uses[1]].highWord = true; break; case 'L': @@ -254,6 +269,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) ssaRep->fpUse[i] = true; ssaRep->fpUse[i+1] = true; cUnit->regLocation[ssaRep->uses[i]].wide = true; + cUnit->regLocation[ssaRep->uses[i+1]].wide = true; cUnit->regLocation[ssaRep->uses[i+1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1, SRegToVReg(cUnit, ssaRep->uses[i+1])); @@ -261,6 +277,7 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) break; case 'J': cUnit->regLocation[ssaRep->uses[i]].wide = true; + cUnit->regLocation[ssaRep->uses[i+1]].wide = true; cUnit->regLocation[ssaRep->uses[i+1]].highWord = true; DCHECK_EQ(SRegToVReg(cUnit, ssaRep->uses[i])+1, SRegToVReg(cUnit, ssaRep->uses[i+1])); @@ -292,23 +309,27 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) } // Special-case handling for moves & Phi if (attrs & (DF_IS_MOVE | DF_NULL_TRANSFER_N)) { - // If any of our inputs or outputs is defined, set all - bool definedFP = false; - bool definedCore = false; - bool definedRef = false; - definedFP |= (cUnit->regLocation[ssaRep->defs[0]].defined && - cUnit->regLocation[ssaRep->defs[0]].fp); - definedCore |= (cUnit->regLocation[ssaRep->defs[0]].defined && - cUnit->regLocation[ssaRep->defs[0]].core); - definedRef |= (cUnit->regLocation[ssaRep->defs[0]].defined && - cUnit->regLocation[ssaRep->defs[0]].ref); - for (int i = 0; i < ssaRep->numUses; i++) { - definedFP |= (cUnit->regLocation[ssaRep->uses[i]].defined && - cUnit->regLocation[ssaRep->uses[i]].fp); - definedCore |= (cUnit->regLocation[ssaRep->uses[i]].defined - && 
cUnit->regLocation[ssaRep->uses[i]].core); - definedRef |= (cUnit->regLocation[ssaRep->uses[i]].defined - && cUnit->regLocation[ssaRep->uses[i]].ref); + /* + * If any of our inputs or outputs is defined, set all. + * Some ugliness related to Phi nodes and wide values. + * The Phi set will include all low words or all high + * words, so we have to treat them specially. + */ + bool isPhi = (static_cast<int>(mir->dalvikInsn.opcode) == + kMirOpPhi); + RegLocation rlTemp = cUnit->regLocation[ssaRep->defs[0]]; + bool definedFP = rlTemp.defined && rlTemp.fp; + bool definedCore = rlTemp.defined && rlTemp.core; + bool definedRef = rlTemp.defined && rlTemp.ref; + bool isWide = rlTemp.wide || ((attrs & DF_A_WIDE) != 0); + bool isHigh = isPhi && rlTemp.wide && rlTemp.highWord; + for (int i = 0; i < ssaRep->numUses;i++) { + rlTemp = cUnit->regLocation[ssaRep->uses[i]]; + definedFP |= rlTemp.defined && rlTemp.fp; + definedCore |= rlTemp.defined && rlTemp.core; + definedRef |= rlTemp.defined && rlTemp.ref; + isWide |= rlTemp.wide; + isHigh |= isPhi && rlTemp.wide && rlTemp.highWord; } /* * TODO: cleaner fix @@ -334,10 +355,23 @@ bool inferTypeAndSize(CompilationUnit* cUnit, BasicBlock* bb) changed |= setFp(cUnit, ssaRep->defs[0], definedFP); changed |= setCore(cUnit, ssaRep->defs[0], definedCore); changed |= setRef(cUnit, ssaRep->defs[0], definedRef); + changed |= setWide(cUnit, ssaRep->defs[0], isWide); + changed |= setHigh(cUnit, ssaRep->defs[0], isHigh); + if (attrs & DF_A_WIDE) { + changed |= setWide(cUnit, ssaRep->defs[1], true); + changed |= setHigh(cUnit, ssaRep->defs[1], true); + } for (int i = 0; i < ssaRep->numUses; i++) { - changed |= setFp(cUnit, ssaRep->uses[i], definedFP); - changed |= setCore(cUnit, ssaRep->uses[i], definedCore); - changed |= setRef(cUnit, ssaRep->uses[i], definedRef); + changed |= setFp(cUnit, ssaRep->uses[i], definedFP); + changed |= setCore(cUnit, ssaRep->uses[i], definedCore); + changed |= setRef(cUnit, ssaRep->uses[i], definedRef); + changed 
|= setWide(cUnit, ssaRep->uses[i], isWide); + changed |= setHigh(cUnit, ssaRep->uses[i], isHigh); + } + if (attrs & DF_A_WIDE) { + DCHECK_EQ(ssaRep->numUses, 2); + changed |= setWide(cUnit, ssaRep->uses[1], true); + changed |= setHigh(cUnit, ssaRep->uses[1], true); } } } diff --git a/src/compiler/SSATransformation.cc b/src/compiler/SSATransformation.cc index 7d6a733277..10957b2517 100644 --- a/src/compiler/SSATransformation.cc +++ b/src/compiler/SSATransformation.cc @@ -747,7 +747,7 @@ void insertPhiNodes(CompilationUnit* cUnit) kPostOrderDFSTraversal, true /* isIterative */); /* Iterate through each Dalvik register */ - for (dalvikReg = 0; dalvikReg < cUnit->numDalvikRegisters; dalvikReg++) { + for (dalvikReg = cUnit->numDalvikRegisters - 1; dalvikReg >= 0; dalvikReg--) { bool change; ArenaBitVectorIterator iterator; diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc index 9082a49ad3..b4b0f6a9e0 100644 --- a/src/compiler/codegen/GenCommon.cc +++ b/src/compiler/codegen/GenCommon.cc @@ -2062,16 +2062,19 @@ bool genArithOpIntLit(CompilationUnit* cUnit, Instruction::Code opcode, op = kOpXor; break; case Instruction::SHL_INT_LIT8: + case Instruction::SHL_INT: lit &= 31; shiftOp = true; op = kOpLsl; break; case Instruction::SHR_INT_LIT8: + case Instruction::SHR_INT: lit &= 31; shiftOp = true; op = kOpAsr; break; case Instruction::USHR_INT_LIT8: + case Instruction::USHR_INT: lit &= 31; shiftOp = true; op = kOpLsr; diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc index 83ebf9bfce..b7c4331d7d 100644 --- a/src/compiler/codegen/MethodBitcode.cc +++ b/src/compiler/codegen/MethodBitcode.cc @@ -464,24 +464,27 @@ void convertFPArithOp(CompilationUnit* cUnit, OpKind op, RegLocation rlDest, defineValue(cUnit, res, rlDest.origSReg); } -void convertShift(CompilationUnit* cUnit, OpKind op, RegLocation rlDest, - RegLocation rlSrc1, RegLocation rlSrc2) +void convertShift(CompilationUnit* cUnit, + 
greenland::IntrinsicHelper::IntrinsicId id, + RegLocation rlDest, RegLocation rlSrc1, RegLocation rlSrc2) { - llvm::Value* src1 = getLLVMValue(cUnit, rlSrc1.origSReg); - llvm::Value* src2 = getLLVMValue(cUnit, rlSrc2.origSReg); - /* - * TODO: Figure out how best to handle constraining the shift - * amount to 31 for int and 63 for long. We take care of this - * inline for int and in the out-of-line handler for longs, so - * it's a bit of a waste to generate llvm bitcode for this. - * Yet more intrinsics? - */ - UNIMPLEMENTED(WARNING) << "llvm shift mismatch"; - if (rlDest.wide) { - // llvm thinks the shift could should be in 64 bits. - src2 = cUnit->irb->CreateZExt(src2, cUnit->irb->getInt64Ty()); - } - llvm::Value* res = genArithOp(cUnit, op, rlDest.wide, src1, src2); + llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id); + llvm::SmallVector<llvm::Value*, 2>args; + args.push_back(getLLVMValue(cUnit, rlSrc1.origSReg)); + args.push_back(getLLVMValue(cUnit, rlSrc2.origSReg)); + llvm::Value* res = cUnit->irb->CreateCall(intr, args); + defineValue(cUnit, res, rlDest.origSReg); +} + +void convertShiftLit(CompilationUnit* cUnit, + greenland::IntrinsicHelper::IntrinsicId id, + RegLocation rlDest, RegLocation rlSrc, int shiftAmount) +{ + llvm::Function* intr = cUnit->intrinsic_helper->GetIntrinsicFunction(id); + llvm::SmallVector<llvm::Value*, 2>args; + args.push_back(getLLVMValue(cUnit, rlSrc.origSReg)); + args.push_back(cUnit->irb->getInt32(shiftAmount)); + llvm::Value* res = cUnit->irb->CreateCall(intr, args); defineValue(cUnit, res, rlDest.origSReg); } @@ -1099,27 +1102,33 @@ bool convertMIRNode(CompilationUnit* cUnit, MIR* mir, BasicBlock* bb, break; case Instruction::SHL_LONG: case Instruction::SHL_LONG_2ADDR: - convertShift(cUnit, kOpLsl, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::SHLLong, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::SHL_INT: case Instruction::SHL_INT_2ADDR: - convertShift(cUnit, 
kOpLsl, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::SHLInt, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::SHR_LONG: case Instruction::SHR_LONG_2ADDR: - convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::SHRLong, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::SHR_INT: case Instruction::SHR_INT_2ADDR: - convertShift(cUnit, kOpAsr, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::SHRInt, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::USHR_LONG: case Instruction::USHR_LONG_2ADDR: - convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::USHRLong, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::USHR_INT: case Instruction::USHR_INT_2ADDR: - convertShift(cUnit, kOpLsr, rlDest, rlSrc[0], rlSrc[1]); + convertShift(cUnit, greenland::IntrinsicHelper::USHRInt, + rlDest, rlSrc[0], rlSrc[1]); break; case Instruction::ADD_INT_LIT16: @@ -1155,13 +1164,16 @@ bool convertMIRNode(CompilationUnit* cUnit, MIR* mir, BasicBlock* bb, convertArithOpLit(cUnit, kOpXor, rlDest, rlSrc[0], vC); break; case Instruction::SHL_INT_LIT8: - convertArithOpLit(cUnit, kOpLsl, rlDest, rlSrc[0], vC & 0x1f); + convertShiftLit(cUnit, greenland::IntrinsicHelper::SHLInt, + rlDest, rlSrc[0], vC & 0x1f); break; case Instruction::SHR_INT_LIT8: - convertArithOpLit(cUnit, kOpAsr, rlDest, rlSrc[0], vC & 0x1f); + convertShiftLit(cUnit, greenland::IntrinsicHelper::SHRInt, + rlDest, rlSrc[0], vC & 0x1f); break; case Instruction::USHR_INT_LIT8: - convertArithOpLit(cUnit, kOpLsr, rlDest, rlSrc[0], vC & 0x1f); + convertShiftLit(cUnit, greenland::IntrinsicHelper::USHRInt, + rlDest, rlSrc[0], vC & 0x1f); break; case Instruction::ADD_FLOAT: @@ -1589,19 +1601,30 @@ void convertExtendedMIR(CompilationUnit* cUnit, BasicBlock* bb, MIR* mir, switch ((ExtendedMIROpcode)mir->dalvikInsn.opcode) { case kMirOpPhi: { - int* 
incoming = (int*)mir->dalvikInsn.vB; RegLocation rlDest = cUnit->regLocation[mir->ssaRep->defs[0]]; + /* + * The Art compiler's Phi nodes only handle 32-bit operands, + * representing wide values using a matched set of Phi nodes + * for the lower and upper halves. In the llvm world, we only + * want a single Phi for wides. Here we will simply discard + * the Phi node representing the high word. + */ + if (rlDest.highWord) { + return; // No Phi node - handled via low word + } + int* incoming = (int*)mir->dalvikInsn.vB; llvm::Type* phiType = llvmTypeFromLocRec(cUnit, rlDest); llvm::PHINode* phi = cUnit->irb->CreatePHI(phiType, mir->ssaRep->numUses); for (int i = 0; i < mir->ssaRep->numUses; i++) { RegLocation loc; - if (rlDest.wide) { - loc = oatGetSrcWide(cUnit, mir, i); - i++; - } else { - loc = oatGetSrc(cUnit, mir, i); - } + // Don't check width here. + loc = oatGetRawSrc(cUnit, mir, i); + DCHECK_EQ(rlDest.wide, loc.wide); + DCHECK_EQ(rlDest.wide & rlDest.highWord, loc.wide & loc.highWord); + DCHECK_EQ(rlDest.fp, loc.fp); + DCHECK_EQ(rlDest.core, loc.core); + DCHECK_EQ(rlDest.ref, loc.ref); phi->addIncoming(getLLVMValue(cUnit, loc.origSReg), getLLVMBlock(cUnit, incoming[i])); } @@ -1895,30 +1918,18 @@ void oatMethodMIR2Bitcode(CompilationUnit* cUnit) arg_iter++; /* Skip path method */ for (int i = 0; i < cUnit->numSSARegs; i++) { llvm::Value* val; - if ((i < cUnit->numRegs) || (i >= (cUnit->numRegs + cUnit->numIns))) { - // Handle SSA defs, skipping Method* and compiler temps - if (SRegToVReg(cUnit, i) < 0) { - val = NULL; - } else { - llvm::Constant* immValue = cUnit->irb->GetJInt(0); - val = emitConst(cUnit, immValue, cUnit->regLocation[i]); - val->setName(llvmSSAName(cUnit, i)); - } + if ((SRegToVReg(cUnit, i) < 0) || cUnit->regLocation[i].highWord) { + oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0); + } else if ((i < cUnit->numRegs) || + (i >= (cUnit->numRegs + cUnit->numIns))) { + llvm::Constant* immValue = cUnit->irb->GetJInt(0); + val = 
emitConst(cUnit, immValue, cUnit->regLocation[i]); + val->setName(llvmSSAName(cUnit, i)); oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)val); - if (cUnit->regLocation[i].wide) { - // Skip high half of wide values - oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0); - i++; - } } else { // Recover previously-created argument values llvm::Value* argVal = arg_iter++; oatInsertGrowableList(cUnit, &cUnit->llvmValues, (intptr_t)argVal); - if (cUnit->regLocation[i].wide) { - // Skip high half of wide values. - oatInsertGrowableList(cUnit, &cUnit->llvmValues, 0); - i++; - } } } @@ -1959,7 +1970,7 @@ void oatMethodMIR2Bitcode(CompilationUnit* cUnit) cUnit->irb->SetInsertPoint(cUnit->entryBB); cUnit->irb->CreateBr(cUnit->entryTargetBB); - llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction); + //llvm::verifyFunction(*cUnit->func, llvm::PrintMessageAction); if (cUnit->enableDebug & (1 << kDebugDumpBitcodeFile)) { // Write bitcode to file @@ -2258,43 +2269,23 @@ void cvtBinOp(CompilationUnit* cUnit, OpKind op, llvm::Instruction* inst) } } -void cvtShiftOp(CompilationUnit* cUnit, OpKind op, llvm::Instruction* inst) +void cvtShiftOp(CompilationUnit* cUnit, Instruction::Code opcode, + llvm::CallInst* callInst) { - if (inst->getType() == cUnit->irb->getInt64Ty()) { - /* - * llvm wants the shift amount to be 64 bits, whereas we've constained - * it to be in 6 bits. It should always be held as an unnamed temp - * at this point that was the result of a previous UExt. We'll backtrack - * to find the pre-extension value and use that. - * TODO: probably better to handle this in cvtIntExt() or just intrinsify - */ - RegLocation rlDest = getLoc(cUnit, inst); - RegLocation rlSrc = getLoc(cUnit, inst->getOperand(0)); - RegLocation rlShift = getLoc(cUnit, inst->getOperand(1)); - DCHECK(rlShift.wide); - DCHECK_EQ(rlShift.sRegLow, INVALID_SREG); - // Now, free the temp registers - we won't need them. 
- // TODO: kill the dead extend ops - oatFreeTemp(cUnit, rlShift.lowReg); - oatFreeTemp(cUnit, rlShift.highReg); - // Get the pre-extend operand - llvm::Instruction* extInst = - llvm::dyn_cast<llvm::Instruction>(inst->getOperand(1)); - DCHECK(extInst != NULL); - rlShift = getLoc(cUnit, extInst->getOperand(0)); - DCHECK(!rlShift.wide); - Instruction::Code opcode; - if (op == kOpLsl) - opcode = Instruction::SHL_LONG; - else if (op == kOpAsr) - opcode = Instruction::SHR_LONG; - else { - DCHECK_EQ(op, kOpLsr); - opcode = Instruction::USHR_LONG; - } - genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift); + DCHECK_EQ(callInst->getNumArgOperands(), 2U); + RegLocation rlDest = getLoc(cUnit, callInst); + RegLocation rlSrc = getLoc(cUnit, callInst->getArgOperand(0)); + llvm::Value* rhs = callInst->getArgOperand(1); + if (llvm::ConstantInt* src2 = llvm::dyn_cast<llvm::ConstantInt>(rhs)) { + DCHECK(!rlDest.wide); + genArithOpIntLit(cUnit, opcode, rlDest, rlSrc, src2->getSExtValue()); } else { - cvtBinOp(cUnit, op, inst); + RegLocation rlShift = getLoc(cUnit, rhs); + if (callInst->getType() == cUnit->irb->getInt64Ty()) { + genShiftOpLong(cUnit, opcode, rlDest, rlSrc, rlShift); + } else { + genArithOpInt(cUnit, opcode, rlDest, rlSrc, rlShift); + } } } @@ -3098,9 +3089,25 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb) cvtLongCompare(cUnit, callInst); break; - case greenland::IntrinsicHelper::UnknownId: - cvtCall(cUnit, callInst, callee); + case greenland::IntrinsicHelper::SHLLong: + cvtShiftOp(cUnit, Instruction::SHL_LONG, callInst); + break; + case greenland::IntrinsicHelper::SHRLong: + cvtShiftOp(cUnit, Instruction::SHR_LONG, callInst); + break; + case greenland::IntrinsicHelper::USHRLong: + cvtShiftOp(cUnit, Instruction::USHR_LONG, callInst); break; + case greenland::IntrinsicHelper::SHLInt: + cvtShiftOp(cUnit, Instruction::SHL_INT, callInst); + break; + case greenland::IntrinsicHelper::SHRInt: + cvtShiftOp(cUnit, Instruction::SHR_INT, 
callInst); + break; + case greenland::IntrinsicHelper::USHRInt: + cvtShiftOp(cUnit, Instruction::USHR_INT, callInst); + break; + default: LOG(FATAL) << "Unexpected intrinsic " << (int)id << ", " << cUnit->intrinsic_helper->GetName(id); @@ -3117,9 +3124,6 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb) case llvm::Instruction::And: cvtBinOp(cUnit, kOpAnd, inst); break; case llvm::Instruction::Or: cvtBinOp(cUnit, kOpOr, inst); break; case llvm::Instruction::Xor: cvtBinOp(cUnit, kOpXor, inst); break; - case llvm::Instruction::Shl: cvtShiftOp(cUnit, kOpLsl, inst); break; - case llvm::Instruction::LShr: cvtShiftOp(cUnit, kOpLsr, inst); break; - case llvm::Instruction::AShr: cvtShiftOp(cUnit, kOpAsr, inst); break; case llvm::Instruction::PHI: cvtPhi(cUnit, inst); break; case llvm::Instruction::Ret: cvtRet(cUnit, inst); break; case llvm::Instruction::FAdd: cvtBinFPOp(cUnit, kOpAdd, inst); break; @@ -3143,6 +3147,9 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb) case llvm::Instruction::Unreachable: break; // FIXME: can we really ignore these? 
+ case llvm::Instruction::Shl: + case llvm::Instruction::LShr: + case llvm::Instruction::AShr: case llvm::Instruction::Invoke: case llvm::Instruction::FPToUI: case llvm::Instruction::UIToFP: @@ -3174,7 +3181,8 @@ bool methodBitcodeBlockCodeGen(CompilationUnit* cUnit, llvm::BasicBlock* bb) LOG(FATAL) << "Unexpected llvm opcode: " << opcode; break; default: - LOG(FATAL) << "Unknown llvm opcode: " << opcode; break; + LOG(FATAL) << "Unknown llvm opcode: " << inst->getOpcodeName(); + break; } } diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc index 2088cdc360..9d1878a02b 100644 --- a/src/compiler/codegen/RallocUtil.cc +++ b/src/compiler/codegen/RallocUtil.cc @@ -998,14 +998,12 @@ extern RegLocation oatGetRawSrc(CompilationUnit* cUnit, MIR* mir, int num) { DCHECK(num < mir->ssaRep->numUses); RegLocation res = cUnit->regLocation[mir->ssaRep->uses[num]]; - DCHECK(!res.wide || num < (mir->ssaRep->numUses - 1)); return res; } extern RegLocation oatGetRawDest(CompilationUnit* cUnit, MIR* mir) { DCHECK_GT(mir->ssaRep->numDefs, 0); RegLocation res = cUnit->regLocation[mir->ssaRep->defs[0]]; - DCHECK(!res.wide || mir->ssaRep->numDefs == 2); return res; } extern RegLocation oatGetDest(CompilationUnit* cUnit, MIR* mir) diff --git a/src/greenland/intrinsic_func_list.def b/src/greenland/intrinsic_func_list.def index 608e760c75..0ebebb25a3 100644 --- a/src/greenland/intrinsic_func_list.def +++ b/src/greenland/intrinsic_func_list.def @@ -1228,7 +1228,7 @@ _EVAL_DEF_INTRINSICS_FUNC(CopyObj, kJavaObjectTy, _EXPAND_ARG1(kJavaObjectTy)) -// int copy_long(long) +// long copy_long(long) _EVAL_DEF_INTRINSICS_FUNC(CopyLong, dex_lang_copy_long, kAttrReadOnly | kAttrNoThrow, @@ -1250,6 +1250,50 @@ _EVAL_DEF_INTRINSICS_FUNC(CopyDouble, _EXPAND_ARG1(kDoubleTy)) //---------------------------------------------------------------------------- +// Shift intrinsics. Shift semantics for Dalvik are a bit different than +// the llvm shift operators. 
For 32-bit shifts, the shift count is constrained +// to the range of 0..31, while for 64-bit shifts we limit to 0..63. +// Further, the shift count for Long shifts in Dalvik is 32 bits, while +// llvm requires a 64-bit shift count. For GBC, we represent shifts as an +// intrinsic to allow most efficient target-dependent lowering. +//---------------------------------------------------------------------------- +// long shl_long(long,int) +_EVAL_DEF_INTRINSICS_FUNC(SHLLong, + dex_lang_shl_long, + kAttrReadOnly | kAttrNoThrow, + kInt64Ty, + _EXPAND_ARG2(kInt64Ty,kInt32Ty)) +// long shr_long(long,int) +_EVAL_DEF_INTRINSICS_FUNC(SHRLong, + dex_lang_shr_long, + kAttrReadOnly | kAttrNoThrow, + kInt64Ty, + _EXPAND_ARG2(kInt64Ty,kInt32Ty)) +// long ushr_long(long,int) +_EVAL_DEF_INTRINSICS_FUNC(USHRLong, + dex_lang_ushl_long, + kAttrReadOnly | kAttrNoThrow, + kInt64Ty, + _EXPAND_ARG2(kInt64Ty,kInt32Ty)) +// int shl_int(int,int) +_EVAL_DEF_INTRINSICS_FUNC(SHLInt, + dex_lang_shl_int, + kAttrReadOnly | kAttrNoThrow, + kInt32Ty, + _EXPAND_ARG2(kInt32Ty,kInt32Ty)) +// int shr_int(int,int) +_EVAL_DEF_INTRINSICS_FUNC(SHRInt, + dex_lang_shr_int, + kAttrReadOnly | kAttrNoThrow, + kInt32Ty, + _EXPAND_ARG2(kInt32Ty,kInt32Ty)) +// int ushr_int(int,int) +_EVAL_DEF_INTRINSICS_FUNC(USHRInt, + dex_lang_ushl_int, + kAttrReadOnly | kAttrNoThrow, + kInt32Ty, + _EXPAND_ARG2(kInt32Ty,kInt32Ty)) +//---------------------------------------------------------------------------- // Conversion instrinsics. Note: these should eventually be removed. We // can express these directly in bitcode, but by using intrinsics the // Quick compiler can be more efficient. Some extra optimization infrastructure |