Rework type & size inference, literal usage

Fixes a bug in the old type inference mechanism (it wasn't properly
propagating type info across Phi & move nodes).  Combined the type and
size inference passes.

Fixed a long-standing bug in the code that loads a special double-precision
immediate (it would have been extremely difficult to hit in the field).
Improved the loading of floating point immediates.

Change-Id: I1ec72edc3b25525f14d965089f8952d4f0294942
diff --git a/src/compiler/codegen/arm/ArchUtility.cc b/src/compiler/codegen/arm/ArchUtility.cc
index 6f435e7..1d6bb41 100644
--- a/src/compiler/codegen/arm/ArchUtility.cc
+++ b/src/compiler/codegen/arm/ArchUtility.cc
@@ -371,7 +371,7 @@
                 buildInsnString(EncodingMap[lir->opcode].fmt, lir, opOperands,
                                 baseAddr, 256);
                 char tBuf[256];
-                snprintf(tBuf, 256, "%p (%04x): %-8s%s%s", baseAddr + offset, offset,
+                snprintf(tBuf, 256, "%p (%04x): %-9s%s%s", baseAddr + offset, offset,
                          opName, opOperands, lir->flags.isNop ? "(nop)" : "");
                 LOG(INFO) << tBuf;
             }
diff --git a/src/compiler/codegen/arm/ArmLIR.h b/src/compiler/codegen/arm/ArmLIR.h
index 53e8dc8..07e2e97 100644
--- a/src/compiler/codegen/arm/ArmLIR.h
+++ b/src/compiler/codegen/arm/ArmLIR.h
@@ -682,7 +682,7 @@
     kThumb2BUncond,      /* b <label> */
     kThumb2MovImm16H,    /* similar to kThumb2MovImm16, but target high hw */
     kThumb2AddPCR,       /* Thumb2 2-operand add with hard-coded PC target */
-    kThumb2AdrST,        /* Special purpose encoding of ADR for switch tables */
+    kThumb2Adr,          /* Special purpose encoding of ADR for switch tables */
     kThumb2MovImm16LST,  /* Special purpose version for switch table use */
     kThumb2MovImm16HST,  /* Special purpose version for switch table use */
     kThumb2LdmiaWB,      /* ldmia  [111010011001[ rn[19..16] mask[15..0] */
diff --git a/src/compiler/codegen/arm/Assemble.cc b/src/compiler/codegen/arm/Assemble.cc
index 62706a7..d22c267 100644
--- a/src/compiler/codegen/arm/Assemble.cc
+++ b/src/compiler/codegen/arm/Assemble.cc
@@ -932,7 +932,7 @@
                  kFmtUnused, -1, -1,
                  IS_UNARY_OP | REG_USE0 | IS_BRANCH,
                  "add", "rPC, !0C", 1),
-    ENCODING_MAP(kThumb2AdrST,       0xf20f0000,
+    ENCODING_MAP(kThumb2Adr,         0xf20f0000,
                  kFmtBitBlt, 11, 8, kFmtImm12, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
                  IS_TERTIARY_OP | REG_DEF0,/* Note: doesn't affect flags */
@@ -1085,6 +1085,7 @@
         if (lir->opcode == kThumbLdrPcRel ||
             lir->opcode == kThumb2LdrPcRel12 ||
             lir->opcode == kThumbAddPcRel ||
+            ((lir->opcode == kThumb2Vldrd) && (lir->operands[1] == r15pc)) ||
             ((lir->opcode == kThumb2Vldrs) && (lir->operands[1] == r15pc))) {
             /*
              * PC-relative loads are mostly used to load immediates
@@ -1106,57 +1107,34 @@
                 LOG(FATAL) << "Unexpected pc-rel offset " << delta;
             }
             // Now, check for the two difficult cases
-            if (1 || ((lir->opcode == kThumb2LdrPcRel12) && (delta > 4091)) ||
-                ((lir->opcode == kThumb2Vldrs) && (delta > 1020))) {
-            /*
-             * OK - the load doesn't work.  We'll just materialize
-             * the immediate directly using mov16l and mov16h.
-             * It's a little ugly for float immediates as we don't have
-             * float ops like the core mov imm16H/L.  In this case
-             * we'll materialize in a core register (rLR) and then copy.
-             * NOTE/WARNING: This is a *very* fragile workaround that will
-             * be addressed in a later release when we have a late spill
-             * capability.  We can get away with it for now because rLR
-             * is currently only used during call setups, and our convention
-             * requires all arguments to be passed in core register & the
-             * frame (and thus, we won't see any vlrds in the sequence).
-             * The normal resource mask mechanism will prevent any damaging
-             * code motion.
-             */
-                int tgtReg = (lir->opcode == kThumb2Vldrs) ? rLR :
-                              lir->operands[0];
-                int immVal = lirTarget->operands[0];
-                // The standard utilities won't work here - build manually
-                ArmLIR *newMov16L =
+            if (((lir->opcode == kThumb2LdrPcRel12) && (delta > 4091)) ||
+                ((lir->opcode == kThumb2Vldrs) && (delta > 1020)) ||
+                ((lir->opcode == kThumb2Vldrd) && (delta > 1020))) {
+                int baseReg = (lir->opcode == kThumb2LdrPcRel12) ?
+                    lir->operands[0] : rLR;
+
+                // Add new Adr to generate the address
+                ArmLIR *newAdr =
                     (ArmLIR *)oatNew(sizeof(ArmLIR), true);
-                newMov16L->generic.dalvikOffset = lir->generic.dalvikOffset;
-                newMov16L->opcode = kThumb2MovImm16;
-                newMov16L->operands[0] = tgtReg;
-                newMov16L->operands[1] = immVal & 0xffff;
-                oatSetupResourceMasks(newMov16L);
-                oatInsertLIRBefore((LIR*)lir, (LIR*)newMov16L);
-                ArmLIR *newMov16H =
-                    (ArmLIR *)oatNew(sizeof(ArmLIR), true);
-                newMov16H->generic.dalvikOffset = lir->generic.dalvikOffset;
-                newMov16H->opcode = kThumb2MovImm16H;
-                newMov16H->operands[0] = tgtReg;
-                newMov16H->operands[1] = (immVal >> 16) & 0xffff;
-                oatSetupResourceMasks(newMov16H);
-                oatInsertLIRBefore((LIR*)lir, (LIR*)newMov16H);
-                if (lir->opcode == kThumb2Vldrs) {
-                    // Convert the vldrs to a kThumb2Fmsr
-                    lir->opcode = kThumb2Fmsr;
-                    lir->operands[1] = rLR;
-                    lir->generic.target = NULL;
-                    lir->operands[2] = 0;
-                    oatSetupResourceMasks(lir);
-                } else {
-                    // Nullify the original load
-                    lir->flags.isNop = true;
+                newAdr->generic.dalvikOffset = lir->generic.dalvikOffset;
+                newAdr->generic.target = lir->generic.target;
+                newAdr->opcode = kThumb2Adr;
+                newAdr->operands[0] = baseReg;
+                oatSetupResourceMasks(newAdr);
+                oatInsertLIRBefore((LIR*)lir, (LIR*)newAdr);
+
+                // Convert to normal load
+                if (lir->opcode == kThumb2LdrPcRel12) {
+                    lir->opcode = kThumb2LdrRRI12;
                 }
+                // Change the load to be relative to the new Adr base
+                lir->operands[1] = baseReg;
+                lir->operands[2] = 0;
+                oatSetupResourceMasks(lir);
                 res = kRetryAll;
             } else {
-                if (lir->opcode == kThumb2Vldrs) {
+                if ((lir->opcode == kThumb2Vldrs) ||
+                    (lir->opcode == kThumb2Vldrd)) {
                     lir->operands[2] = delta >> 2;
                 } else {
                     lir->operands[1] = (lir->opcode == kThumb2LdrPcRel12) ?
@@ -1259,16 +1237,19 @@
 
             lir->operands[0] = (delta >> 12) & 0x7ff;
             NEXT_LIR(lir)->operands[0] = (delta>> 1) & 0x7ff;
-        } else if (lir->opcode == kThumb2AdrST) {
+        } else if (lir->opcode == kThumb2Adr) {
             SwitchTable *tabRec = (SwitchTable*)lir->operands[2];
-            int disp = tabRec->offset - ((lir->generic.offset + 4) & ~3);
+            ArmLIR* target = (ArmLIR*)lir->generic.target;
+            int targetDisp = tabRec ? tabRec->offset : target->generic.offset;
+            int disp = targetDisp - ((lir->generic.offset + 4) & ~3);
             if (disp < 4096) {
                 lir->operands[1] = disp;
             } else {
-                // convert to ldimm16l, ldimm16h, add tgt, pc, r12
+                // convert to ldimm16l, ldimm16h, add tgt, pc, operands[0]
                 ArmLIR *newMov16L =
                     (ArmLIR *)oatNew(sizeof(ArmLIR), true);
                 newMov16L->generic.dalvikOffset = lir->generic.dalvikOffset;
+                newMov16L->generic.target = lir->generic.target;
                 newMov16L->opcode = kThumb2MovImm16LST;
                 newMov16L->operands[0] = lir->operands[0];
                 newMov16L->operands[2] = (intptr_t)lir;
@@ -1278,6 +1259,7 @@
                 ArmLIR *newMov16H =
                     (ArmLIR *)oatNew(sizeof(ArmLIR), true);
                 newMov16H->generic.dalvikOffset = lir->generic.dalvikOffset;
+                newMov16H->generic.target = lir->generic.target;
                 newMov16H->opcode = kThumb2MovImm16HST;
                 newMov16H->operands[0] = lir->operands[0];
                 newMov16H->operands[2] = (intptr_t)lir;
@@ -1294,13 +1276,19 @@
             // operands[1] should hold disp, [2] has add, [3] has tabRec
             ArmLIR *addPCInst = (ArmLIR*)lir->operands[2];
             SwitchTable *tabRec = (SwitchTable*)lir->operands[3];
-            lir->operands[1] = (tabRec->offset -
+            // If tabRec is null, this is a literal load - use generic.target
+            ArmLIR* target = (ArmLIR*)lir->generic.target;
+            int targetDisp = tabRec ? tabRec->offset : target->generic.offset;
+            lir->operands[1] = (targetDisp -
                 (addPCInst->generic.offset + 4)) & 0xffff;
         } else if (lir->opcode == kThumb2MovImm16HST) {
             // operands[1] should hold disp, [2] has add, [3] has tabRec
             ArmLIR *addPCInst = (ArmLIR*)lir->operands[2];
             SwitchTable *tabRec = (SwitchTable*)lir->operands[3];
-            lir->operands[1] = ((tabRec->offset -
+            // If tabRec is null, this is a literal load - use generic.target
+            ArmLIR* target = (ArmLIR*)lir->generic.target;
+            int targetDisp = tabRec ? tabRec->offset : target->generic.offset;
+            lir->operands[1] = ((targetDisp -
                 (addPCInst->generic.offset + 4)) >> 16) & 0xffff;
         }
         ArmEncodingMap *encoder = &EncodingMap[lir->opcode];
diff --git a/src/compiler/codegen/arm/CodegenCommon.cc b/src/compiler/codegen/arm/CodegenCommon.cc
index 4b9b592..40494f3 100644
--- a/src/compiler/codegen/arm/CodegenCommon.cc
+++ b/src/compiler/codegen/arm/CodegenCommon.cc
@@ -326,6 +326,25 @@
     return NULL;
 }
 
+/* Return the pool entry for valLo when valHi immediately follows, else NULL */
+static ArmLIR* scanLiteralPoolWide(LIR* dataTarget, int valLo, int valHi)
+{
+    bool loMatch = false;  // true when the previous entry's operands[0] == valLo
+    LIR* loTarget = NULL;  // the previous entry that matched valLo
+    while (dataTarget) {
+        if (loMatch && (((ArmLIR*)dataTarget)->operands[0] == valHi)) {
+            return (ArmLIR*)loTarget;  // lo/hi pair found in adjacent entries
+        }
+        loMatch = false;  // a low-word match only counts for the very next entry
+        if (((ArmLIR*)dataTarget)->operands[0] == valLo) {
+            loMatch = true;
+            loTarget = dataTarget;
+        }
+        dataTarget = dataTarget->next;
+    }
+    return NULL;  // no adjacent valLo/valHi pair in the list
+}
+}
+
 /*
  * The following are building blocks to insert constants into the pool or
  * instruction streams.
@@ -350,6 +369,23 @@
     return NULL;
 }
 
+/* Add a 64-bit constant to the pool or mixed with code; returns the lo entry */
+static ArmLIR* addWideData(CompilationUnit* cUnit, LIR* *constantListP,
+                           int valLo, int valHi)
+{
+    ArmLIR* res;  // result of the addWordData call made for valLo
+    //NOTE: hard-coded little endian
+    if (constantListP == NULL) {
+        res = addWordData(cUnit, NULL, valLo);  // no list: add lo, then hi
+        addWordData(cUnit, NULL, valHi);
+    } else {
+        // Insert high word first (presumably addWordData prepends - TODO confirm)
+        addWordData(cUnit, constantListP, valHi);
+        res = addWordData(cUnit, constantListP, valLo);
+    }
+    return res;
+}
+
 /*
  * Generate an kArmPseudoBarrier marker to indicate the boundary of special
  * blocks.
diff --git a/src/compiler/codegen/arm/MethodCodegenDriver.cc b/src/compiler/codegen/arm/MethodCodegenDriver.cc
index 0af213f..8dc388c 100644
--- a/src/compiler/codegen/arm/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/arm/MethodCodegenDriver.cc
@@ -1182,10 +1182,10 @@
 
         case OP_CONST_WIDE_16:
         case OP_CONST_WIDE_32:
-            rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-            loadConstantNoClobber(cUnit, rlResult.lowReg, mir->dalvikInsn.vB);
-            //TUNING: do high separately to avoid load dependency
-            opRegRegImm(cUnit, kOpAsr, rlResult.highReg, rlResult.lowReg, 31);
+            rlResult = oatEvalLoc(cUnit, rlDest, kAnyReg, true);
+            loadConstantValueWide(cUnit, rlResult.lowReg, rlResult.highReg,
+                                  mir->dalvikInsn.vB,
+                                  (mir->dalvikInsn.vB & 0x80000000) ? -1 : 0);
             storeValueWide(cUnit, rlDest, rlResult);
             break;
 
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index 78b9e61..254802d 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -619,7 +619,7 @@
     if (zeroes != 0)
         return -1;
     if (bitB) {
-        if ((notBitB != 0) || (bSmear != 0x1f))
+        if ((notBitB != 0) || (bSmear != 0xff))
             return -1;
     } else {
         if ((notBitB != 1) || (bSmear != 0x0))
@@ -642,9 +642,29 @@
 {
     int encodedImm = encodeImmDouble(valLo, valHi);
     ArmLIR* res;
-    if (FPREG(rDestLo) && (encodedImm >= 0)) {
-        res = newLIR2(cUnit, kThumb2Vmovd_IMM8, S2D(rDestLo, rDestHi),
-                      encodedImm);
+    if (FPREG(rDestLo)) {
+        if (encodedImm >= 0) {
+            res = newLIR2(cUnit, kThumb2Vmovd_IMM8, S2D(rDestLo, rDestHi),
+                          encodedImm);
+        } else {
+            ArmLIR* dataTarget = scanLiteralPoolWide(cUnit->literalList, valLo,
+               valHi);
+            if (dataTarget == NULL) {
+                dataTarget = addWideData(cUnit, &cUnit->literalList, valLo,
+                                         valHi);
+            }
+            ArmLIR* loadPcRel = (ArmLIR* ) oatNew(sizeof(ArmLIR), true);
+            loadPcRel->generic.dalvikOffset = cUnit->currentDalvikOffset;
+            loadPcRel->opcode = kThumb2Vldrd;
+            loadPcRel->generic.target = (LIR* ) dataTarget;
+            loadPcRel->operands[0] = S2D(rDestLo, rDestHi);
+            loadPcRel->operands[1] = r15pc;
+            setupResourceMasks(loadPcRel);
+            setMemRefType(loadPcRel, true, kLiteral);
+            loadPcRel->aliasInfo = dataTarget->operands[0];
+            oatAppendLIR(cUnit, (LIR* ) loadPcRel);
+            res = loadPcRel;
+        }
     } else {
         res = loadConstantNoClobber(cUnit, rDestLo, valLo);
         loadConstantNoClobber(cUnit, rDestHi, valHi);
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index 52d67de..2404ca7 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -283,7 +283,7 @@
         rKey = tmp;
     }
     // Materialize a pointer to the switch table
-    newLIR3(cUnit, kThumb2AdrST, rBase, 0, (intptr_t)tabRec);
+    newLIR3(cUnit, kThumb2Adr, rBase, 0, (intptr_t)tabRec);
     // Set up rIdx
     int rIdx = oatAllocTemp(cUnit);
     loadConstant(cUnit, rIdx, size);
@@ -324,7 +324,7 @@
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
     int tableBase = oatAllocTemp(cUnit);
     // Materialize a pointer to the switch table
-    newLIR3(cUnit, kThumb2AdrST, tableBase, 0, (intptr_t)tabRec);
+    newLIR3(cUnit, kThumb2Adr, tableBase, 0, (intptr_t)tabRec);
     int lowKey = s4FromSwitchData(&table[2]);
     int keyReg;
     // Remove the bias, if necessary
@@ -383,7 +383,7 @@
     loadWordDisp(cUnit, rSELF,
                  OFFSETOF_MEMBER(Thread, pHandleFillArrayDataFromCode), rLR);
     // Materialize a pointer to the fill data image
-    newLIR3(cUnit, kThumb2AdrST, r1, 0, (intptr_t)tabRec);
+    newLIR3(cUnit, kThumb2Adr, r1, 0, (intptr_t)tabRec);
     callUnwindableHelper(cUnit, rLR);
     oatClobberCallRegs(cUnit);
 }