Faster AssembleLIR for ARM.

This also reduces sizeof(LIR) by 4 bytes (32-bit builds).

Change-Id: I0cb81f9bf098dfc50050d5bc705c171af26464ce
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index eb4a336..964a222 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -417,7 +417,7 @@
 enum FixupKind {
   kFixupNone,
   kFixupLabel,       // For labels we just adjust the offset.
-  kFixupLoad,        // Mostly for imediates.
+  kFixupLoad,        // Mostly for immediates.
   kFixupVLoad,       // FP load which *may* be pc-relative.
   kFixupCBxZ,        // Cbz, Cbnz.
   kFixupPushPop,     // Not really pc relative, but changes size based on args.
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index caf84d9..7955d6c 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -1062,134 +1062,142 @@
  */
 #define PADDING_MOV_R5_R5               0x1C2D
 
-void ArmMir2Lir::EncodeLIR(LIR* lir) {
-  int opcode = lir->opcode;
-  if (IsPseudoLirOp(opcode)) {
-    if (UNLIKELY(opcode == kPseudoPseudoAlign4)) {
-      // Note: size for this opcode will be either 0 or 2 depending on final alignment.
-      lir->u.a.bytes[0] = (PADDING_MOV_R5_R5 & 0xff);
-      lir->u.a.bytes[1] = ((PADDING_MOV_R5_R5 >> 8) & 0xff);
-      lir->flags.size = (lir->offset & 0x2);
-    }
-  } else if (LIKELY(!lir->flags.is_nop)) {
-    const ArmEncodingMap *encoder = &EncodingMap[lir->opcode];
-    uint32_t bits = encoder->skeleton;
-    for (int i = 0; i < 4; i++) {
-      uint32_t operand;
-      uint32_t value;
-      operand = lir->operands[i];
-      ArmEncodingKind kind = encoder->field_loc[i].kind;
-      if (LIKELY(kind == kFmtBitBlt)) {
-        value = (operand << encoder->field_loc[i].start) &
-            ((1 << (encoder->field_loc[i].end + 1)) - 1);
-        bits |= value;
-      } else {
-        switch (encoder->field_loc[i].kind) {
-          case kFmtSkip:
-            break;  // Nothing to do, but continue to next.
-          case kFmtUnused:
-            i = 4;  // Done, break out of the enclosing loop.
-            break;
-          case kFmtFPImm:
-            value = ((operand & 0xF0) >> 4) << encoder->field_loc[i].end;
-            value |= (operand & 0x0F) << encoder->field_loc[i].start;
-            bits |= value;
-            break;
-          case kFmtBrOffset:
-            value = ((operand  & 0x80000) >> 19) << 26;
-            value |= ((operand & 0x40000) >> 18) << 11;
-            value |= ((operand & 0x20000) >> 17) << 13;
-            value |= ((operand & 0x1f800) >> 11) << 16;
-            value |= (operand  & 0x007ff);
-            bits |= value;
-            break;
-          case kFmtShift5:
-            value = ((operand & 0x1c) >> 2) << 12;
-            value |= (operand & 0x03) << 6;
-            bits |= value;
-            break;
-          case kFmtShift:
-            value = ((operand & 0x70) >> 4) << 12;
-            value |= (operand & 0x0f) << 4;
-            bits |= value;
-            break;
-          case kFmtBWidth:
-            value = operand - 1;
-            bits |= value;
-            break;
-          case kFmtLsb:
-            value = ((operand & 0x1c) >> 2) << 12;
-            value |= (operand & 0x03) << 6;
-            bits |= value;
-            break;
-          case kFmtImm6:
-            value = ((operand & 0x20) >> 5) << 9;
-            value |= (operand & 0x1f) << 3;
-            bits |= value;
-            break;
-          case kFmtDfp: {
-            DCHECK(ARM_DOUBLEREG(operand));
-            DCHECK_EQ((operand & 0x1), 0U);
-            uint32_t reg_name = (operand & ARM_FP_REG_MASK) >> 1;
-            /* Snag the 1-bit slice and position it */
-            value = ((reg_name & 0x10) >> 4) << encoder->field_loc[i].end;
-            /* Extract and position the 4-bit slice */
-            value |= (reg_name & 0x0f) << encoder->field_loc[i].start;
-            bits |= value;
-            break;
+uint8_t* ArmMir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) {
+  for (; lir != NULL; lir = NEXT_LIR(lir)) {
+    if (!lir->flags.is_nop) {
+      int opcode = lir->opcode;
+      if (IsPseudoLirOp(opcode)) {
+        if (UNLIKELY(opcode == kPseudoPseudoAlign4)) {
+          // Note: size for this opcode will be either 0 or 2 depending on final alignment.
+          if (lir->offset & 0x2) {
+            write_pos[0] = (PADDING_MOV_R5_R5 & 0xff);
+            write_pos[1] = ((PADDING_MOV_R5_R5 >> 8) & 0xff);
+            write_pos += 2;
           }
-          case kFmtSfp:
-            DCHECK(ARM_SINGLEREG(operand));
-            /* Snag the 1-bit slice and position it */
-            value = (operand & 0x1) << encoder->field_loc[i].end;
-            /* Extract and position the 4-bit slice */
-            value |= ((operand & 0x1e) >> 1) << encoder->field_loc[i].start;
+        }
+      } else if (LIKELY(!lir->flags.is_nop)) {
+        const ArmEncodingMap *encoder = &EncodingMap[lir->opcode];
+        uint32_t bits = encoder->skeleton;
+        for (int i = 0; i < 4; i++) {
+          uint32_t operand;
+          uint32_t value;
+          operand = lir->operands[i];
+          ArmEncodingKind kind = encoder->field_loc[i].kind;
+          if (LIKELY(kind == kFmtBitBlt)) {
+            value = (operand << encoder->field_loc[i].start) &
+                ((1 << (encoder->field_loc[i].end + 1)) - 1);
             bits |= value;
-            break;
-          case kFmtImm12:
-          case kFmtModImm:
-            value = ((operand & 0x800) >> 11) << 26;
-            value |= ((operand & 0x700) >> 8) << 12;
-            value |= operand & 0x0ff;
-            bits |= value;
-            break;
-          case kFmtImm16:
-            value = ((operand & 0x0800) >> 11) << 26;
-            value |= ((operand & 0xf000) >> 12) << 16;
-            value |= ((operand & 0x0700) >> 8) << 12;
-            value |= operand & 0x0ff;
-            bits |= value;
-            break;
-          case kFmtOff24: {
-            uint32_t signbit = (operand >> 31) & 0x1;
-            uint32_t i1 = (operand >> 22) & 0x1;
-            uint32_t i2 = (operand >> 21) & 0x1;
-            uint32_t imm10 = (operand >> 11) & 0x03ff;
-            uint32_t imm11 = operand & 0x07ff;
-            uint32_t j1 = (i1 ^ signbit) ? 0 : 1;
-            uint32_t j2 = (i2 ^ signbit) ? 0 : 1;
-            value = (signbit << 26) | (j1 << 13) | (j2 << 11) | (imm10 << 16) |
-                imm11;
-            bits |= value;
+          } else {
+            switch (encoder->field_loc[i].kind) {
+              case kFmtSkip:
+                break;  // Nothing to do, but continue to next.
+              case kFmtUnused:
+                i = 4;  // Done, break out of the enclosing loop.
+                break;
+              case kFmtFPImm:
+                value = ((operand & 0xF0) >> 4) << encoder->field_loc[i].end;
+                value |= (operand & 0x0F) << encoder->field_loc[i].start;
+                bits |= value;
+                break;
+              case kFmtBrOffset:
+                value = ((operand  & 0x80000) >> 19) << 26;
+                value |= ((operand & 0x40000) >> 18) << 11;
+                value |= ((operand & 0x20000) >> 17) << 13;
+                value |= ((operand & 0x1f800) >> 11) << 16;
+                value |= (operand  & 0x007ff);
+                bits |= value;
+                break;
+              case kFmtShift5:
+                value = ((operand & 0x1c) >> 2) << 12;
+                value |= (operand & 0x03) << 6;
+                bits |= value;
+                break;
+              case kFmtShift:
+                value = ((operand & 0x70) >> 4) << 12;
+                value |= (operand & 0x0f) << 4;
+                bits |= value;
+                break;
+              case kFmtBWidth:
+                value = operand - 1;
+                bits |= value;
+                break;
+              case kFmtLsb:
+                value = ((operand & 0x1c) >> 2) << 12;
+                value |= (operand & 0x03) << 6;
+                bits |= value;
+                break;
+              case kFmtImm6:
+                value = ((operand & 0x20) >> 5) << 9;
+                value |= (operand & 0x1f) << 3;
+                bits |= value;
+                break;
+              case kFmtDfp: {
+                DCHECK(ARM_DOUBLEREG(operand));
+                DCHECK_EQ((operand & 0x1), 0U);
+                uint32_t reg_name = (operand & ARM_FP_REG_MASK) >> 1;
+                /* Snag the 1-bit slice and position it */
+                value = ((reg_name & 0x10) >> 4) << encoder->field_loc[i].end;
+                /* Extract and position the 4-bit slice */
+                value |= (reg_name & 0x0f) << encoder->field_loc[i].start;
+                bits |= value;
+                break;
+              }
+              case kFmtSfp:
+                DCHECK(ARM_SINGLEREG(operand));
+                /* Snag the 1-bit slice and position it */
+                value = (operand & 0x1) << encoder->field_loc[i].end;
+                /* Extract and position the 4-bit slice */
+                value |= ((operand & 0x1e) >> 1) << encoder->field_loc[i].start;
+                bits |= value;
+                break;
+              case kFmtImm12:
+              case kFmtModImm:
+                value = ((operand & 0x800) >> 11) << 26;
+                value |= ((operand & 0x700) >> 8) << 12;
+                value |= operand & 0x0ff;
+                bits |= value;
+                break;
+              case kFmtImm16:
+                value = ((operand & 0x0800) >> 11) << 26;
+                value |= ((operand & 0xf000) >> 12) << 16;
+                value |= ((operand & 0x0700) >> 8) << 12;
+                value |= operand & 0x0ff;
+                bits |= value;
+                break;
+              case kFmtOff24: {
+                uint32_t signbit = (operand >> 31) & 0x1;
+                uint32_t i1 = (operand >> 22) & 0x1;
+                uint32_t i2 = (operand >> 21) & 0x1;
+                uint32_t imm10 = (operand >> 11) & 0x03ff;
+                uint32_t imm11 = operand & 0x07ff;
+                uint32_t j1 = (i1 ^ signbit) ? 0 : 1;
+                uint32_t j2 = (i2 ^ signbit) ? 0 : 1;
+                value = (signbit << 26) | (j1 << 13) | (j2 << 11) | (imm10 << 16) |
+                    imm11;
+                bits |= value;
+                }
+                break;
+              default:
+                LOG(FATAL) << "Bad fmt:" << encoder->field_loc[i].kind;
             }
-            break;
-          default:
-            LOG(FATAL) << "Bad fmt:" << encoder->field_loc[i].kind;
+          }
+        }
+        if (encoder->size == 4) {
+          write_pos[0] = ((bits >> 16) & 0xff);
+          write_pos[1] = ((bits >> 24) & 0xff);
+          write_pos[2] = (bits & 0xff);
+          write_pos[3] = ((bits >> 8) & 0xff);
+          write_pos += 4;
+        } else {
+          DCHECK_EQ(encoder->size, 2);
+          write_pos[0] = (bits & 0xff);
+          write_pos[1] = ((bits >> 8) & 0xff);
+          write_pos += 2;
         }
       }
     }
-    if (encoder->size == 4) {
-      lir->u.a.bytes[0] = ((bits >> 16) & 0xff);
-      lir->u.a.bytes[1] = ((bits >> 24) & 0xff);
-      lir->u.a.bytes[2] = (bits & 0xff);
-      lir->u.a.bytes[3] = ((bits >> 8) & 0xff);
-    } else {
-      DCHECK_EQ(encoder->size, 2);
-      lir->u.a.bytes[0] = (bits & 0xff);
-      lir->u.a.bytes[1] = ((bits >> 8) & 0xff);
-    }
-    lir->flags.size = encoder->size;
   }
+  return write_pos;
 }
 
 // Assemble the LIR into binary instruction format.
@@ -1198,7 +1206,7 @@
   LIR* prev_lir;
   cu_->NewTimingSplit("Assemble");
   int assembler_retries = 0;
-  CodeOffset starting_offset = EncodeRange(first_lir_insn_, last_lir_insn_, 0);
+  CodeOffset starting_offset = LinkFixupInsns(first_lir_insn_, last_lir_insn_, 0);
   data_offset_ = (starting_offset + 0x3) & ~0x3;
   int32_t offset_adjustment;
   AssignDataOffsets();
@@ -1304,8 +1312,6 @@
               lir->operands[2] = 0;
               lir->operands[1] = base_reg;
             }
-            // Must redo encoding here - won't ever revisit this node.
-            EncodeLIR(lir);
             prev_lir = new_adr;  // Continue scan with new_adr;
             lir = new_adr->u.a.pcrel_next;
             res = kRetryAll;
@@ -1346,9 +1352,8 @@
             /* operand[0] is src1 in both cb[n]z & CmpRI8 */
             lir->operands[1] = 0;
             lir->target = 0;
-            EncodeLIR(lir);   // NOTE: sets flags.size.
+            lir->flags.size = EncodingMap[lir->opcode].size;
             // Add back the new size.
-            DCHECK_EQ(lir->flags.size, static_cast<uint32_t>(EncodingMap[lir->opcode].size));
             offset_adjustment += lir->flags.size;
             // Set up the new following inst.
             new_inst->offset = lir->offset + lir->flags.size;
@@ -1570,20 +1575,6 @@
         default:
           LOG(FATAL) << "Unexpected case " << lir->flags.fixup;
       }
-      /*
-       * If one of the pc-relative instructions expanded we'll have
-       * to make another pass.  Don't bother to fully assemble the
-       * instruction.
-       */
-      if (res == kSuccess) {
-        EncodeLIR(lir);
-        if (assembler_retries == 0) {
-          // Go ahead and fix up the code buffer image.
-          for (int i = 0; i < lir->flags.size; i++) {
-            code_buffer_[lir->offset + i] = lir->u.a.bytes[i];
-          }
-        }
-      }
       prev_lir = lir;
       lir = lir->u.a.pcrel_next;
     }
@@ -1602,21 +1593,15 @@
     }
   }
 
-  // Rebuild the CodeBuffer if we had to retry; otherwise it should be good as-is.
-  if (assembler_retries != 0) {
-    code_buffer_.clear();
-    for (LIR* lir = first_lir_insn_; lir != NULL; lir = NEXT_LIR(lir)) {
-      if (lir->flags.is_nop) {
-        continue;
-      } else  {
-        for (int i = 0; i < lir->flags.size; i++) {
-          code_buffer_.push_back(lir->u.a.bytes[i]);
-        }
-      }
-    }
-  }
+  // Build the CodeBuffer.
+  DCHECK_LE(data_offset_, total_size_);
+  code_buffer_.reserve(total_size_);
+  code_buffer_.resize(starting_offset);
+  uint8_t* write_pos = &code_buffer_[0];
+  write_pos = EncodeLIRs(write_pos, first_lir_insn_);
+  DCHECK_EQ(static_cast<CodeOffset>(write_pos - &code_buffer_[0]), starting_offset);
 
-  data_offset_ = (code_buffer_.size() + 0x3) & ~0x3;
+  DCHECK_EQ(data_offset_, (code_buffer_.size() + 0x3) & ~0x3);
 
   // Install literals
   InstallLiteralPools();
@@ -1641,19 +1626,11 @@
 }
 
 // Encode instruction bit pattern and assign offsets.
-uint32_t ArmMir2Lir::EncodeRange(LIR* head_lir, LIR* tail_lir, uint32_t offset) {
+uint32_t ArmMir2Lir::LinkFixupInsns(LIR* head_lir, LIR* tail_lir, uint32_t offset) {
   LIR* end_lir = tail_lir->next;
 
-  /*
-   * A significant percentage of methods can be assembled in a single pass.  We'll
-   * go ahead and build the code image here, leaving holes for pc-relative fixup
-   * codes.  If the code size changes during that pass, we'll have to throw away
-   * this work - but if not, we're ready to go.
-   */
-  code_buffer_.reserve(estimated_native_code_size_ + 256);  // Add a little slop.
   LIR* last_fixup = NULL;
   for (LIR* lir = head_lir; lir != end_lir; lir = NEXT_LIR(lir)) {
-    lir->offset = offset;
     if (!lir->flags.is_nop) {
       if (lir->flags.fixup != kFixupNone) {
         if (!IsPseudoLirOp(lir->opcode)) {
@@ -1675,11 +1652,7 @@
           last_fixup->u.a.pcrel_next = lir;
         }
         last_fixup = lir;
-      } else {
-        EncodeLIR(lir);
-      }
-      for (int i = 0; i < lir->flags.size; i++) {
-        code_buffer_.push_back(lir->u.a.bytes[i]);
+        lir->offset = offset;
       }
       offset += lir->flags.size;
     }
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 0f1e171..8bfdb6a 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -77,10 +77,10 @@
 
     // Required for target - miscellaneous.
     void AssembleLIR();
-    uint32_t EncodeRange(LIR* head_lir, LIR* tail_lir, uint32_t starting_offset);
+    uint32_t LinkFixupInsns(LIR* head_lir, LIR* tail_lir, CodeOffset offset);
     int AssignInsnOffsets();
     void AssignOffsets();
-    void EncodeLIR(LIR* lir);
+    static uint8_t* EncodeLIRs(uint8_t* write_pos, LIR* lir);
     void DumpResourceMask(LIR* lir, uint64_t mask, const char* prefix);
     void SetupTargetResourceMasks(LIR* lir, uint64_t flags);
     const char* GetTargetInstFmt(int opcode);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index bac35aa..8614151 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -131,7 +131,6 @@
 
 struct AssemblyInfo {
   LIR* pcrel_next;           // Chain of LIR nodes needing pc relative fixups.
-  uint8_t bytes[16];         // Encoded instruction bytes.
 };
 
 struct LIR {
@@ -151,7 +150,7 @@
   } flags;
   union {
     UseDefMasks m;               // Use & Def masks used during optimization.
-    AssemblyInfo a;              // Instruction encoding used during assembly phase.
+    AssemblyInfo a;              // Instruction info used during assembly phase.
   } u;
   int32_t operands[5];           // [0..4] = [dest, src1, src2, extra, extra2].
 };
@@ -340,7 +339,7 @@
       return code_buffer_.size() / sizeof(code_buffer_[0]);
     }
 
-    bool IsPseudoLirOp(int opcode) {
+    static bool IsPseudoLirOp(int opcode) {
       return (opcode < 0);
     }