ART: Arm32 packed-switch jump tables

Add jump table support to the thumb2 assembler. A jump table is
a collection of labels for the case targets, plus an anchor label
denoting the position of the dispatching jump.
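
The emitted dispatch is roughly the following (a sketch; rX/rY/rI are
illustrative, and the load of the table entry is issued by the code
generator, not by the assembler):

    adr   rX, table             // kLoadLiteralAddr fixup; may widen to
                                // ADR.W or MOVW(+MOVT) + ADD rX, pc
    ldr   rY, [rX, rI, lsl #2]  // offset for case index rI
  anchor:
    add   pc, pc, rY            // PC reads as anchor + 4 in Thumb state
    ...
  table:
    .4byte case0 - (anchor + 4)
    .4byte case1 - (anchor + 4)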

Use the jump table support to implement packed-switch support for
arm32.

Add tests for BindTrackedLabel and JumpTable to the thumb2 assembler
test.

Bug: 24092914
Change-Id: I5c84f193dfebf9e07f48678efc8bd151bb1410dd
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index cc87856..fb3aa1e 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -92,7 +92,7 @@
   label->BindTo(bound_pc);
 }
 
-void Thumb2Assembler::BindLiterals() {
+uint32_t Thumb2Assembler::BindLiterals() {
   // We don't add the padding here, that's done only after adjusting the Fixup sizes.
   uint32_t code_size = buffer_.Size();
   for (Literal& lit : literals_) {
@@ -100,6 +100,15 @@
     BindLabel(label, code_size);
     code_size += lit.GetSize();
   }
+  return code_size;
+}
+
+void Thumb2Assembler::BindJumpTables(uint32_t code_size) {
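+  // Jump tables are placed after the literal pool; the caller passes in the code size
+  // with the literals already included.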
+  for (JumpTable& table : jump_tables_) {
+    Label* label = table.GetLabel();
+    BindLabel(label, code_size);
+    code_size += table.GetSize();
+  }
 }
 
 void Thumb2Assembler::AdjustFixupIfNeeded(Fixup* fixup, uint32_t* current_code_size,
@@ -144,7 +153,7 @@
       AdjustFixupIfNeeded(fixup, &current_code_size, &fixups_to_recalculate);
     } while (!fixups_to_recalculate.empty());
 
-    if ((current_code_size & 2) != 0 && !literals_.empty()) {
+    if ((current_code_size & 2) != 0 && (!literals_.empty() || !jump_tables_.empty())) {
       // If we need to add padding before literals, this may just push some out of range,
       // so recalculate all load literals. This makes up for the fact that we don't mark
       // load literal as a dependency of all previous Fixups even though it actually is.
@@ -173,6 +182,13 @@
       label->Reinitialize();
       label->BindTo(old_position + literals_adjustment);
     }
+    for (JumpTable& table : jump_tables_) {
+      Label* label = table.GetLabel();
+      DCHECK(label->IsBound());
+      int old_position = label->Position();
+      label->Reinitialize();
+      label->BindTo(old_position + literals_adjustment);
+    }
   }
 
   return current_code_size;
@@ -229,6 +245,43 @@
   }
 }
 
+void Thumb2Assembler::EmitJumpTables() {
+  if (!jump_tables_.empty()) {
+    // Jump tables require 4 byte alignment. (We don't support byte and half-word jump tables.)
+    uint32_t code_size = buffer_.Size();
+    DCHECK_ALIGNED(code_size, 2);
+    if ((code_size & 2u) != 0u) {
+      Emit16(0);
+    }
+    for (JumpTable& table : jump_tables_) {
+      // Bulk ensure capacity, as this may be large.
+      size_t orig_size = buffer_.Size();
+      buffer_.ExtendCapacity(orig_size + table.GetSize());
+#ifndef NDEBUG
+      buffer_.has_ensured_capacity_ = true;
+#endif
+
+      DCHECK_EQ(static_cast<size_t>(table.GetLabel()->Position()), buffer_.Size());
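+      // Table entries are relative to the PC value read by the dispatching ADD PC, PC, rX,
+      // which in Thumb state is the anchor's address + 4.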
+      int32_t anchor_position = table.GetAnchorLabel()->Position() + 4;
+
+      for (Label* target : table.GetData()) {
+        // Ensure that the label was tracked, so that it will have the right position.
+        DCHECK(std::find(tracked_labels_.begin(), tracked_labels_.end(), target) !=
+                   tracked_labels_.end());
+
+        int32_t offset = target->Position() - anchor_position;
+        buffer_.Emit<int32_t>(offset);
+      }
+
+#ifndef NDEBUG
+      buffer_.has_ensured_capacity_ = false;
+#endif
+      size_t new_size = buffer_.Size();
+      DCHECK_LE(new_size - orig_size, table.GetSize());
+    }
+  }
+}
+
 inline int16_t Thumb2Assembler::BEncoding16(int32_t offset, Condition cond) {
   DCHECK_ALIGNED(offset, 2);
   int16_t encoding = B15 | B14;
@@ -382,12 +435,34 @@
   return B31 | B30 | B29 | B28 | B27 | B23 | B22 | B20 | (rn << 16) | (rt << 12) | offset;
 }
 
+inline int16_t Thumb2Assembler::AdrEncoding16(Register rd, int32_t offset) {
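+  // ADR (T1): |1 0 1 0 0| Rd | imm8 |, where imm8 is offset / 4.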
+  DCHECK(IsUint<10>(offset));
+  DCHECK(IsAligned<4>(offset));
+  DCHECK(!IsHighRegister(rd));
+  return B15 | B13 | (rd << 8) | (offset >> 2);
+}
+
+inline int32_t Thumb2Assembler::AdrEncoding32(Register rd, int32_t offset) {
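+  // ADR (T3, the ADD form): 11110 i 100000 1111 | 0 imm3 Rd imm8.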
+  DCHECK(IsUint<12>(offset));
+  // Bit     26: offset[11]
+  // Bits 14-12: offset[10-8]
+  // Bits   7-0: offset[7-0]
+  int32_t immediate_mask =
+      ((offset & (1 << 11)) << (26 - 11)) |
+      ((offset & (7 << 8)) << (12 - 8)) |
+      (offset & 0xFF);
+  return B31 | B30 | B29 | B28 | B25 | B19 | B18 | B17 | B16 | (rd << 8) | immediate_mask;
+}
+
 void Thumb2Assembler::FinalizeCode() {
   ArmAssembler::FinalizeCode();
-  BindLiterals();
+  uint32_t size_after_literals = BindLiterals();
+  BindJumpTables(size_after_literals);
   uint32_t adjusted_code_size = AdjustFixups();
   EmitFixups(adjusted_code_size);
   EmitLiterals();
+  FinalizeTrackedLabels();
+  EmitJumpTables();
 }
 
 bool Thumb2Assembler::ShifterOperandCanAlwaysHold(uint32_t immediate) {
@@ -1770,6 +1845,15 @@
     case kLiteralFar:
       return 14u;
 
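+    // ADR variants for loading a code address (used by jump tables):
+    //   1KiB: 16-bit ADR; 4KiB: 32-bit ADR; 64KiB: MOVW + ADD rX, PC;
+    //   Far: MOVW + MOVT + ADD rX, PC.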
+    case kLiteralAddr1KiB:
+      return 2u;
+    case kLiteralAddr4KiB:
+      return 4u;
+    case kLiteralAddr64KiB:
+      return 6u;
+    case kLiteralAddrFar:
+      return 10u;
+
     case kLongOrFPLiteral1KiB:
       return 4u;
     case kLongOrFPLiteral256KiB:
@@ -1831,6 +1915,8 @@
     case kLiteral1KiB:
     case kLiteral4KiB:
     case kLongOrFPLiteral1KiB:
+    case kLiteralAddr1KiB:
+    case kLiteralAddr4KiB:
       DCHECK(diff >= 0 || (GetSize() == kLiteral1KiB && diff == -2));
       diff += LiteralPoolPaddingSize(current_code_size);
       // Load literal instructions round down the PC+4 to a multiple of 4, so if the PC
@@ -1843,12 +1929,14 @@
     case kLiteral1MiB:
     case kLiteral64KiB:
     case kLongOrFPLiteral256KiB:
+    case kLiteralAddr64KiB:
       DCHECK_GE(diff, 4);  // The target must be at least 4 bytes after the ADD rX, PC.
       diff -= 4;        // One extra 32-bit MOV.
       diff += LiteralPoolPaddingSize(current_code_size);
       break;
     case kLiteralFar:
     case kLongOrFPLiteralFar:
+    case kLiteralAddrFar:
       DCHECK_GE(diff, 8);  // The target must be at least 4 bytes after the ADD rX, PC.
       diff -= 8;        // Extra MOVW+MOVT; both 32-bit.
       diff += LiteralPoolPaddingSize(current_code_size);
@@ -1929,6 +2017,29 @@
       // This encoding can reach any target.
       break;
 
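+    // Load-address fixups: widen the ADR step by step until the target offset fits.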
+    case kLiteralAddr1KiB:
+      DCHECK(!IsHighRegister(rn_));
+      if (IsUint<10>(GetOffset(current_code_size))) {
+        break;
+      }
+      current_code_size += IncreaseSize(kLiteralAddr4KiB);
+      FALLTHROUGH_INTENDED;
+    case kLiteralAddr4KiB:
+      if (IsUint<12>(GetOffset(current_code_size))) {
+        break;
+      }
+      current_code_size += IncreaseSize(kLiteralAddr64KiB);
+      FALLTHROUGH_INTENDED;
+    case kLiteralAddr64KiB:
+      if (IsUint<16>(GetOffset(current_code_size))) {
+        break;
+      }
+      current_code_size += IncreaseSize(kLiteralAddrFar);
+      FALLTHROUGH_INTENDED;
+    case kLiteralAddrFar:
+      // This encoding can reach any target.
+      break;
+
     case kLongOrFPLiteral1KiB:
       if (IsUint<10>(GetOffset(current_code_size))) {
         break;
@@ -2055,6 +2166,42 @@
       break;
     }
 
+    case kLiteralAddr1KiB: {
+      DCHECK(type_ == kLoadLiteralAddr);
+      int16_t encoding = AdrEncoding16(rn_, GetOffset(code_size));
+      buffer->Store<int16_t>(location_, encoding);
+      break;
+    }
+    case kLiteralAddr4KiB: {
+      DCHECK(type_ == kLoadLiteralAddr);
+      int32_t encoding = AdrEncoding32(rn_, GetOffset(code_size));
+      buffer->Store<int16_t>(location_, encoding >> 16);
+      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
+      break;
+    }
+    case kLiteralAddr64KiB: {
+      DCHECK(type_ == kLoadLiteralAddr);
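+      // MOVW materializes the 16-bit offset, then ADD rn, PC forms the final address.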
+      int32_t mov_encoding = MovwEncoding32(rn_, GetOffset(code_size));
+      int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
+      buffer->Store<int16_t>(location_, mov_encoding >> 16);
+      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
+      buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
+      break;
+    }
+    case kLiteralAddrFar: {
+      DCHECK(type_ == kLoadLiteralAddr);
+      int32_t offset = GetOffset(code_size);
+      int32_t movw_encoding = MovwEncoding32(rn_, offset & 0xffff);
+      int32_t movt_encoding = MovtEncoding32(rn_, offset & ~0xffff);
+      int16_t add_pc_encoding = AddRdnRmEncoding16(rn_, PC);
+      buffer->Store<int16_t>(location_, movw_encoding >> 16);
+      buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(movw_encoding & 0xffff));
+      buffer->Store<int16_t>(location_ + 4u, movt_encoding >> 16);
+      buffer->Store<int16_t>(location_ + 6u, static_cast<int16_t>(movt_encoding & 0xffff));
+      buffer->Store<int16_t>(location_ + 8u, add_pc_encoding);
+      break;
+    }
+
     case kLongOrFPLiteral1KiB: {
       int32_t encoding = LoadWideOrFpEncoding(PC, GetOffset(code_size));  // DCHECKs type_.
       buffer->Store<int16_t>(location_, encoding >> 16);
@@ -3260,6 +3407,25 @@
   }
 }
 
+void Thumb2Assembler::CmpConstant(Register rn, int32_t value, Condition cond) {
+  // We prefer to select the shorter code sequence rather than selecting cmp for
+  // positive values and cmn for negative ones, which would slightly improve
+  // the readability of generated code for some constants.
+  ShifterOperand shifter_op;
+  if (ShifterOperandCanHold(kNoRegister, rn, CMP, value, &shifter_op)) {
+    cmp(rn, shifter_op, cond);
+  } else if (ShifterOperandCanHold(kNoRegister, rn, CMN, -value, &shifter_op)) {
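+    // CMP rn, #value sets flags for rn - value == rn + (-value), so CMN must use the
+    // negation of value, not its bitwise complement.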
+    cmn(rn, shifter_op, cond);
+  } else {
+    CHECK(rn != IP);
+    movw(IP, Low16Bits(value), cond);
+    uint16_t value_high = High16Bits(value);
+    if (value_high != 0) {
+      movt(IP, value_high, cond);
+    }
+    cmp(rn, ShifterOperand(IP), cond);
+  }
+}
 
 void Thumb2Assembler::LoadImmediate(Register rd, int32_t value, Condition cond) {
   ShifterOperand shifter_op;
@@ -3476,5 +3642,39 @@
     b(label, NE);
   }
 }
+
+JumpTable* Thumb2Assembler::CreateJumpTable(std::vector<Label*>&& labels, Register base_reg) {
+  jump_tables_.emplace_back(std::move(labels));
+  JumpTable* table = &jump_tables_.back();
+  DCHECK(!table->GetLabel()->IsBound());
+
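+  // 16-bit ADR requires a low base register and no forced 32-bit mode; otherwise start
+  // with the 32-bit variant. AdjustFixups() widens the fixup if the table is out of range.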
+  bool use32bit = IsForced32Bit() || IsHighRegister(base_reg);
+  uint32_t location = buffer_.Size();
+  Fixup::Size size = use32bit ? Fixup::kLiteralAddr4KiB : Fixup::kLiteralAddr1KiB;
+  FixupId fixup_id = AddFixup(Fixup::LoadLiteralAddress(location, base_reg, size));
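+  // The halfword emitted at the fixup location stores the next fixup id in the label's
+  // link chain (as for literal loads); EmitFixups() overwrites it with the real encoding.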
+  Emit16(static_cast<uint16_t>(table->GetLabel()->position_));
+  table->GetLabel()->LinkTo(fixup_id);
+  if (use32bit) {
+    Emit16(0);
+  }
+  DCHECK_EQ(location + GetFixup(fixup_id)->GetSizeInBytes(), buffer_.Size());
+
+  return table;
+}
+
+void Thumb2Assembler::EmitJumpTableDispatch(JumpTable* jump_table, Register displacement_reg) {
+  CHECK(!IsForced32Bit()) << "Forced 32-bit dispatch not implemented yet";
+  // 32-bit ADD doesn't support PC as an input, so we need a two-instruction sequence:
+  //   SUB ip, ip, #0
+  //   ADD pc, ip, reg
+  // TODO: Implement.
+
+  // The anchor's position needs to be fixed up before we can compute offsets - so make it a tracked
+  // label.
+  BindTrackedLabel(jump_table->GetAnchorLabel());
+
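+  // In Thumb state the ADD reads PC as the anchor's address + 4, which is exactly the
+  // base that EmitJumpTables() subtracts from each case target.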
+  add(PC, PC, ShifterOperand(displacement_reg));
+}
+
 }  // namespace arm
 }  // namespace art