ART: Add support for constant vector literals

Add initial support for vector instructions.  Implement the ConstVector
instruction, which takes 4 words of constant data and loads them into
an XMM register.

Initially, only the ConstVector MIR opcode is implemented. Others will
be added after this one goes in.
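
An illustrative sketch of how a vectorizing pass might fill in the new
MIR (here "mir" is assumed to be an arena-allocated MIR; the register
number and constant values are placeholders):

  mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpConstVector);
  mir->dalvikInsn.vA = 128;             // number of bits in the vector
  mir->dalvikInsn.vB = 1;               // destination XMM register (xmm1)
  mir->dalvikInsn.arg[0] = 0x00000001;  // bits 0..31 of the constant
  mir->dalvikInsn.arg[1] = 0x00000002;  // bits 32..63
  mir->dalvikInsn.arg[2] = 0x00000003;  // bits 64..95
  mir->dalvikInsn.arg[3] = 0x00000004;  // bits 96..127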

Change-Id: I5c79bc8b7de9030ef1c213fc8b227debc47f6337
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 05ab8ca..5b4492f 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -126,6 +126,104 @@
   kMirOpCheck,
   kMirOpCheckPart2,
   kMirOpSelect,
+
+  // Vector opcodes:
+  // TypeSize is an encoded field giving the element type and the vector size.
+  // It is encoded as (OpSize << 16) | (number of bits in the vector).
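+  // For example, a 128-bit vector of 32-bit elements carries the OpSize of its
+  // 32-bit element type in the upper 16 bits and 128 in the lower 16 bits.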
+  //
+  // Destination and source operands are integers that will be interpreted by the
+  // backend that supports vector operations.  Backends are permitted to support only
+  // certain vector register sizes.
+  //
+  // At this point, only two-operand instructions are supported.  Three-operand
+  // instructions could be supported by using a bit in TypeSize and arg[0] where needed.
+
+  // @brief MIR to move constant data to a vector register
+  // vA: number of bits in register
+  // vB: destination
+  // arg[0..3]: up to 128 bits of data for initialization
+  kMirOpConstVector,
+
+  // @brief MIR to move a vectorized register to another
+  // vA: TypeSize
+  // vB: destination
+  // vC: source
+  kMirOpMoveVector,
+
+  // @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedMultiply,
+
+  // @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedAddition,
+
+  // @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedSubtract,
+
+  // @brief Packed shift left of units in a vector register by an immediate: vB = vB .<< vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: immediate
+  kMirOpPackedShiftLeft,
+
+  // @brief Packed signed shift right of units in a vector register by an immediate: vB = vB .>> vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: immediate
+  kMirOpPackedSignedShiftRight,
+
+  // @brief Packed unsigned shift right of units in a vector register by an immediate: vB = vB .>>> vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: immediate
+  kMirOpPackedUnsignedShiftRight,
+
+  // @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedAnd,
+
+  // @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedOr,
+
+  // @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+  // vA: TypeSize
+  // vB: destination and source
+  // vC: source
+  kMirOpPackedXor,
+
+  // @brief Reduce the packed elements of a 128-bit vector into a single VR by taking the lower bits
+  // @details The instruction does a horizontal addition of the packed elements and then adds the result to the VR
+  // vA: TypeSize
+  // vB: destination and source VR (not vector register)
+  // vC: source (vector register)
+  kMirOpPackedAddReduce,
+
+  // @brief Extract a packed element into a single VR.
+  // vA: TypeSize
+  // vB: destination VR (not vector register)
+  // vC: source (vector register)
+  // arg[0]: The index to use for extraction from vector register (which packed element)
+  kMirOpPackedReduce,
+
+  // @brief Create a vector value, with every TypeSize-sized element equal to the value in vC
+  // vA: TypeSize
+  // vB: destination vector register
+  // vC: source VR (not vector register)
+  kMirOpPackedSet,
+
   kMirOpLast,
 };
 
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index ca90a83..ba4224e 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -45,6 +45,20 @@
   "Check1",
   "Check2",
   "Select",
+  "ConstVector",
+  "MoveVector",
+  "PackedMultiply",
+  "PackedAddition",
+  "PackedSubtract",
+  "PackedShiftLeft",
+  "PackedSignedShiftRight",
+  "PackedUnsignedShiftRight",
+  "PackedAnd",
+  "PackedOr",
+  "PackedXor",
+  "PackedAddReduce",
+  "PackedReduce",
+  "PackedSet",
 };
 
 MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -798,13 +812,35 @@
                 bb->first_mir_insn ? " | " : " ");
         for (mir = bb->first_mir_insn; mir; mir = mir->next) {
             int opcode = mir->dalvikInsn.opcode;
-            fprintf(file, "    {%04x %s %s %s\\l}%s\\\n", mir->offset,
-                    mir->ssa_rep ? GetDalvikDisassembly(mir) :
-                    (opcode < kMirOpFirst) ?  Instruction::Name(mir->dalvikInsn.opcode) :
-                    extended_mir_op_names_[opcode - kMirOpFirst],
-                    (mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ",
-                    (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ",
-                    mir->next ? " | " : " ");
+            if (opcode > kMirOpSelect && opcode < kMirOpLast) {
+              if (opcode == kMirOpConstVector) {
+                fprintf(file, "    {%04x %s %d %d %d %d %d %d\\l}%s\\\n", mir->offset,
+                        extended_mir_op_names_[kMirOpConstVector - kMirOpFirst],
+                        mir->dalvikInsn.vA,
+                        mir->dalvikInsn.vB,
+                        mir->dalvikInsn.arg[0],
+                        mir->dalvikInsn.arg[1],
+                        mir->dalvikInsn.arg[2],
+                        mir->dalvikInsn.arg[3],
+                        mir->next ? " | " : " ");
+              } else {
+                fprintf(file, "    {%04x %s %d %d %d\\l}%s\\\n", mir->offset,
+                        extended_mir_op_names_[opcode - kMirOpFirst],
+                        mir->dalvikInsn.vA,
+                        mir->dalvikInsn.vB,
+                        mir->dalvikInsn.vC,
+                        mir->next ? " | " : " ");
+              }
+            } else {
+              fprintf(file, "    {%04x %s %s %s\\l}%s\\\n", mir->offset,
+                      mir->ssa_rep ? GetDalvikDisassembly(mir) :
+                      (opcode < kMirOpFirst) ?
+                        Instruction::Name(mir->dalvikInsn.opcode) :
+                        extended_mir_op_names_[opcode - kMirOpFirst],
+                      (mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ",
+                      (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ",
+                      mir->next ? " | " : " ");
+            }
         }
         fprintf(file, "  }\"];\n\n");
     } else if (bb->block_type == kExceptionHandling) {
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 784dfaf..6f81238 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1203,4 +1203,8 @@
   return loc;
 }
 
+void Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
+  LOG(FATAL) << "Unknown MIR opcode not supported on this architecture";
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 2c4ca88..10c2459 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -975,7 +975,18 @@
     case kMirOpSelect:
       GenSelect(bb, mir);
       break;
+    case kMirOpPhi:
+    case kMirOpNop:
+    case kMirOpNullCheck:
+    case kMirOpRangeCheck:
+    case kMirOpDivZeroCheck:
+    case kMirOpCheck:
+    case kMirOpCheckPart2:
+      // Ignore these known opcodes
+      break;
     default:
+      // Give the backends a chance to handle unknown extended MIR opcodes.
+      GenMachineSpecificExtendedMethodMIR(bb, mir);
       break;
   }
 }
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 507da0e..3201b60 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1180,6 +1180,14 @@
     virtual void GenFusedFPCmpBranch(BasicBlock* bb, MIR* mir, bool gt_bias, bool is_double) = 0;
     virtual void GenFusedLongCmpBranch(BasicBlock* bb, MIR* mir) = 0;
 
+    /*
+     * @brief Handle Machine Specific MIR Extended opcodes.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is not standard extended MIR.
+     * @note Base class implementation will abort for unknown opcodes.
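+     * @note See X86Mir2Lir::GenMachineSpecificExtendedMethodMIR (codegen_x86.h) for an
+     *       example backend override.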
+     */
+    virtual void GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir);
+
     /**
      * @brief Lowers the kMirOpSelect MIR into LIR.
      * @param bb The basic block in which the MIR is from.
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index c0c60d7..9200106 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -320,6 +320,11 @@
   { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0 }, "FstpsM", "[!0r,!1d]" },
   { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0 }, "FstpdM", "[!0r,!1d]" },
 
+  EXT_0F_ENCODING_MAP(Mova128,    0x66, 0x6F, REG_DEF0),
+  { kX86Mova128MR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x66, 0, 0x0F, 0x7F, 0, 0, 0, 0 }, "Mova128MR", "[!0r+!1d],!2r" },
+  { kX86Mova128AR, kArrayReg, IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x66, 0, 0x0F, 0x7F, 0, 0, 0, 0 }, "Mova128AR", "[!0r+!1r<<!2d+!3d],!4r" },
+
   EXT_0F_ENCODING_MAP(Movups,    0x0, 0x10, REG_DEF0),
   { kX86MovupsMR, kMemReg,      IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x0, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovupsMR", "[!0r+!1d],!2r" },
   { kX86MovupsAR, kArrayReg,    IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x0, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovupsAR", "[!0r+!1r<<!2d+!3d],!4r" },
@@ -1508,6 +1513,26 @@
 void X86Mir2Lir::AssignOffsets() {
   int offset = AssignInsnOffsets();
 
+  if (const_vectors_ != nullptr) {
+    /* assign offsets to vector literals */
+
+    // First, get the offset to 12 mod 16 to align to a 16-byte boundary.
+    // This will ensure that the vector is 16-byte aligned, as the procedure is
+    // always aligned at 4 mod 16.
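+    // For example (illustrative), if offset is 0x26 (6 mod 16), the padding is
+    // 12 - 6 = 6 bytes, leaving offset at 0x2C (12 mod 16), so the procedure start
+    // (4 mod 16) plus the offset falls on a 16-byte boundary.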
+    int align_size = (16-4) - (offset & 0xF);
+    if (align_size < 0) {
+      align_size += 16;
+    }
+
+    offset += align_size;
+
+    // Now assign each literal the right offset.
+    for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
+      p->offset = offset;
+      offset += 16;
+    }
+  }
+
   /* Const values have to be word aligned */
   offset = RoundUp(offset, 4);
 
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 47d1792..cc0e1f2 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -408,6 +408,22 @@
     bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
 
     /*
+     * @brief Load a 128-bit constant into a vector register.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpConstVector.
+     * @note vA is the TypeSize for the register.
+     * @note vB is the destination XMM register. arg[0..3] are 32-bit constant values.
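+     * @note An all-zero constant is materialized with a register xor (kX86XorpsRR);
+     *       otherwise an aligned 128-bit load (kX86Mova128RM) from the method's
+     *       constant vector area is generated.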
+     */
+    void GenConst128(BasicBlock* bb, MIR* mir);
+
+    /*
+     * @brief Generate code for a vector opcode.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is a non-standard opcode.
+     */
+    void GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir);
+
+    /*
      * @brief Return the correct x86 opcode for the Dex operation
      * @param op Dex opcode for the operation
      * @param loc Register location of the operand
@@ -613,6 +629,22 @@
 
     // 64-bit mode
     bool gen64bit_;
+
+    // The list of const vector literals.
+    LIR *const_vectors_;
+
+    /*
+     * @brief Search for a matching vector literal
+     * @param mir A kMirOpConstVector MIR instruction to match.
+     * @returns pointer to matching LIR constant, or nullptr if not found.
+     */
+    LIR *ScanVectorLiteral(MIR *mir);
+
+    /*
+     * @brief Add a constant vector literal
+     * @param mir A kMirOpConstVector MIR instruction whose constant data should be added.
+     * @returns pointer to the newly created LIR constant.
+     */
+    LIR *AddVectorLiteral(MIR *mir);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 2e6bfde..237c68c 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -641,13 +641,15 @@
       method_address_insns_(arena, 100, kGrowableArrayMisc),
       class_type_address_insns_(arena, 100, kGrowableArrayMisc),
       call_method_insns_(arena, 100, kGrowableArrayMisc),
-      stack_decrement_(nullptr), stack_increment_(nullptr), gen64bit_(gen64bit) {
+      stack_decrement_(nullptr), stack_increment_(nullptr), gen64bit_(gen64bit),
+      const_vectors_(nullptr) {
+  store_method_addr_used_ = false;
   if (kIsDebugBuild) {
     for (int i = 0; i < kX86Last; i++) {
       if (X86Mir2Lir::EncodingMap[i].opcode != i) {
         LOG(FATAL) << "Encoding order for " << X86Mir2Lir::EncodingMap[i].name
-            << " is wrong: expecting " << i << ", seeing "
-            << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
+                   << " is wrong: expecting " << i << ", seeing "
+                   << static_cast<int>(X86Mir2Lir::EncodingMap[i].opcode);
       }
     }
   }
@@ -838,12 +840,46 @@
   return call;
 }
 
+/*
+ * @brief Enter a 32-bit quantity into a buffer
+ * @param buf buffer.
+ * @param data Data value.
+ */
+static void PushWord(std::vector<uint8_t>& buf, int32_t data) {
+  buf.push_back(data & 0xff);
+  buf.push_back((data >> 8) & 0xff);
+  buf.push_back((data >> 16) & 0xff);
+  buf.push_back((data >> 24) & 0xff);
+}
+
 void X86Mir2Lir::InstallLiteralPools() {
   // These are handled differently for x86.
   DCHECK(code_literal_list_ == nullptr);
   DCHECK(method_literal_list_ == nullptr);
   DCHECK(class_literal_list_ == nullptr);
 
+  // Align to a 16-byte boundary.  We have implicit knowledge that the start of the
+  // method is on a 4-byte boundary.  There is no good way to verify this assumption
+  // other than that the aligned loads will fault at runtime if it changes.
+  if (const_vectors_ != nullptr) {
+    int align_size = (16-4) - (code_buffer_.size() & 0xF);
+    if (align_size < 0) {
+      align_size += 16;
+    }
+
+    while (align_size > 0) {
+      code_buffer_.push_back(0);
+      align_size--;
+    }
+    for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
+      PushWord(code_buffer_, p->operands[0]);
+      PushWord(code_buffer_, p->operands[1]);
+      PushWord(code_buffer_, p->operands[2]);
+      PushWord(code_buffer_, p->operands[3]);
+    }
+  }
+
   // Handle the fixups for methods.
   for (uint32_t i = 0; i < method_address_insns_.Size(); i++) {
       LIR* p = method_address_insns_.Get(i);
@@ -1074,18 +1110,6 @@
 }
 
 /*
- * @brief Enter a 32 bit quantity into the FDE buffer
- * @param buf FDE buffer.
- * @param data Data value.
- */
-static void PushWord(std::vector<uint8_t>&buf, int data) {
-  buf.push_back(data & 0xff);
-  buf.push_back((data >> 8) & 0xff);
-  buf.push_back((data >> 16) & 0xff);
-  buf.push_back((data >> 24) & 0xff);
-}
-
-/*
  * @brief Enter an 'advance LOC' into the FDE buffer
  * @param buf FDE buffer.
  * @param increment Amount by which to increase the current location.
@@ -1235,4 +1259,73 @@
   return cfi_info;
 }
 
+void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
+  switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpConstVector:
+      GenConst128(bb, mir);
+      break;
+    default:
+      break;
+  }
+}
+
+void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
+  int type_size = mir->dalvikInsn.vA;
+  // We only support 128-bit vectors.
+  DCHECK_EQ(type_size & 0xFFFF, 128);
+  int reg = mir->dalvikInsn.vB;
+  DCHECK_LT(reg, 8);
+  uint32_t *args = mir->dalvikInsn.arg;
+  // Check for all 0 case.
+  if (args[0] == 0 && args[1] == 0 && args[2] == 0 && args[3] == 0) {
+    NewLIR2(kX86XorpsRR, reg, reg);
+    return;
+  }
+  // Okay, load it from the constant vector area.
+  LIR *data_target = ScanVectorLiteral(mir);
+  if (data_target == nullptr) {
+    data_target = AddVectorLiteral(mir);
+  }
+
+  // Address the start of the method.
+  RegLocation rl_method = mir_graph_->GetRegLocation(base_of_code_->s_reg_low);
+  rl_method = LoadValue(rl_method, kCoreReg);
+
+  // Load the proper value from the literal area.
+  // We don't yet know the proper offset for the value, so pick one that forces the
+  // generation of a 4-byte displacement.  The assembler will fix it up later with
+  // the right value.
+  LIR *load = NewLIR3(kX86Mova128RM, reg, rl_method.reg.GetReg(), 256 /* bogus */);
+  load->flags.fixup = kFixupLoad;
+  load->target = data_target;
+  SetMemRefType(load, true, kLiteral);
+}
+
+LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
+  int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
+  for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
+    if (args[0] == p->operands[0] && args[1] == p->operands[1] &&
+        args[2] == p->operands[2] && args[3] == p->operands[3]) {
+      return p;
+    }
+  }
+  return nullptr;
+}
+
+LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
+  LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData));
+  int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
+  new_value->operands[0] = args[0];
+  new_value->operands[1] = args[1];
+  new_value->operands[2] = args[2];
+  new_value->operands[3] = args[3];
+  new_value->next = const_vectors_;
+  if (const_vectors_ == nullptr) {
+    estimated_native_code_size_ += 12;  // Amount needed to align to 16 byte boundary.
+  }
+  estimated_native_code_size_ += 16;  // Space for one vector.
+  const_vectors_ = new_value;
+  return new_value;
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index fb85318..e9592a6 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -866,6 +866,9 @@
     case kMirOpFusedCmpgDouble:
       AnalyzeFPInstruction(opcode, bb, mir);
       break;
+    case kMirOpConstVector:
+      store_method_addr_ = true;
+      break;
     default:
       // Ignore the rest.
       break;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index c8c2542..adfed0c 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -425,6 +425,8 @@
   kX86Fild64M,                  // push 64-bit integer on x87 stack
   kX86Fstp32M,                  // pop top x87 fp stack and do 32-bit store
   kX86Fstp64M,                  // pop top x87 fp stack and do 64-bit store
+  Binary0fOpCode(kX86Mova128),  // load aligned 128 bits from xmm2/m128 to xmm1
+  kX86Mova128MR, kX86Mova128AR,  // store aligned 128 bits from xmm1 to m128
   Binary0fOpCode(kX86Movups),   // load unaligned packed single FP values from xmm2/m128 to xmm1
   kX86MovupsMR, kX86MovupsAR,   // store unaligned packed single FP values from xmm1 to m128
   Binary0fOpCode(kX86Movaps),   // load aligned packed single FP values from xmm2/m128 to xmm1