Diffstat (limited to 'compiler')
-rw-r--r--  compiler/dex/backend.h                  |  19
-rw-r--r--  compiler/dex/compiler_enums.h           |  92
-rw-r--r--  compiler/dex/mir_dataflow.cc            |  48
-rw-r--r--  compiler/dex/mir_graph.cc               |   5
-rwxr-xr-x  compiler/dex/quick/gen_invoke.cc        |   6
-rw-r--r--  compiler/dex/quick/ralloc_util.cc       |   2
-rw-r--r--  compiler/dex/quick/x86/assemble_x86.cc  |  36
-rw-r--r--  compiler/dex/quick/x86/call_x86.cc      |  32
-rw-r--r--  compiler/dex/quick/x86/codegen_x86.h    |  37
-rwxr-xr-x  compiler/dex/quick/x86/target_x86.cc    | 470
-rw-r--r--  compiler/dex/quick/x86/x86_lir.h        |   5
-rw-r--r--  compiler/driver/compiler_driver.cc      |   8
-rw-r--r--  compiler/elf_writer_quick.cc            |   2
-rw-r--r--  compiler/image_writer.cc                |  13
14 files changed, 599 insertions, 176 deletions
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h
index 596b3c9802..1f24849257 100644
--- a/compiler/dex/backend.h
+++ b/compiler/dex/backend.h
@@ -28,6 +28,25 @@ class Backend {
virtual void Materialize() = 0;
virtual CompiledMethod* GetCompiledMethod() = 0;
+ // Queries for backend support for vectors
+ /*
+ * Return the number of bits in a vector register.
+ * @return 0 if vector registers are not supported, or the
+ * number of bits in the vector register if supported.
+ */
+ virtual int VectorRegisterSize() { return 0; }
+
+ /*
+ * Return the number of reservable vector registers supported
+   * @param fp_used 'true' if floating point computations will be
+ * executed while vector registers are reserved.
+ * @return the number of vector registers that are available
+ * @note The backend should ensure that sufficient vector registers
+ * are held back to generate scalar code without exhausting vector
+ * registers, if scalar code also uses the vector registers.
+ */
+ virtual int NumReservableVectorRegisters(bool fp_used) { return 0; }
+
protected:
explicit Backend(ArenaAllocator* arena) : arena_(arena) {}
ArenaAllocator* const arena_;
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index caecb7a48e..799a742032 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -133,91 +133,101 @@ enum ExtendedMIROpcode {
// could be supported by using a bit in TypeSize and arg[0] where needed.
// @brief MIR to move constant data to a vector register
- // vA: number of bits in register
- // vB: destination
+ // vA: destination
+ // vB: number of bits in register
// args[0]~args[3]: up to 128 bits of data for initialization
kMirOpConstVector,
// @brief MIR to move a vectorized register to another
- // vA: TypeSize
- // vB: destination
- // vC: source
+ // vA: destination
+ // vB: source
+ // vC: TypeSize
kMirOpMoveVector,
// @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedMultiply,
// @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedAddition,
// @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedSubtract,
// @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: immediate
+ // vA: destination and source
+ // vB: amount to shift
+ // vC: TypeSize
kMirOpPackedShiftLeft,
// @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: immediate
+ // vA: destination and source
+ // vB: amount to shift
+ // vC: TypeSize
kMirOpPackedSignedShiftRight,
// @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: immediate
+ // vA: destination and source
+ // vB: amount to shift
+ // vC: TypeSize
kMirOpPackedUnsignedShiftRight,
// @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedAnd,
// @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedOr,
// @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
- // vA: TypeSize
- // vB: destination and source
- // vC: source
+ // vA: destination and source
+ // vB: source
+ // vC: TypeSize
kMirOpPackedXor,
// @brief Reduce a 128-bit packed element into a single VR by taking lower bits
// @details Instruction does a horizontal addition of the packed elements and then adds it to VR
- // vA: TypeSize
- // vB: destination and source VR (not vector register)
- // vC: source (vector register)
+ // vA: destination and source VR (not vector register)
+ // vB: source (vector register)
+ // vC: TypeSize
kMirOpPackedAddReduce,
// @brief Extract a packed element into a single VR.
- // vA: TypeSize
- // vB: destination VR (not vector register)
- // vC: source (vector register)
+ // vA: destination VR (not vector register)
+ // vB: source (vector register)
+ // vC: TypeSize
// arg[0]: The index to use for extraction from vector register (which packed element)
kMirOpPackedReduce,
// @brief Create a vector value, with all TypeSize values equal to vC
- // vA: TypeSize
- // vB: destination vector register
- // vC: source VR (not vector register)
+ // vA: destination vector register
+ // vB: source VR (not vector register)
+ // vC: TypeSize
kMirOpPackedSet,
+ // @brief Reserve N vector registers (named 0..N-1)
+ // vA: Number of registers
+  // @note: The backend may choose how to map the vector register numbers used in vector opcodes.
+  //        Reserved registers are removed from the backend's temporary register pool.
+ kMirOpReserveVectorRegisters,
+
+  // @brief Free reserved vector registers
+ // @note: All currently reserved vector registers are returned to the temporary pool.
+ kMirOpReturnVectorRegisters,
+
kMirOpLast,
};
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 9fea709568..bc99a272a6 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -840,6 +840,54 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = {
// 113 MIR_SELECT
DF_DA | DF_UB,
+
+ // 114 MirOpConstVector
+ DF_DA,
+
+ // 115 MirOpMoveVector
+ 0,
+
+ // 116 MirOpPackedMultiply
+ 0,
+
+ // 117 MirOpPackedAddition
+ 0,
+
+ // 118 MirOpPackedSubtract
+ 0,
+
+ // 119 MirOpPackedShiftLeft
+ 0,
+
+ // 120 MirOpPackedSignedShiftRight
+ 0,
+
+ // 121 MirOpPackedUnsignedShiftRight
+ 0,
+
+ // 122 MirOpPackedAnd
+ 0,
+
+ // 123 MirOpPackedOr
+ 0,
+
+ // 124 MirOpPackedXor
+ 0,
+
+ // 125 MirOpPackedAddReduce
+ DF_DA | DF_UA,
+
+ // 126 MirOpPackedReduce
+ DF_DA,
+
+ // 127 MirOpPackedSet
+ DF_UB,
+
+ // 128 MirOpReserveVectorRegisters
+ 0,
+
+ // 129 MirOpReturnVectorRegisters
+ 0,
};
/* Return the base virtual register for a SSA name */
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 71bb2c6166..5741b0b8ff 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -62,6 +62,8 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = {
"PackedAddReduce",
"PackedReduce",
"PackedSet",
+ "ReserveVectorRegisters",
+ "ReturnVectorRegisters",
};
MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -910,12 +912,13 @@ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suff
mir->next ? " | " : " ");
}
} else {
- fprintf(file, " {%04x %s %s %s\\l}%s\\\n", mir->offset,
+ fprintf(file, " {%04x %s %s %s %s\\l}%s\\\n", mir->offset,
mir->ssa_rep ? GetDalvikDisassembly(mir) :
!IsPseudoMirOp(opcode) ? Instruction::Name(mir->dalvikInsn.opcode) :
extended_mir_op_names_[opcode - kMirOpFirst],
(mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ",
(mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ",
+ (mir->optimization_flags & MIR_IGNORE_SUSPEND_CHECK) != 0 ? " no_suspendcheck" : " ",
mir->next ? " | " : " ");
}
}
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 02f39ac180..6c0dfe80a6 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1638,6 +1638,12 @@ bool Mir2Lir::GenInlinedStringCompareTo(CallInfo* info) {
bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
RegLocation rl_dest = InlineTarget(info);
+
+ // Early exit if the result is unused.
+ if (rl_dest.orig_sreg < 0) {
+ return true;
+ }
+
RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
switch (cu_->instruction_set) {
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 13bd4432d7..e8fc919d5f 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1276,7 +1276,7 @@ void Mir2Lir::DoPromotion() {
if (cu_->instruction_set == kThumb2) {
bool wide = fp_regs[i].s_reg & STARTING_WIDE_SREG;
if (wide) {
- if (promotion_map_[p_map_idx + 1].fp_location == kLocPhysReg) {
+ if (promotion_map_[p_map_idx + 1].fp_location != kLocPhysReg) {
// Ignore result - if can't alloc double may still be able to alloc singles.
AllocPreservedDouble(low_sreg);
}
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 7baf2d9663..4e973d8b48 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -327,21 +327,11 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-#define EXT_0F_REX_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \
-{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
-{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-
#define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \
{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-#define EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \
-{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
-{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-
#define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \
{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
@@ -405,9 +395,12 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
EXT_0F_ENCODING_MAP(Haddpd, 0x66, 0x7C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Haddps, 0xF2, 0x7C, REG_DEF0_USE0),
- { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
- { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
- { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
+ { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
+ { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
+ { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
+  { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextrbMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x15, 0, 0, 1, false }, "PextrwMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextrdMRI", "[!0r+!1d],!2r,!3d" },
{ kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" },
{ kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuffRRI", "!0r,!1r,!2d" },
@@ -499,10 +492,10 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
EXT_0F_ENCODING_MAP(Movsx8, 0x00, 0xBE, REG_DEF0),
EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0),
- EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movzx8q, 0xB6, REG_DEF0),
- EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movzx16q, 0xB7, REG_DEF0),
- EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movsx8q, 0xBE, REG_DEF0),
- EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movsx16q, 0xBF, REG_DEF0),
+ EXT_0F_ENCODING_MAP(Movzx8q, REX_W, 0xB6, REG_DEF0),
+ EXT_0F_ENCODING_MAP(Movzx16q, REX_W, 0xB7, REG_DEF0),
+ EXT_0F_ENCODING_MAP(Movsx8q, REX, 0xBE, REG_DEF0),
+ EXT_0F_ENCODING_MAP(Movsx16q, REX_W, 0xBF, REG_DEF0),
#undef EXT_0F_ENCODING_MAP
{ kX86Jcc8, kJcc, IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP | USES_CCODES, { 0, 0, 0x70, 0, 0, 0, 0, 0, false }, "Jcc8", "!1c !0t" },
@@ -627,7 +620,8 @@ size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int
if (registers_need_rex_prefix) {
DCHECK(cu_->target64) << "Attempt to use a 64-bit only addressable register "
<< RegStorage::RegNum(raw_reg) << " with instruction " << entry->name;
- if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
+ if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W
+ && entry->skeleton.prefix1 != REX && entry->skeleton.prefix2 != REX) {
++size; // rex
}
}
@@ -906,7 +900,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
// 64 bit addresses by GS, not FS.
code_buffer_.push_back(THREAD_PREFIX_GS);
} else {
- if (entry->skeleton.prefix1 == REX_W) {
+ if (entry->skeleton.prefix1 == REX_W || entry->skeleton.prefix1 == REX) {
+ DCHECK(cu_->target64);
rex |= entry->skeleton.prefix1;
code_buffer_.push_back(rex);
rex = 0;
@@ -915,7 +910,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
}
}
if (entry->skeleton.prefix2 != 0) {
- if (entry->skeleton.prefix2 == REX_W) {
+    if (entry->skeleton.prefix2 == REX_W || entry->skeleton.prefix2 == REX) {
+ DCHECK(cu_->target64);
rex |= entry->skeleton.prefix2;
code_buffer_.push_back(rex);
rex = 0;
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 6ca220cb2e..9000514856 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -94,13 +94,10 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
start_of_method_reg = rl_method.reg;
store_method_addr_used_ = true;
} else {
- if (cu_->target64) {
- start_of_method_reg = AllocTempWide();
- } else {
- start_of_method_reg = AllocTemp();
- }
+ start_of_method_reg = AllocTempRef();
NewLIR1(kX86StartOfMethod, start_of_method_reg.GetReg());
}
+ DCHECK_EQ(start_of_method_reg.Is64Bit(), cu_->target64);
int low_key = s4FromSwitchData(&table[2]);
RegStorage keyReg;
// Remove the bias, if necessary
@@ -111,7 +108,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
OpRegRegImm(kOpSub, keyReg, rl_src.reg, low_key);
}
// Bounds check - if < 0 or >= size continue following switch
- OpRegImm(kOpCmp, keyReg, size-1);
+ OpRegImm(kOpCmp, keyReg, size - 1);
LIR* branch_over = OpCondBranch(kCondHi, NULL);
// Load the displacement from the switch table
@@ -119,11 +116,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
NewLIR5(kX86PcRelLoadRA, disp_reg.GetReg(), start_of_method_reg.GetReg(), keyReg.GetReg(),
2, WrapPointer(tab_rec));
// Add displacement to start of method
- if (cu_->target64) {
- NewLIR2(kX86Add64RR, start_of_method_reg.GetReg(), disp_reg.GetReg());
- } else {
- OpRegReg(kOpAdd, start_of_method_reg, disp_reg);
- }
+ OpRegReg(kOpAdd, start_of_method_reg, cu_->target64 ? As64BitReg(disp_reg) : disp_reg);
// ..and go!
LIR* switch_branch = NewLIR1(kX86JmpR, start_of_method_reg.GetReg());
tab_rec->anchor = switch_branch;
@@ -174,7 +167,6 @@ void X86Mir2Lir::GenFillArrayData(DexOffset table_offset, RegLocation rl_src) {
}
store_method_addr_used_ = true;
} else {
- // TODO(64) force to be 64-bit
NewLIR1(kX86StartOfMethod, method_start.GetReg());
}
NewLIR2(kX86PcRelAdr, payload.GetReg(), WrapPointer(tab_rec));
@@ -193,8 +185,8 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
Thread::ExceptionOffset<8>().Int32Value() :
Thread::ExceptionOffset<4>().Int32Value();
RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
- NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
- NewLIR2(kX86Mov32TI, ex_offset, 0);
+ NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
+ NewLIR2(cu_->target64 ? kX86Mov64TI : kX86Mov32TI, ex_offset, 0);
StoreValue(rl_dest, rl_result);
}
@@ -202,17 +194,15 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
* Mark garbage collection card. Skip if the value we're storing is null.
*/
void X86Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
- RegStorage reg_card_base = AllocTemp();
- RegStorage reg_card_no = AllocTemp();
+ DCHECK_EQ(tgt_addr_reg.Is64Bit(), cu_->target64);
+ DCHECK_EQ(val_reg.Is64Bit(), cu_->target64);
+ RegStorage reg_card_base = AllocTempRef();
+ RegStorage reg_card_no = AllocTempRef();
LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
int ct_offset = cu_->target64 ?
Thread::CardTableOffset<8>().Int32Value() :
Thread::CardTableOffset<4>().Int32Value();
- if (cu_->target64) {
- NewLIR2(kX86Mov64RT, reg_card_base.GetReg(), ct_offset);
- } else {
- NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
- }
+ NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
LIR* target = NewLIR0(kPseudoTargetLabel);
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 55e5993dce..ff7b30eeec 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -28,7 +28,7 @@ class X86Mir2Lir : public Mir2Lir {
protected:
class InToRegStorageMapper {
public:
- virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide) = 0;
+ virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) = 0;
virtual ~InToRegStorageMapper() {}
};
@@ -36,7 +36,7 @@ class X86Mir2Lir : public Mir2Lir {
public:
explicit InToRegStorageX86_64Mapper(Mir2Lir* ml) : ml_(ml), cur_core_reg_(0), cur_fp_reg_(0) {}
virtual ~InToRegStorageX86_64Mapper() {}
- virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide);
+ virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref);
protected:
Mir2Lir* ml_;
private:
@@ -118,6 +118,8 @@ class X86Mir2Lir : public Mir2Lir {
void FreeCallTemps();
void LockCallTemps();
void CompilerInitializeRegAlloc();
+ int VectorRegisterSize();
+ int NumReservableVectorRegisters(bool fp_used);
// Required for target - miscellaneous.
void AssembleLIR();
@@ -503,6 +505,11 @@ class X86Mir2Lir : public Mir2Lir {
void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
int64_t val, ConditionCode ccode);
void GenConstWide(RegLocation rl_dest, int64_t value);
+ void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
+ void GenShiftByteVector(BasicBlock *bb, MIR *mir);
+ void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+ void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+ void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir);
static bool ProvidesFullMemoryBarrier(X86OpCode opcode);
@@ -513,6 +520,12 @@ class X86Mir2Lir : public Mir2Lir {
virtual RegStorage AllocateByteRegister();
/*
+   * @brief Obtain a 128-bit view of a wide temporary, for use as a vector register.
+   * @param reg the wide temporary register to view.
+   * @returns a 128-bit temporary register aliasing reg.
+ */
+ virtual RegStorage Get128BitRegister(RegStorage reg);
+
+ /*
* @brief Check if a register is byte addressable.
* @returns true if a register is byte addressable.
*/
@@ -528,6 +541,22 @@ class X86Mir2Lir : public Mir2Lir {
*/
bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
+ /**
+ * @brief Reserve a fixed number of vector registers from the register pool
+ * @details The mir->dalvikInsn.vA specifies an N such that vector registers
+ * [0..N-1] are removed from the temporary pool. The caller must call
+ * ReturnVectorRegisters before calling ReserveVectorRegisters again.
+   * Also sets num_reserved_vector_regs_ to the specified value.
+   * @param mir MIR whose vA specifies the number of registers to reserve
+ */
+ void ReserveVectorRegisters(MIR* mir);
+
+ /**
+ * @brief Return all the reserved vector registers to the temp pool
+   * @details Returns vector registers [0..num_reserved_vector_regs_-1] to the temp pool
+ */
+ void ReturnVectorRegisters();
+
/*
* @brief Load 128 bit constant into vector register.
* @param bb The basic block in which the MIR is from.
@@ -901,6 +930,10 @@ class X86Mir2Lir : public Mir2Lir {
LIR *AddVectorLiteral(MIR *mir);
InToRegStorageMapping in_to_reg_storage_mapping_;
+
+ private:
+  // The number of vector registers [0..N-1] reserved by a call to ReserveVectorRegisters
+ int num_reserved_vector_regs_;
};
} // namespace art
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 43882c2e02..e81f505f2f 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -427,6 +427,10 @@ RegStorage X86Mir2Lir::AllocateByteRegister() {
return reg;
}
+RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
+ return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+}
+
bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
return cu_->target64 || reg.GetRegNum() < rs_rX86_SP.GetRegNum();
}
@@ -646,6 +650,14 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() {
reg_pool_->next_dp_reg_ = 1;
}
+int X86Mir2Lir::VectorRegisterSize() {
+ return 128;
+}
+
+int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
+ return fp_used ? 5 : 7;
+}
+
void X86Mir2Lir::SpillCoreRegs() {
if (num_core_spills_ == 0) {
return;
@@ -790,6 +802,9 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator*
rX86_RET1 = rDX;
rX86_INVOKE_TGT = rAX;
rX86_COUNT = rCX;
+
+ // Initialize the number of reserved vector registers
+ num_reserved_vector_regs_ = -1;
}
Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
@@ -1475,6 +1490,12 @@ std::vector<uint8_t>* X86Mir2Lir::ReturnCallFrameInformation() {
void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+ case kMirOpReserveVectorRegisters:
+ ReserveVectorRegisters(mir);
+ break;
+ case kMirOpReturnVectorRegisters:
+ ReturnVectorRegisters();
+ break;
case kMirOpConstVector:
GenConst128(bb, mir);
break;
@@ -1522,11 +1543,57 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
}
}
+void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
+ // We should not try to reserve twice without returning the registers
+  DCHECK_EQ(num_reserved_vector_regs_, -1);
+
+ int num_vector_reg = mir->dalvikInsn.vA;
+ for (int i = 0; i < num_vector_reg; i++) {
+ RegStorage xp_reg = RegStorage::Solo128(i);
+ RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+ Clobber(xp_reg);
+
+ for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+ info != nullptr;
+ info = info->GetAliasChain()) {
+ if (info->GetReg().IsSingle()) {
+ reg_pool_->sp_regs_.Delete(info);
+ } else {
+ reg_pool_->dp_regs_.Delete(info);
+ }
+ }
+ }
+
+ num_reserved_vector_regs_ = num_vector_reg;
+}
+
+void X86Mir2Lir::ReturnVectorRegisters() {
+ // Return all the reserved registers
+ for (int i = 0; i < num_reserved_vector_regs_; i++) {
+ RegStorage xp_reg = RegStorage::Solo128(i);
+ RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+
+ for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+ info != nullptr;
+ info = info->GetAliasChain()) {
+ if (info->GetReg().IsSingle()) {
+ reg_pool_->sp_regs_.Insert(info);
+ } else {
+ reg_pool_->dp_regs_.Insert(info);
+ }
+ }
+ }
+
+  // We don't have any more reserved vector registers.
+ num_reserved_vector_regs_ = -1;
+}
+
void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
- int type_size = mir->dalvikInsn.vA;
+ store_method_addr_used_ = true;
+ int type_size = mir->dalvikInsn.vB;
// We support 128 bit vectors.
DCHECK_EQ(type_size & 0xFFFF, 128);
- RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
uint32_t *args = mir->dalvikInsn.arg;
int reg = rs_dest.GetReg();
// Check for all 0 case.
@@ -1534,6 +1601,12 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
NewLIR2(kX86XorpsRR, reg, reg);
return;
}
+
+ // Append the mov const vector to reg opcode.
+ AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+}
+
+void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
// Okay, load it from the constant vector area.
LIR *data_target = ScanVectorLiteral(mir);
if (data_target == nullptr) {
@@ -1553,24 +1626,66 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
// 4 byte offset. We will fix this up in the assembler later to have the right
// value.
ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
- LIR *load = NewLIR3(kX86Mova128RM, reg, rl_method.reg.GetReg(), 256 /* bogus */);
+ LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
load->flags.fixup = kFixupLoad;
load->target = data_target;
}
void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
}
+void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
+ const int BYTE_SIZE = 8;
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide());
+
+ /*
+ * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM
+ * and multiplying 8 at a time before recombining back into one XMM register.
+ *
+ * let xmm1, xmm2 be real srcs (keep low bits of 16bit lanes)
+ * xmm3 is tmp (operate on high bits of 16bit lanes)
+ *
+ * xmm3 = xmm1
+ * xmm1 = xmm1 .* xmm2
+ * xmm1 = xmm1 & 0x00ff00ff00ff00ff00ff00ff00ff00ff // xmm1 now has low bits
+ * xmm3 = xmm3 .>> 8
+ * xmm2 = xmm2 & 0xff00ff00ff00ff00ff00ff00ff00ff00
+ * xmm2 = xmm2 .* xmm3 // xmm2 now has high bits
+ * xmm1 = xmm1 | xmm2 // combine results
+ */
+
+ // Copy xmm1.
+ NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg());
+
+ // Multiply low bits.
+ NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+
+ // xmm1 now has low bits.
+ AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
+
+ // Prepare high bits for multiplication.
+ NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE);
+ AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+
+ // Multiply high bits and xmm2 now has high bits.
+ NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg());
+
+ // Combine back into dest XMM register.
+ NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
@@ -1585,6 +1700,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
case kDouble:
opcode = kX86MulpdRR;
break;
+ case kSignedByte:
+ // HW doesn't support 16x16 byte multiplication so emulate it.
+ GenMultiplyVectorSignedByte(bb, mir);
+ return;
default:
LOG(FATAL) << "Unsupported vector multiply " << opsize;
break;
@@ -1593,10 +1712,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
}
void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
@@ -1624,10 +1743,10 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
}
void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
@@ -1654,11 +1773,60 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
+void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
+
+ int opcode = 0;
+ int imm = mir->dalvikInsn.vB;
+
+ switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+ case kMirOpPackedShiftLeft:
+ opcode = kX86PsllwRI;
+ break;
+ case kMirOpPackedSignedShiftRight:
+ opcode = kX86PsrawRI;
+ break;
+ case kMirOpPackedUnsignedShiftRight:
+ opcode = kX86PsrlwRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode;
+ break;
+ }
+
+ /*
+ * xmm1 will have low bits
+ * xmm2 will have high bits
+ *
+ * xmm2 = xmm1
+ * xmm1 = xmm1 .<< N
+   * xmm2 = xmm2 & 0xFF00FF00FF00FF00FF00FF00FF00FF00
+ * xmm2 = xmm2 .<< N
+ * xmm1 = xmm1 | xmm2
+ */
+
+ // Copy xmm1.
+ NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+
+ // Shift lower values.
+ NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+ // Mask bottom bits.
+ AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+
+ // Shift higher values.
+ NewLIR2(opcode, rs_tmp.GetReg(), imm);
+
+ // Combine back into dest XMM register.
+ NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+}
+
void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- int imm = mir->dalvikInsn.vC;
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
case k32:
@@ -1671,6 +1839,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
case kUnsignedHalf:
opcode = kX86PsllwRI;
break;
+ case kSignedByte:
+ case kUnsignedByte:
+ GenShiftByteVector(bb, mir);
+ return;
default:
LOG(FATAL) << "Unsupported vector shift left " << opsize;
break;
@@ -1679,10 +1851,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
}
void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- int imm = mir->dalvikInsn.vC;
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
case k32:
@@ -1692,6 +1864,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
case kUnsignedHalf:
opcode = kX86PsrawRI;
break;
+ case kSignedByte:
+ case kUnsignedByte:
+ GenShiftByteVector(bb, mir);
+ return;
default:
LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
break;
@@ -1700,10 +1876,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
}
void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- int imm = mir->dalvikInsn.vC;
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
case k32:
@@ -1716,6 +1892,10 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
case kUnsignedHalf:
opcode = kX86PsrlwRI;
break;
+ case kSignedByte:
+ case kUnsignedByte:
+ GenShiftByteVector(bb, mir);
+ return;
default:
LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize;
break;
@@ -1725,91 +1905,209 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
// We only support 128 bit registers.
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
+void X86Mir2Lir::AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4) {
+ MaskVectorRegister(kX86PandRM, rs_src1, m1, m2, m3, m4);
+}
+
+void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3) {
+ // Create temporary MIR as container for 128-bit binary mask.
+ MIR const_mir;
+ MIR* const_mirp = &const_mir;
+ const_mirp->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpConstVector);
+ const_mirp->dalvikInsn.arg[0] = m0;
+ const_mirp->dalvikInsn.arg[1] = m1;
+ const_mirp->dalvikInsn.arg[2] = m2;
+ const_mirp->dalvikInsn.arg[3] = m3;
+
+ // Mask vector with const from literal pool.
+ AppendOpcodeWithConst(opcode, rs_src1.GetReg(), const_mirp);
+}
+
void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- int imm = mir->dalvikInsn.vC;
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegLocation rl_dest = mir_graph_->GetDest(mir);
+ RegStorage rs_tmp;
+
+ int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
+ int vec_unit_size = 0;
int opcode = 0;
+ int extr_opcode = 0;
+ RegLocation rl_result;
+
switch (opsize) {
case k32:
+ extr_opcode = kX86PextrdRRI;
opcode = kX86PhadddRR;
+ vec_unit_size = 4;
+ break;
+ case kSignedByte:
+ case kUnsignedByte:
+ extr_opcode = kX86PextrbRRI;
+ opcode = kX86PhaddwRR;
+ vec_unit_size = 2;
break;
case kSignedHalf:
case kUnsignedHalf:
+ extr_opcode = kX86PextrwRRI;
opcode = kX86PhaddwRR;
+ vec_unit_size = 2;
break;
+ case kSingle:
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
+ vec_unit_size = 4;
+ for (int i = 0; i < 3; i++) {
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+ NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
+ }
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+ StoreValue(rl_dest, rl_result);
+
+ // For single-precision floats, we are done here
+ return;
default:
LOG(FATAL) << "Unsupported vector add reduce " << opsize;
break;
}
- NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+ int elems = vec_bytes / vec_unit_size;
+
+  // For byte-sized elements, emulate the horizontal add by splitting the vector into
+  // two vectors of 16-bit lanes, reducing each, and then adding the halves back together.
+ // TODO is overflow handled correctly?
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ rs_tmp = Get128BitRegister(AllocTempWide());
+
+ // tmp = xmm1 .>> 8.
+ NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
+ NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
+
+ // Zero extend low bits in xmm1.
+ AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
+ }
+
+ while (elems > 1) {
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
+ }
+ NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
+ elems >>= 1;
+ }
+
+ // Combine the results if we separated them.
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
+ }
+
+ // We need to extract to a GPR.
+ RegStorage temp = AllocTemp();
+ NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
+
+ // Can we do this directly into memory?
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ if (rl_result.location == kLocPhysReg) {
+ // Ensure res is in a core reg
+ rl_result = EvalLoc(rl_dest, kCoreReg, true);
+ OpRegReg(kOpAdd, rl_result.reg, temp);
+ StoreFinalValue(rl_dest, rl_result);
+ } else {
+ OpMemReg(kOpAdd, rl_result, temp.GetReg());
+ }
+
+ FreeTemp(temp);
}
void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
- int index = mir->dalvikInsn.arg[0];
- int opcode = 0;
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegLocation rl_dest = mir_graph_->GetDest(mir);
+ RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int extract_index = mir->dalvikInsn.arg[0];
+ int extr_opcode = 0;
+ RegLocation rl_result;
+ bool is_wide = false;
+
switch (opsize) {
case k32:
- opcode = kX86PextrdRRI;
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
break;
case kSignedHalf:
case kUnsignedHalf:
- opcode = kX86PextrwRRI;
- break;
- case kUnsignedByte:
- case kSignedByte:
- opcode = kX86PextrbRRI;
+      rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
break;
default:
- LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      LOG(FATAL) << "Unsupported vector reduce " << opsize;
+ return;
break;
}
- // We need to extract to a GPR.
- RegStorage temp = AllocTemp();
- NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
- // Assume that the destination VR is in the def for the mir.
- RegLocation rl_dest = mir_graph_->GetDest(mir);
- RegLocation rl_temp =
- {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
- StoreValue(rl_dest, rl_temp);
+ if (rl_result.location == kLocPhysReg) {
+ NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
+    if (is_wide) {
+      StoreFinalValueWide(rl_dest, rl_result);
+    } else {
+      StoreFinalValue(rl_dest, rl_result);
+ }
+ } else {
+ int displacement = SRegOffset(rl_result.s_reg_low);
+ LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg());
+ AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+ AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+ }
}
void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
- OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
- RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
- int op_low = 0, op_high = 0;
+ DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+ RegisterClass reg_type = kCoreReg;
+
switch (opsize) {
case k32:
op_low = kX86PshufdRRI;
break;
+ case kSingle:
+ op_low = kX86PshufdRRI;
+ op_mov = kX86Mova128RR;
+ reg_type = kFPReg;
+ break;
+ case k64:
+ op_low = kX86PshufdRRI;
+ imm = 0x44;
+ break;
+ case kDouble:
+ op_low = kX86PshufdRRI;
+ op_mov = kX86Mova128RR;
+ reg_type = kFPReg;
+ imm = 0x44;
+ break;
+ case kSignedByte:
+ case kUnsignedByte:
+ // Shuffle 8 bit value into 16 bit word.
+ // We set val = val + (val << 8) below and use 16 bit shuffle.
case kSignedHalf:
case kUnsignedHalf:
// Handles low quadword.
@@ -1822,23 +2120,37 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
break;
}
- // Load the value from the VR into a GPR.
RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
- rl_src = LoadValue(rl_src, kCoreReg);
+
+ // Load the value from the VR into the reg.
+ if (rl_src.wide == 0) {
+ rl_src = LoadValue(rl_src, reg_type);
+ } else {
+ rl_src = LoadValueWide(rl_src, reg_type);
+ }
+
+ // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ RegStorage temp = AllocTemp();
+ // val = val + (val << 8).
+ NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
+ NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
+ NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
+ FreeTemp(temp);
+ }
// Load the value into the XMM register.
- NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg());
+ NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
// Now shuffle the value across the destination.
- NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+ NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
// And then repeat as needed.
if (op_high != 0) {
- NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+ NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
}
}
-
LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
@@ -1867,7 +2179,7 @@ LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
}
// ------------ ABI support: mapping of args to physical registers -------------
-RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide) {
+RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) {
const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3, kArg4, kArg5};
const int coreArgMappingToPhysicalRegSize = sizeof(coreArgMappingToPhysicalReg) / sizeof(SpecialTargetRegister);
const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3,
@@ -1880,7 +2192,8 @@ RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_
}
} else {
if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
- return ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide);
+ return is_ref ? ml_->TargetRefReg(coreArgMappingToPhysicalReg[cur_core_reg_++]) :
+ ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide);
}
}
return RegStorage::InvalidReg();
@@ -1897,11 +2210,12 @@ void X86Mir2Lir::InToRegStorageMapping::Initialize(RegLocation* arg_locs, int co
max_mapped_in_ = -1;
is_there_stack_mapped_ = false;
for (int in_position = 0; in_position < count; in_position++) {
- RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp, arg_locs[in_position].wide);
+ RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp,
+ arg_locs[in_position].wide, arg_locs[in_position].ref);
if (reg.Valid()) {
mapping_[in_position] = reg;
max_mapped_in_ = std::max(max_mapped_in_, in_position);
- if (reg.Is64BitSolo()) {
+ if (arg_locs[in_position].wide) {
// We covered 2 args, so skip the next one
in_position++;
}
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e271e9d100..2789923bb9 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -569,6 +569,9 @@ enum X86OpCode {
kX86PextrbRRI, // Extract 8 bits from XMM into GPR
kX86PextrwRRI, // Extract 16 bits from XMM into GPR
kX86PextrdRRI, // Extract 32 bits from XMM into GPR
+ kX86PextrbMRI, // Extract 8 bits from XMM into memory
+ kX86PextrwMRI, // Extract 16 bits from XMM into memory
+ kX86PextrdMRI, // Extract 32 bits from XMM into memory
kX86PshuflwRRI, // Shuffle 16 bits in lower 64 bits of XMM.
kX86PshufdRRI, // Shuffle 32 bits in XMM.
kX86ShufpsRRI, // FP Shuffle 32 bits in XMM.
@@ -723,7 +726,7 @@ struct X86EncodingMap {
#define REX_X 0x42
// Extension of the ModR/M r/m field, SIB base field, or Opcode reg field
#define REX_B 0x41
-// Extended register set
+// An empty REX prefix used to normalize byte operations: with any REX present, byte encodings 4-7
+// select SPL/BPL/SIL/DIL instead of AH/CH/DH/BH, and registers 8-15 always require a REX prefix
#define REX 0x40
// Mask extracting the least 3 bits of r0..r15
#define kRegNumMask32 0x07
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 770ae89ca2..9bf51359cf 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1466,8 +1466,12 @@ static void CheckAndClearResolveException(Thread* self)
CHECK(self->IsExceptionPending());
mirror::Throwable* exception = self->GetException(nullptr);
std::string descriptor = exception->GetClass()->GetDescriptor();
- if (descriptor != "Ljava/lang/IncompatibleClassChangeError;" &&
- descriptor != "Ljava/lang/NoClassDefFoundError;") {
+ if (descriptor != "Ljava/lang/IllegalAccessError;" &&
+ descriptor != "Ljava/lang/IncompatibleClassChangeError;" &&
+ descriptor != "Ljava/lang/InstantiationError;" &&
+ descriptor != "Ljava/lang/NoClassDefFoundError;" &&
+ descriptor != "Ljava/lang/NoSuchFieldError;" &&
+ descriptor != "Ljava/lang/NoSuchMethodError;") {
    LOG(FATAL) << "Unexpected exception " << exception->Dump();
}
self->ClearException();
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 06f6e89c7b..42743862fe 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -14,8 +14,6 @@
* limitations under the License.
*/
-#include <unordered_set>
-
#include "elf_writer_quick.h"
#include "base/logging.h"
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 2d25b7a2ed..acfa607f39 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -796,9 +796,9 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
};
const bool add_patches = compiler_driver_.GetCompilerOptions().GetIncludePatchInformation();
if (add_patches) {
- // TODO if we are adding patches the resulting ELF file might have a
- // potentially rather large amount of free space where patches might have been
- // placed. We should adjust the ELF file to get rid of this excess space.
+ // TODO if we are adding patches the resulting ELF file might have a potentially rather large
+ // amount of free space where patches might have been placed. We should adjust the ELF file to
+ // get rid of this excess space.
patches.reserve(compiler_driver_.GetCodeToPatch().size() +
compiler_driver_.GetMethodsToPatch().size() +
compiler_driver_.GetClassesToPatch().size());
@@ -892,7 +892,7 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
}
Elf32_Shdr* shdr = file->FindSectionByName(".oat_patches");
if (shdr != nullptr) {
- DCHECK_EQ(shdr, file->FindSectionByType(SHT_OAT_PATCH))
+ CHECK_EQ(shdr, file->FindSectionByType(SHT_OAT_PATCH))
<< "Incorrect type for .oat_patches section";
CHECK_LE(patches.size() * sizeof(uintptr_t), shdr->sh_size)
<< "We got more patches than anticipated";
@@ -903,9 +903,8 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
<< "Section overlaps onto next section";
// It's mmap'd so we can just memcpy.
memcpy(file->Begin() + shdr->sh_offset, patches.data(), patches.size()*sizeof(uintptr_t));
- // TODO We should fill in the newly empty space between the last patch and
- // the start of the next section by moving the following sections down if
- // possible.
+ // TODO We should fill in the newly empty space between the last patch and the start of the
+ // next section by moving the following sections down if possible.
shdr->sh_size = patches.size() * sizeof(uintptr_t);
} else {
LOG(ERROR) << "Unable to find section header for SHT_OAT_PATCH";