46 files changed, 1741 insertions, 507 deletions
diff --git a/Android.mk b/Android.mk
index 9c206400fc..3f4ead60d1 100644
--- a/Android.mk
+++ b/Android.mk
@@ -324,7 +324,7 @@ oat-target-$(1): $$(OUT_OAT_FILE)
 
 $$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY)
 	@mkdir -p $$(dir $$@)
-	$(DEX2OATD) --runtime-arg -Xms64m --runtime-arg -Xmx64m \
+	$(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \
 		--boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \
 		--dex-location=/$(1) --oat-file=$$@ \
 		--instruction-set=$(DEX2OAT_TARGET_ARCH) \
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 183ec37cd6..10cd1cc65a 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -110,6 +110,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \
   runtime/mem_map_test.cc \
   runtime/mirror/dex_cache_test.cc \
   runtime/mirror/object_test.cc \
+  runtime/monitor_test.cc \
   runtime/parsed_options_test.cc \
   runtime/reference_table_test.cc \
   runtime/thread_pool_test.cc \
diff --git a/build/Android.oat.mk b/build/Android.oat.mk
index d3c8d7ee40..dd87f4a58e 100644
--- a/build/Android.oat.mk
+++ b/build/Android.oat.mk
@@ -29,7 +29,7 @@ define create-core-oat-host-rules
 $$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
 	@echo "host dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m \
+	$$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_IMAGE_XMS) --runtime-arg $(DEX2OAT_IMAGE_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(HOST_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(HOST_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)HOST_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)HOST_CORE_OAT) --image=$$($(1)HOST_CORE_IMG_OUT) \
@@ -57,7 +57,7 @@ define create-core-oat-target-rules
 $$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY)
 	@echo "target dex2oat: $$@ ($$?)"
 	@mkdir -p $$(dir $$@)
-	$$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m \
+	$$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \
 	  --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(TARGET_CORE_DEX_FILES)) \
 	  $$(addprefix --dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \
 	  --oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) \
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h
index 596b3c9802..1f24849257 100644
--- a/compiler/dex/backend.h
+++ b/compiler/dex/backend.h
@@ -28,6 +28,25 @@ class Backend {
     virtual void Materialize() = 0;
     virtual CompiledMethod* GetCompiledMethod() = 0;
 
+    // Queries for backend support for vectors
+    /*
+     * Return the number of bits in a vector register.
+     * @return 0 if vector registers are not supported, or the
+     * number of bits in the vector register if supported.
+     */
+    virtual int VectorRegisterSize() { return 0; }
+
+    /*
+     * Return the number of reservable vector registers supported
+     * @param fp_used  'true' if floating point computations will be
+     * executed while vector registers are reserved.
+     * @return the number of vector registers that are available
+     * @note The backend should ensure that sufficient vector registers
+     * are held back to generate scalar code without exhausting vector
+     * registers, if scalar code also uses the vector registers.
+     */
+    virtual int NumReservableVectorRegisters(bool fp_used) { return 0; }
+
   protected:
     explicit Backend(ArenaAllocator* arena) : arena_(arena) {}
     ArenaAllocator* const arena_;
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index caecb7a48e..799a742032 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -133,91 +133,101 @@ enum ExtendedMIROpcode {
   // could be supported by using a bit in TypeSize and arg[0] where needed.
 
   // @brief MIR to move constant data to a vector register
-  // vA: number of bits in register
-  // vB: destination
+  // vA: destination
+  // vB: number of bits in register
   // args[0]~args[3]: up to 128 bits of data for initialization
   kMirOpConstVector,
 
   // @brief MIR to move a vectorized register to another
-  // vA: TypeSize
-  // vB: destination
-  // vC: source
+  // vA: destination
+  // vB: source
+  // vC: TypeSize
   kMirOpMoveVector,
 
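-  // @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
+  // @brief Packed multiply of units in two vector registers: vA = vA .* vB using vC to know the type of the vector.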
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedMultiply,
 
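-  // @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+  // @brief Packed addition of units in two vector registers: vA = vA .+ vB using vC to know the type of the vector.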
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedAddition,
 
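-  // @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+  // @brief Packed subtraction of units in two vector registers: vA = vA .- vB using vC to know the type of the vector.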
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedSubtract,
 
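-  // @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
+  // @brief Packed shift left of units in a vector register: vA = vA .<< vB using vC to know the type of the vector.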
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedShiftLeft,
 
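-  // @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
+  // @brief Packed signed shift right of units in a vector register: vA = vA .>> vB using vC to know the type of the vector.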
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedSignedShiftRight,
 
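-  // @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
+  // @brief Packed unsigned shift right of units in a vector register: vA = vA .>>> vB using vC to know the type of the vector.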
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedUnsignedShiftRight,
 
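-  // @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+  // @brief Packed bitwise and of units in two vector registers: vA = vA .& vB using vC to know the type of the vector.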
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedAnd,
 
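-  // @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+  // @brief Packed bitwise or of units in two vector registers: vA = vA .| vB using vC to know the type of the vector.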
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedOr,
 
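-  // @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+  // @brief Packed bitwise xor of units in two vector registers: vA = vA .^ vB using vC to know the type of the vector.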
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedXor,
 
   // @brief Reduce a 128-bit packed element into a single VR by taking lower bits
   // @details Instruction does a horizontal addition of the packed elements and then adds it to VR
-  // vA: TypeSize
-  // vB: destination and source VR (not vector register)
-  // vC: source (vector register)
+  // vA: destination and source VR (not vector register)
+  // vB: source (vector register)
+  // vC: TypeSize
   kMirOpPackedAddReduce,
 
   // @brief Extract a packed element into a single VR.
-  // vA: TypeSize
-  // vB: destination VR (not vector register)
-  // vC: source (vector register)
+  // vA: destination VR (not vector register)
+  // vB: source (vector register)
+  // vC: TypeSize
   // arg[0]: The index to use for extraction from vector register (which packed element)
   kMirOpPackedReduce,
 
-  // @brief Create a vector value, with all TypeSize values equal to vC
+  // @brief Create a vector value, with all TypeSize values equal to vB
-  // vA: TypeSize
-  // vB: destination vector register
-  // vC: source VR (not vector register)
+  // vA: destination vector register
+  // vB: source VR (not vector register)
+  // vC: TypeSize
   kMirOpPackedSet,
 
+  // @brief Reserve N vector registers (named 0..N-1)
+  // vA: Number of registers
+  // @note: The backend may choose to map vector numbers used in vector opcodes.
+  //  Reserved registers are removed from the list of backend temporary pool.
+  kMirOpReserveVectorRegisters,
+
+  // @brief Free Reserved vector registers
+  // @note: All currently reserved vector registers are returned to the temporary pool.
+  kMirOpReturnVectorRegisters,
+
   kMirOpLast,
 };
 
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 9fea709568..bc99a272a6 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -840,6 +840,54 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = {
 
   // 113 MIR_SELECT
   DF_DA | DF_UB,
+
+  // 114 MirOpConstVector
+  DF_DA,
+
+  // 115 MirOpMoveVector
+  0,
+
+  // 116 MirOpPackedMultiply
+  0,
+
+  // 117 MirOpPackedAddition
+  0,
+
+  // 118 MirOpPackedSubtract
+  0,
+
+  // 119 MirOpPackedShiftLeft
+  0,
+
+  // 120 MirOpPackedSignedShiftRight
+  0,
+
+  // 121 MirOpPackedUnsignedShiftRight
+  0,
+
+  // 122 MirOpPackedAnd
+  0,
+
+  // 123 MirOpPackedOr
+  0,
+
+  // 124 MirOpPackedXor
+  0,
+
+  // 125 MirOpPackedAddReduce
+  DF_DA | DF_UA,
+
+  // 126 MirOpPackedReduce
+  DF_DA,
+
+  // 127 MirOpPackedSet
+  DF_UB,
+
+  // 128 MirOpReserveVectorRegisters
+  0,
+
+  // 129 MirOpReturnVectorRegisters
+  0,
 };
 
 /* Return the base virtual register for a SSA name */
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index baa46d61bd..8ce0389520 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -62,6 +62,8 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = {
   "PackedAddReduce",
   "PackedReduce",
   "PackedSet",
+  "ReserveVectorRegisters",
+  "ReturnVectorRegisters",
 };
 
 MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -836,12 +838,13 @@ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suff
                         mir->next ? " | " : " ");
               }
             } else {
-              fprintf(file, "    {%04x %s %s %s\\l}%s\\\n", mir->offset,
+              fprintf(file, "    {%04x %s %s %s %s\\l}%s\\\n", mir->offset,
                       mir->ssa_rep ? GetDalvikDisassembly(mir) :
                       !IsPseudoMirOp(opcode) ? Instruction::Name(mir->dalvikInsn.opcode) :
                         extended_mir_op_names_[opcode - kMirOpFirst],
                       (mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ",
                       (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ",
+                      (mir->optimization_flags & MIR_IGNORE_SUSPEND_CHECK) != 0 ? " no_suspendcheck" : " ",
                       mir->next ? " | " : " ");
             }
         }
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 02f39ac180..6c0dfe80a6 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1638,6 +1638,12 @@ bool Mir2Lir::GenInlinedStringCompareTo(CallInfo* info) {
 
 bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) {
   RegLocation rl_dest = InlineTarget(info);
+
+  // Early exit if the result is unused.
+  if (rl_dest.orig_sreg < 0) {
+    return true;
+  }
+
   RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
 
   switch (cu_->instruction_set) {
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 13bd4432d7..e8fc919d5f 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -1276,7 +1276,7 @@ void Mir2Lir::DoPromotion() {
         if (cu_->instruction_set == kThumb2) {
           bool wide = fp_regs[i].s_reg & STARTING_WIDE_SREG;
           if (wide) {
-            if (promotion_map_[p_map_idx + 1].fp_location == kLocPhysReg) {
+            if (promotion_map_[p_map_idx + 1].fp_location != kLocPhysReg) {
               // Ignore result - if can't alloc double may still be able to alloc singles.
               AllocPreservedDouble(low_sreg);
             }
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 7baf2d9663..4e973d8b48 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -327,21 +327,11 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
-#define EXT_0F_REX_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \
-{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
-{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-
 #define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \
 { kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
-#define EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \
-{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
-{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
-
 #define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \
 { kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
@@ -405,9 +395,12 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
   EXT_0F_ENCODING_MAP(Haddpd,    0x66, 0x7C, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Haddps,    0xF2, 0x7C, REG_DEF0_USE0),
 
-  { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
-  { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
-  { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
+  { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
+  { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
+  { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
+  { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "kX86PextrbMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x15, 0, 0, 1, false }, "kX86PextrwMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrdMRI", "[!0r+!1d],!2r,!3d" },
 
   { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" },
   { kX86PshufdRRI,  kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuffRRI", "!0r,!1r,!2d" },
@@ -499,10 +492,10 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0,
   EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0),
-  EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movzx8q, 0xB6, REG_DEF0),
-  EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movzx16q, 0xB7, REG_DEF0),
-  EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movsx8q, 0xBE, REG_DEF0),
-  EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movsx16q, 0xBF, REG_DEF0),
+  EXT_0F_ENCODING_MAP(Movzx8q,  REX_W, 0xB6, REG_DEF0),
+  EXT_0F_ENCODING_MAP(Movzx16q, REX_W, 0xB7, REG_DEF0),
+  EXT_0F_ENCODING_MAP(Movsx8q,  REX, 0xBE, REG_DEF0),
+  EXT_0F_ENCODING_MAP(Movsx16q, REX_W, 0xBF, REG_DEF0),
 #undef EXT_0F_ENCODING_MAP
 
   { kX86Jcc8,  kJcc,  IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP | USES_CCODES, { 0,             0, 0x70, 0,    0, 0, 0, 0, false }, "Jcc8",  "!1c !0t" },
@@ -627,7 +620,8 @@ size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int
     if (registers_need_rex_prefix) {
       DCHECK(cu_->target64) << "Attempt to use a 64-bit only addressable register "
           << RegStorage::RegNum(raw_reg) << " with instruction " << entry->name;
-      if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
+      if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W
+         && entry->skeleton.prefix1 != REX && entry->skeleton.prefix2 != REX) {
         ++size;  // rex
       }
     }
@@ -906,7 +900,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
       // 64 bit addresses by GS, not FS.
       code_buffer_.push_back(THREAD_PREFIX_GS);
     } else {
-      if (entry->skeleton.prefix1 == REX_W) {
+      if (entry->skeleton.prefix1 == REX_W || entry->skeleton.prefix1 == REX) {
+        DCHECK(cu_->target64);
         rex |= entry->skeleton.prefix1;
         code_buffer_.push_back(rex);
         rex = 0;
@@ -915,7 +910,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
       }
     }
     if (entry->skeleton.prefix2 != 0) {
-      if (entry->skeleton.prefix2 == REX_W) {
+      if (entry->skeleton.prefix2 == REX_W || entry->skeleton.prefix2 == REX) {
+        DCHECK(cu_->target64);
         rex |= entry->skeleton.prefix2;
         code_buffer_.push_back(rex);
         rex = 0;
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 6ca220cb2e..9000514856 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -94,13 +94,10 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
     start_of_method_reg = rl_method.reg;
     store_method_addr_used_ = true;
   } else {
-    if (cu_->target64) {
-      start_of_method_reg = AllocTempWide();
-    } else {
-      start_of_method_reg = AllocTemp();
-    }
+    start_of_method_reg = AllocTempRef();
     NewLIR1(kX86StartOfMethod, start_of_method_reg.GetReg());
   }
+  DCHECK_EQ(start_of_method_reg.Is64Bit(), cu_->target64);
   int low_key = s4FromSwitchData(&table[2]);
   RegStorage keyReg;
   // Remove the bias, if necessary
@@ -111,7 +108,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
     OpRegRegImm(kOpSub, keyReg, rl_src.reg, low_key);
   }
   // Bounds check - if < 0 or >= size continue following switch
-  OpRegImm(kOpCmp, keyReg, size-1);
+  OpRegImm(kOpCmp, keyReg, size - 1);
   LIR* branch_over = OpCondBranch(kCondHi, NULL);
 
   // Load the displacement from the switch table
@@ -119,11 +116,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset,
   NewLIR5(kX86PcRelLoadRA, disp_reg.GetReg(), start_of_method_reg.GetReg(), keyReg.GetReg(),
           2, WrapPointer(tab_rec));
   // Add displacement to start of method
-  if (cu_->target64) {
-    NewLIR2(kX86Add64RR, start_of_method_reg.GetReg(), disp_reg.GetReg());
-  } else {
-    OpRegReg(kOpAdd, start_of_method_reg, disp_reg);
-  }
+  OpRegReg(kOpAdd, start_of_method_reg, cu_->target64 ? As64BitReg(disp_reg) : disp_reg);
   // ..and go!
   LIR* switch_branch = NewLIR1(kX86JmpR, start_of_method_reg.GetReg());
   tab_rec->anchor = switch_branch;
@@ -174,7 +167,6 @@ void X86Mir2Lir::GenFillArrayData(DexOffset table_offset, RegLocation rl_src) {
     }
     store_method_addr_used_ = true;
   } else {
-    // TODO(64) force to be 64-bit
     NewLIR1(kX86StartOfMethod, method_start.GetReg());
   }
   NewLIR2(kX86PcRelAdr, payload.GetReg(), WrapPointer(tab_rec));
@@ -193,8 +185,8 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
       Thread::ExceptionOffset<8>().Int32Value() :
       Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
-  NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
-  NewLIR2(kX86Mov32TI, ex_offset, 0);
+  NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, rl_result.reg.GetReg(), ex_offset);
+  NewLIR2(cu_->target64 ? kX86Mov64TI : kX86Mov32TI, ex_offset, 0);
   StoreValue(rl_dest, rl_result);
 }
 
@@ -202,17 +194,15 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) {
  * Mark garbage collection card. Skip if the value we're storing is null.
  */
 void X86Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
-  RegStorage reg_card_base = AllocTemp();
-  RegStorage reg_card_no = AllocTemp();
+  DCHECK_EQ(tgt_addr_reg.Is64Bit(), cu_->target64);
+  DCHECK_EQ(val_reg.Is64Bit(), cu_->target64);
+  RegStorage reg_card_base = AllocTempRef();
+  RegStorage reg_card_no = AllocTempRef();
   LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
   int ct_offset = cu_->target64 ?
       Thread::CardTableOffset<8>().Int32Value() :
       Thread::CardTableOffset<4>().Int32Value();
-  if (cu_->target64) {
-    NewLIR2(kX86Mov64RT, reg_card_base.GetReg(), ct_offset);
-  } else {
-    NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
-  }
+  NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
   LIR* target = NewLIR0(kPseudoTargetLabel);
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 55e5993dce..ff7b30eeec 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -28,7 +28,7 @@ class X86Mir2Lir : public Mir2Lir {
  protected:
   class InToRegStorageMapper {
    public:
-    virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide) = 0;
+    virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) = 0;
     virtual ~InToRegStorageMapper() {}
   };
 
@@ -36,7 +36,7 @@ class X86Mir2Lir : public Mir2Lir {
    public:
     explicit InToRegStorageX86_64Mapper(Mir2Lir* ml) : ml_(ml), cur_core_reg_(0), cur_fp_reg_(0) {}
     virtual ~InToRegStorageX86_64Mapper() {}
-    virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide);
+    virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref);
    protected:
     Mir2Lir* ml_;
    private:
@@ -118,6 +118,8 @@ class X86Mir2Lir : public Mir2Lir {
   void FreeCallTemps();
   void LockCallTemps();
   void CompilerInitializeRegAlloc();
+  int VectorRegisterSize();
+  int NumReservableVectorRegisters(bool fp_used);
 
   // Required for target - miscellaneous.
   void AssembleLIR();
@@ -503,6 +505,11 @@ class X86Mir2Lir : public Mir2Lir {
   void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
                                 int64_t val, ConditionCode ccode);
   void GenConstWide(RegLocation rl_dest, int64_t value);
+  void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
+  void GenShiftByteVector(BasicBlock *bb, MIR *mir);
+  void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+  void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+  void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir);
 
   static bool ProvidesFullMemoryBarrier(X86OpCode opcode);
 
@@ -513,6 +520,12 @@ class X86Mir2Lir : public Mir2Lir {
   virtual RegStorage AllocateByteRegister();
 
   /*
+   * @brief Use a wide temporary as a 128-bit register
+   * @returns a 128-bit temporary register.
+   */
+  virtual RegStorage Get128BitRegister(RegStorage reg);
+
+  /*
    * @brief Check if a register is byte addressable.
    * @returns true if a register is byte addressable.
    */
@@ -528,6 +541,22 @@ class X86Mir2Lir : public Mir2Lir {
    */
   bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
 
+  /**
+   * @brief Reserve a fixed number of vector registers from the register pool
+   * @details The mir->dalvikInsn.vA specifies an N such that vector registers
+   * [0..N-1] are removed from the temporary pool. The caller must call
+   * ReturnVectorRegisters before calling ReserveVectorRegisters again.
+   * Also sets the num_reserved_vector_regs_ to the specified value
+   * @param mir whose vA specifies the number of registers to reserve
+   */
+  void ReserveVectorRegisters(MIR* mir);
+
+  /**
+   * @brief Return all the reserved vector registers to the temp pool
+   * @details Returns [0..num_reserved_vector_regs_]
+   */
+  void ReturnVectorRegisters();
+
   /*
    * @brief Load 128 bit constant into vector register.
    * @param bb The basic block in which the MIR is from.
@@ -901,6 +930,10 @@ class X86Mir2Lir : public Mir2Lir {
   LIR *AddVectorLiteral(MIR *mir);
 
   InToRegStorageMapping in_to_reg_storage_mapping_;
+
+ private:
+  // The number N of vector registers [0..N-1] reserved by ReserveVectorRegisters, or -1 if none
+  int num_reserved_vector_regs_;
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 43882c2e02..e81f505f2f 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -427,6 +427,10 @@ RegStorage X86Mir2Lir::AllocateByteRegister() {
   return reg;
 }
 
+RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
+  return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+}
+
 bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
   return cu_->target64 || reg.GetRegNum() < rs_rX86_SP.GetRegNum();
 }
@@ -646,6 +650,14 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() {
   reg_pool_->next_dp_reg_ = 1;
 }
 
+int X86Mir2Lir::VectorRegisterSize() {
+  return 128;
+}
+
+int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
+  return fp_used ? 5 : 7;  // Fewer are reservable when scalar FP code will also run.
+}
+
 void X86Mir2Lir::SpillCoreRegs() {
   if (num_core_spills_ == 0) {
     return;
@@ -790,6 +802,9 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator*
   rX86_RET1 = rDX;
   rX86_INVOKE_TGT = rAX;
   rX86_COUNT = rCX;
+
+  // Initialize the number of reserved vector registers
+  num_reserved_vector_regs_ = -1;
 }
 
 Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
@@ -1475,6 +1490,12 @@ std::vector<uint8_t>* X86Mir2Lir::ReturnCallFrameInformation() {
 
 void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
   switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpReserveVectorRegisters:
+      ReserveVectorRegisters(mir);
+      break;
+    case kMirOpReturnVectorRegisters:
+      ReturnVectorRegisters();
+      break;
     case kMirOpConstVector:
       GenConst128(bb, mir);
       break;
@@ -1522,11 +1543,57 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
   }
 }
 
+void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
+  // Reserving again before ReturnVectorRegisters() is a bug: -1 means "nothing reserved".
+  DCHECK_EQ(num_reserved_vector_regs_, -1);
+
+  int num_vector_reg = mir->dalvikInsn.vA;
+  for (int i = 0; i < num_vector_reg; i++) {
+    RegStorage xp_reg = RegStorage::Solo128(i);
+    RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+    Clobber(xp_reg);
+    // Pull every aliased view of this vector register out of the single/double temp pools.
+    for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+                       info != nullptr;
+                       info = info->GetAliasChain()) {
+      if (info->GetReg().IsSingle()) {
+        reg_pool_->sp_regs_.Delete(info);
+      } else {
+        reg_pool_->dp_regs_.Delete(info);
+      }
+    }
+  }
+
+  num_reserved_vector_regs_ = num_vector_reg;
+}
+
+void X86Mir2Lir::ReturnVectorRegisters() {
+  // Re-insert the aliased views of each reserved register; loop is a no-op when the count is -1.
+  for (int i = 0; i < num_reserved_vector_regs_; i++) {
+    RegStorage xp_reg = RegStorage::Solo128(i);
+    RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+
+    for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+                       info != nullptr;
+                       info = info->GetAliasChain()) {
+      if (info->GetReg().IsSingle()) {
+        reg_pool_->sp_regs_.Insert(info);
+      } else {
+        reg_pool_->dp_regs_.Insert(info);
+      }
+    }
+  }
+
+  // No vector registers remain reserved; reset the count to the -1 "none reserved" value.
+  num_reserved_vector_regs_ = -1;
+}
+
 void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
-  int type_size = mir->dalvikInsn.vA;
+  store_method_addr_used_ = true;
+  int type_size = mir->dalvikInsn.vB;
   // We support 128 bit vectors.
   DCHECK_EQ(type_size & 0xFFFF, 128);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
   uint32_t *args = mir->dalvikInsn.arg;
   int reg = rs_dest.GetReg();
   // Check for all 0 case.
@@ -1534,6 +1601,12 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
     NewLIR2(kX86XorpsRR, reg, reg);
     return;
   }
+
+  // Append the mov const vector to reg opcode.
+  AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+}
+
+void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
   // Okay, load it from the constant vector area.
   LIR *data_target = ScanVectorLiteral(mir);
   if (data_target == nullptr) {
@@ -1553,24 +1626,66 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
   // 4 byte offset.  We will fix this up in the assembler later to have the right
   // value.
   ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
-  LIR *load = NewLIR3(kX86Mova128RM, reg, rl_method.reg.GetReg(),  256 /* bogus */);
+  LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
   load->flags.fixup = kFixupLoad;
   load->target = data_target;
 }
 
 void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);  // Low 16 bits of vC: vector width in bits.
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);  // vA: destination vector reg.
+  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);  // vB: source vector reg.
   NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
 }
 
+void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
+  /*
+   * Emulates a multiply of sixteen byte lanes with the 16-bit packed multiply: the low and
+   * high byte of every 16-bit lane are multiplied separately and the results recombined.
+   *
+   *   a  = vA register: first source and destination
+   *   b  = vB register: second source (overwritten along the way)
+   *   hi = temporary
+   *
+   *    hi = a
+   *    a  = a .* b
+   *    a  = a & 0x00ff00ff00ff00ff00ff00ff00ff00ff  // a now has the low-byte products
+   *    hi = hi .>> 8
+   *    b  = b & 0xff00ff00ff00ff00ff00ff00ff00ff00
+   *    b  = b .* hi                                 // b now has the high-byte products
+   *    a  = a | b                                   // combine results
+   */
+  const int bits_per_byte = 8;
+  const uint32_t low_byte_mask = 0x00FF00FF;
+  const uint32_t high_byte_mask = 0xFF00FF00;
+  RegStorage vec_a = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage vec_b = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage vec_hi = Get128BitRegister(AllocTempWide());
+
+  // hi = a, taken before a is overwritten by the first multiply.
+  NewLIR2(kX86Mova128RR, vec_hi.GetReg(), vec_a.GetReg());
+
+  // a = low-byte products: word multiply, then drop the upper byte of every 16-bit lane.
+  NewLIR2(kX86PmullwRR, vec_a.GetReg(), vec_b.GetReg());
+  AndMaskVectorRegister(vec_a, low_byte_mask, low_byte_mask, low_byte_mask, low_byte_mask);
+
+  // Move the high bytes of the copy down and keep only the high bytes of b.
+  NewLIR2(kX86PsrlwRI, vec_hi.GetReg(), bits_per_byte);
+  AndMaskVectorRegister(vec_b, high_byte_mask, high_byte_mask, high_byte_mask, high_byte_mask);
+
+  // b = high-byte products, already sitting in the upper byte of every lane.
+  NewLIR2(kX86PmullwRR, vec_b.GetReg(), vec_hi.GetReg());
+
+  // Merge both halves into the destination register.
+  NewLIR2(kX86PorRR, vec_a.GetReg(), vec_b.GetReg());
+}
+
 void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1585,6 +1700,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
     case kDouble:
       opcode = kX86MulpdRR;
       break;
+    case kSignedByte:
+      // HW doesn't support 16x16 byte multiplication so emulate it.
+      GenMultiplyVectorSignedByte(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector multiply " << opsize;
       break;
@@ -1593,10 +1712,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1624,10 +1743,10 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1654,11 +1773,60 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
   NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
+void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+  // Packed shift of byte lanes, built from 16-bit (word) shifts. vA is the source and
+  // destination vector, vB the immediate shift count; the MIR opcode picks the shift kind.
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
+  int opcode = 0;
+  int imm = mir->dalvikInsn.vB;
+
+  switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpPackedShiftLeft:
+      opcode = kX86PsllwRI;
+      break;
+    case kMirOpPackedSignedShiftRight:
+      opcode = kX86PsrawRI;
+      break;
+    case kMirOpPackedUnsignedShiftRight:
+      opcode = kX86PsrlwRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported shift operation on byte vector " << static_cast<int>(mir->dalvikInsn.opcode);
+      break;
+  }
+
+  /*
+   * xmm1 = dest/src (ends with the low bytes), xmm2 = tmp (ends with the high bytes):
+   *   xmm2 = xmm1
+   *   xmm1 = xmm1 <shift> N
+   *   xmm2 = xmm2 & 0xFF00FF00FF00FF00FF00FF00FF00FF00
+   *   xmm2 = xmm2 <shift> N
+   *   xmm1 = xmm1 | xmm2
+   * NOTE(review): xmm1 is not masked after its word shift; bits may cross byte lanes - confirm.
+   */
+
+  // Copy xmm1.
+  NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+
+  // Shift lower values.
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+  // Clear the low byte of every 16-bit lane in the copy, keeping only the high bytes.
+  AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+
+  // Shift higher values.
+  NewLIR2(opcode, rs_tmp.GetReg(), imm);
+
+  // Combine back into dest XMM register.
+  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+}
+
 void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1671,6 +1839,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsllwRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector shift left " << opsize;
       break;
@@ -1679,10 +1851,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1692,6 +1864,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsrawRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
       break;
@@ -1700,10 +1876,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1716,6 +1892,10 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsrlwRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize;
       break;
@@ -1725,91 +1905,209 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
 
 void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);  // Low 16 bits of vC: vector width in bits.
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);  // vA: dest and 1st source.
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);  // vB: second source.
   NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
 void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);  // Low 16 bits of vC: vector width in bits.
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);  // vA: dest and 1st source.
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);  // vB: second source.
   NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
 void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);  // Low 16 bits of vC: vector width in bits.
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);  // vA: dest and 1st source.
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);  // vB: second source.
   NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
+void X86Mir2Lir::AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4) {
+  MaskVectorRegister(kX86PandRM, rs_src1, m1, m2, m3, m4);  // rs_src1 &= {m1, m2, m3, m4}.
+}
+
+void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3) {
+  // A stack-allocated MIR, tagged as a const-vector op, carries the four 32-bit words of
+  // the 128-bit mask so the literal-pool helper can be reused.
+  const uint32_t mask_words[] = {m0, m1, m2, m3};
+  MIR mask_mir;
+  mask_mir.dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpConstVector);
+  for (int i = 0; i < 4; ++i) {
+    mask_mir.dalvikInsn.arg[i] = mask_words[i];
+  }
+
+  // Emit `opcode` on the register with the mask constant from the literal pool.
+  AppendOpcodeWithConst(opcode, rs_src1.GetReg(), &mask_mir);
+}
+
 void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);  // High 16 bits: element type.
+  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);  // Vector to sum; overwritten.
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegStorage rs_tmp;
+
+  int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;  // Low 16 bits: vector width in bits.
+  int vec_unit_size = 0;
   int opcode = 0;
+  int extr_opcode = 0;
+  RegLocation rl_result;
+
   switch (opsize) {
     case k32:
+      extr_opcode = kX86PextrdRRI;
       opcode = kX86PhadddRR;
+      vec_unit_size = 4;
+      break;
+    case kSignedByte:
+    case kUnsignedByte:
+      extr_opcode = kX86PextrbRRI;
+      opcode = kX86PhaddwRR;
+      vec_unit_size = 2;  // Bytes are summed as 16-bit words, see below.
       break;
     case kSignedHalf:
     case kUnsignedHalf:
+      extr_opcode = kX86PextrwRRI;
       opcode = kX86PhaddwRR;
+      vec_unit_size = 2;
       break;
+    case kSingle:
+      rl_result = EvalLoc(rl_dest, kFPReg, true);
+      vec_unit_size = 4;
+      for (int i = 0; i < 3; i++) {
+        NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+        NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
+      }
+      NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+      StoreValue(rl_dest, rl_result);
+
+      // Floats are accumulated with scalar adds above; skip the integer path below.
+      return;
     default:
       LOG(FATAL) << "Unsupported vector add reduce " << opsize;
       break;
   }
-  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+  int elems = vec_bytes / vec_unit_size;
+
+  // Bytes: reduce two vectors of 8 word values each (tmp and src1), then add the two results.
+  // TODO is overflow handled correctly?
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    rs_tmp = Get128BitRegister(AllocTempWide());
+
+    // tmp = xmm1 .>> 8.
+    NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
+    NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
+
+    // Keep only the low byte of each 16-bit lane in xmm1 (zero-extended to a word).
+    AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
+  }
+
+  while (elems > 1) {
+    if (opsize == kSignedByte || opsize == kUnsignedByte) {
+      NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
+    }
+    NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
+    elems >>= 1;
+  }
+
+  // Combine the results if we separated them.
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
+  }
+
+  // Element 0 of the vector now holds the sum; extract it to a GPR.
+  RegStorage temp = AllocTemp();
+  NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
+
+  // Add the sum into the destination VR: in its register if it has one, else in memory.
+  rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+  if (rl_result.location == kLocPhysReg) {
+    // Ensure res is in a core reg
+    rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    OpRegReg(kOpAdd, rl_result.reg, temp);
+    StoreFinalValue(rl_dest, rl_result);
+  } else {
+    OpMemReg(kOpAdd, rl_result, temp.GetReg());
+  }
+
+  FreeTemp(temp);
 }
 
 void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int index = mir->dalvikInsn.arg[0];
-  int opcode = 0;
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);  // High 16 bits: element type.
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);  // vB: vector to extract from.
+  int extract_index = mir->dalvikInsn.arg[0];  // Which element of the vector to extract.
+  int extr_opcode = 0;
+  RegLocation rl_result;
+  bool is_wide = false;  // Only 32-bit and 16-bit elements are handled, so never wide.
   switch (opsize) {
     case k32:
-      opcode = kX86PextrdRRI;
+      rl_result = UpdateLocTyped(rl_dest, kCoreReg);  // Is the destination VR in a core reg?
+      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdRRI : kX86PextrdMRI;
       break;
     case kSignedHalf:
     case kUnsignedHalf:
-      opcode = kX86PextrwRRI;
-      break;
-    case kUnsignedByte:
-    case kSignedByte:
-      opcode = kX86PextrbRRI;
+      rl_result = UpdateLocTyped(rl_dest, kCoreReg);  // Is the destination VR in a core reg?
+      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwRRI : kX86PextrwMRI;
       break;
     default:
-      LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      return;
       break;
   }
-  // We need to extract to a GPR.
-  RegStorage temp = AllocTemp();
-  NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
 
-  // Assume that the destination VR is in the def for the mir.
-  RegLocation rl_dest = mir_graph_->GetDest(mir);
-  RegLocation rl_temp =
-    {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
-  StoreValue(rl_dest, rl_temp);
+  if (rl_result.location == kLocPhysReg) {  // Register form: extract into the VR's register.
+    NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
+    if (is_wide) {
+      StoreFinalValueWide(rl_dest, rl_result);
+    } else {
+      StoreFinalValue(rl_dest, rl_result);
+    }
+  } else {  // Memory form: extract directly into the VR's stack slot (base, disp, reg, imm).
+    int displacement = SRegOffset(rl_result.s_reg_low);
+    LIR *l = NewLIR4(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg(),
+                     extract_index);
+    AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+    AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+  }
 }
 
 void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int op_low = 0, op_high = 0;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+  RegisterClass reg_type = kCoreReg;
+
   switch (opsize) {
     case k32:
       op_low = kX86PshufdRRI;
       break;
+    case kSingle:
+      op_low = kX86PshufdRRI;
+      op_mov = kX86Mova128RR;
+      reg_type = kFPReg;
+      break;
+    case k64:
+      op_low = kX86PshufdRRI;
+      imm = 0x44;
+      break;
+    case kDouble:
+      op_low = kX86PshufdRRI;
+      op_mov = kX86Mova128RR;
+      reg_type = kFPReg;
+      imm = 0x44;
+      break;
+    case kSignedByte:
+    case kUnsignedByte:
+      // Shuffle 8 bit value into 16 bit word.
+      // We set val = val + (val << 8) below and use 16 bit shuffle.
     case kSignedHalf:
     case kUnsignedHalf:
       // Handles low quadword.
@@ -1822,23 +2120,37 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
       break;
   }
 
-  // Load the value from the VR into a GPR.
   RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
-  rl_src = LoadValue(rl_src, kCoreReg);
+
+  // Load the value from the VR into the reg.
+  if (rl_src.wide == 0) {
+    rl_src = LoadValue(rl_src, reg_type);
+  } else {
+    rl_src = LoadValueWide(rl_src, reg_type);
+  }
+
+  // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    RegStorage temp = AllocTemp();
+    // val = val + (val << 8).
+    NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
+    NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
+    NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
+    FreeTemp(temp);
+  }
 
   // Load the value into the XMM register.
-  NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg());
+  NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
 
   // Now shuffle the value across the destination.
-  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
 
   // And then repeat as needed.
   if (op_high != 0) {
-    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
   }
 }
 
-
 LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
   int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
   for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
@@ -1867,7 +2179,7 @@ LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
 }
 
 // ------------ ABI support: mapping of args to physical registers -------------
-RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide) {
+RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) {
   const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3, kArg4, kArg5};
   const int coreArgMappingToPhysicalRegSize = sizeof(coreArgMappingToPhysicalReg) / sizeof(SpecialTargetRegister);
   const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3,
@@ -1880,7 +2192,8 @@ RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_
     }
   } else {
     if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      return ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide);
+      return is_ref ? ml_->TargetRefReg(coreArgMappingToPhysicalReg[cur_core_reg_++]) :
+                      ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide);
     }
   }
   return RegStorage::InvalidReg();
@@ -1897,11 +2210,12 @@ void X86Mir2Lir::InToRegStorageMapping::Initialize(RegLocation* arg_locs, int co
   max_mapped_in_ = -1;
   is_there_stack_mapped_ = false;
   for (int in_position = 0; in_position < count; in_position++) {
-     RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp, arg_locs[in_position].wide);
+     RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp,
+             arg_locs[in_position].wide, arg_locs[in_position].ref);
      if (reg.Valid()) {
        mapping_[in_position] = reg;
        max_mapped_in_ = std::max(max_mapped_in_, in_position);
-       if (reg.Is64BitSolo()) {
+       if (arg_locs[in_position].wide) {
          // We covered 2 args, so skip the next one
          in_position++;
        }
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e271e9d100..2789923bb9 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -569,6 +569,9 @@ enum X86OpCode {
   kX86PextrbRRI,                // Extract 8 bits from XMM into GPR
   kX86PextrwRRI,                // Extract 16 bits from XMM into GPR
   kX86PextrdRRI,                // Extract 32 bits from XMM into GPR
+  kX86PextrbMRI,                // Extract 8 bits from XMM into memory
+  kX86PextrwMRI,                // Extract 16 bits from XMM into memory
+  kX86PextrdMRI,                // Extract 32 bits from XMM into memory
   kX86PshuflwRRI,               // Shuffle 16 bits in lower 64 bits of XMM.
   kX86PshufdRRI,                // Shuffle 32 bits in XMM.
   kX86ShufpsRRI,                // FP Shuffle 32 bits in XMM.
@@ -723,7 +726,7 @@ struct X86EncodingMap {
 #define REX_X 0x42
 // Extension of the ModR/M r/m field, SIB base field, or Opcode reg field
 #define REX_B 0x41
-// Extended register set
+// An empty REX prefix used to normalize the byte operations so that they apply to R4 through R15
 #define REX 0x40
 // Mask extracting the least 3 bits of r0..r15
 #define kRegNumMask32 0x07
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 770ae89ca2..9bf51359cf 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1466,8 +1466,12 @@ static void CheckAndClearResolveException(Thread* self)
   CHECK(self->IsExceptionPending());
   mirror::Throwable* exception = self->GetException(nullptr);
   std::string descriptor = exception->GetClass()->GetDescriptor();
-  if (descriptor != "Ljava/lang/IncompatibleClassChangeError;" &&
-      descriptor != "Ljava/lang/NoClassDefFoundError;") {
+      if (descriptor != "Ljava/lang/IllegalAccessError;" &&
+          descriptor != "Ljava/lang/IncompatibleClassChangeError;" &&
+          descriptor != "Ljava/lang/InstantiationError;" &&
+          descriptor != "Ljava/lang/NoClassDefFoundError;" &&
+          descriptor != "Ljava/lang/NoSuchFieldError;" &&
+          descriptor != "Ljava/lang/NoSuchMethodError;") {
     LOG(FATAL) << "Unexpected exeption " << exception->Dump();
   }
   self->ClearException();
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index 06f6e89c7b..42743862fe 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <unordered_set>
-
 #include "elf_writer_quick.h"
 
 #include "base/logging.h"
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index d52ec0ad5a..406d9d2696 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -25,6 +25,7 @@
 #include "compiler/image_writer.h"
 #include "compiler/oat_writer.h"
 #include "gc/space/image_space.h"
+#include "implicit_check_options.h"
 #include "lock_word.h"
 #include "mirror/object-inl.h"
 #include "signal_catcher.h"
@@ -77,8 +78,11 @@ TEST_F(ImageTest, WriteRead) {
 
       t.NewTiming("WriteElf");
       ScopedObjectAccess soa(Thread::Current());
-      OatWriter oat_writer(class_linker->GetBootClassPath(),
-                           0, 0, "", compiler_driver_.get(), &timings);
+      SafeMap<std::string, std::string> key_value_store;
+      key_value_store.Put(ImplicitCheckOptions::kImplicitChecksOatHeaderKey,
+                          ImplicitCheckOptions::Serialize(true, true, true));
+      OatWriter oat_writer(class_linker->GetBootClassPath(), 0, 0, compiler_driver_.get(), &timings,
+                           &key_value_store);
       bool success = compiler_driver_->WriteElf(GetTestAndroidRoot(),
                                                 !kIsTargetBuild,
                                                 class_linker->GetBootClassPath(),
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 2d25b7a2ed..acfa607f39 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -796,9 +796,9 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
   };
   const bool add_patches = compiler_driver_.GetCompilerOptions().GetIncludePatchInformation();
   if (add_patches) {
-    // TODO if we are adding patches the resulting ELF file might have a
-    // potentially rather large amount of free space where patches might have been
-    // placed. We should adjust the ELF file to get rid of this excess space.
+    // TODO if we are adding patches the resulting ELF file might have a potentially rather large
+    // amount of free space where patches might have been placed. We should adjust the ELF file to
+    // get rid of this excess space.
     patches.reserve(compiler_driver_.GetCodeToPatch().size() +
                     compiler_driver_.GetMethodsToPatch().size() +
                     compiler_driver_.GetClassesToPatch().size());
@@ -892,7 +892,7 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
     }
     Elf32_Shdr* shdr = file->FindSectionByName(".oat_patches");
     if (shdr != nullptr) {
-      DCHECK_EQ(shdr, file->FindSectionByType(SHT_OAT_PATCH))
+      CHECK_EQ(shdr, file->FindSectionByType(SHT_OAT_PATCH))
           << "Incorrect type for .oat_patches section";
       CHECK_LE(patches.size() * sizeof(uintptr_t), shdr->sh_size)
           << "We got more patches than anticipated";
@@ -903,9 +903,8 @@ void ImageWriter::PatchOatCodeAndMethods(File* elf_file) {
           << "Section overlaps onto next section";
       // It's mmap'd so we can just memcpy.
       memcpy(file->Begin() + shdr->sh_offset, patches.data(), patches.size()*sizeof(uintptr_t));
-      // TODO We should fill in the newly empty space between the last patch and
-      // the start of the next section by moving the following sections down if
-      // possible.
+      // TODO We should fill in the newly empty space between the last patch and the start of the
+      // next section by moving the following sections down if possible.
       shdr->sh_size = patches.size() * sizeof(uintptr_t);
     } else {
       LOG(ERROR) << "Unable to find section header for SHT_OAT_PATCH";
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 254faac796..d2ee0ede80 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -18,6 +18,7 @@
 #include "compiler/compiler.h"
 #include "compiler/oat_writer.h"
 #include "entrypoints/quick/quick_entrypoints.h"
+#include "implicit_check_options.h"
 #include "mirror/art_method-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object-inl.h"
@@ -111,12 +112,16 @@ TEST_F(OatTest, WriteRead) {
 
   ScopedObjectAccess soa(Thread::Current());
   ScratchFile tmp;
+  SafeMap<std::string, std::string> key_value_store;
+  key_value_store.Put(OatHeader::kImageLocationKey, "lue.art");
+  key_value_store.Put(ImplicitCheckOptions::kImplicitChecksOatHeaderKey,
+                      ImplicitCheckOptions::Serialize(true, true, true));
   OatWriter oat_writer(class_linker->GetBootClassPath(),
                        42U,
                        4096U,
-                       "lue.art",
                        compiler_driver_.get(),
-                       &timings);
+                       &timings,
+                       &key_value_store);
   bool success = compiler_driver_->WriteElf(GetTestAndroidRoot(),
                                             !kIsTargetBuild,
                                             class_linker->GetBootClassPath(),
@@ -136,7 +141,7 @@ TEST_F(OatTest, WriteRead) {
   ASSERT_EQ(1U, oat_header.GetDexFileCount());  // core
   ASSERT_EQ(42U, oat_header.GetImageFileLocationOatChecksum());
   ASSERT_EQ(4096U, oat_header.GetImageFileLocationOatDataBegin());
-  ASSERT_EQ("lue.art", oat_header.GetImageFileLocation());
+  ASSERT_EQ("lue.art", std::string(oat_header.GetStoreValueByKey(OatHeader::kImageLocationKey)));
 
   const DexFile* dex_file = java_lang_dex_file_;
   uint32_t dex_file_checksum = dex_file->GetLocationChecksum();
@@ -189,20 +194,20 @@ TEST_F(OatTest, OatHeaderIsValid) {
     std::vector<const DexFile*> dex_files;
     uint32_t image_file_location_oat_checksum = 0;
     uint32_t image_file_location_oat_begin = 0;
-    const std::string image_file_location;
-    OatHeader oat_header(instruction_set,
-                         instruction_set_features,
-                         &dex_files,
-                         image_file_location_oat_checksum,
-                         image_file_location_oat_begin,
-                         image_file_location);
-    ASSERT_TRUE(oat_header.IsValid());
-
-    char* magic = const_cast<char*>(oat_header.GetMagic());
+    OatHeader* oat_header = OatHeader::Create(instruction_set,
+                                              instruction_set_features,
+                                              &dex_files,
+                                              image_file_location_oat_checksum,
+                                              image_file_location_oat_begin,
+                                              nullptr);
+    ASSERT_NE(oat_header, nullptr);
+    ASSERT_TRUE(oat_header->IsValid());
+
+    char* magic = const_cast<char*>(oat_header->GetMagic());
     strcpy(magic, "");  // bad magic
-    ASSERT_FALSE(oat_header.IsValid());
+    ASSERT_FALSE(oat_header->IsValid());
     strcpy(magic, "oat\n000");  // bad version
-    ASSERT_FALSE(oat_header.IsValid());
+    ASSERT_FALSE(oat_header->IsValid());
 }
 
 }  // namespace art
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index e1b6992c47..92ed33c644 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -49,19 +49,19 @@ namespace art {
 OatWriter::OatWriter(const std::vector<const DexFile*>& dex_files,
                      uint32_t image_file_location_oat_checksum,
                      uintptr_t image_file_location_oat_begin,
-                     const std::string& image_file_location,
                      const CompilerDriver* compiler,
-                     TimingLogger* timings)
+                     TimingLogger* timings,
+                     SafeMap<std::string, std::string>* key_value_store)
   : compiler_driver_(compiler),
     dex_files_(&dex_files),
     image_file_location_oat_checksum_(image_file_location_oat_checksum),
     image_file_location_oat_begin_(image_file_location_oat_begin),
-    image_file_location_(image_file_location),
+    key_value_store_(key_value_store),
     oat_header_(NULL),
     size_dex_file_alignment_(0),
     size_executable_offset_alignment_(0),
     size_oat_header_(0),
-    size_oat_header_image_file_location_(0),
+    size_oat_header_key_value_store_(0),
     size_dex_file_(0),
     size_interpreter_to_interpreter_bridge_(0),
     size_interpreter_to_compiled_code_bridge_(0),
@@ -89,6 +89,8 @@ OatWriter::OatWriter(const std::vector<const DexFile*>& dex_files,
     size_oat_class_status_(0),
     size_oat_class_method_bitmaps_(0),
     size_oat_class_method_offsets_(0) {
+  CHECK(key_value_store != nullptr);
+
   size_t offset;
   {
     TimingLogger::ScopedTiming split("InitOatHeader", timings);
@@ -121,7 +123,8 @@ OatWriter::OatWriter(const std::vector<const DexFile*>& dex_files,
   size_ = offset;
 
   CHECK_EQ(dex_files_->size(), oat_dex_files_.size());
-  CHECK(image_file_location.empty() == compiler->IsImage());
+  CHECK_EQ(compiler->IsImage(),
+           key_value_store_->find(OatHeader::kImageLocationKey) == key_value_store_->end());
 }
 
 OatWriter::~OatWriter() {
@@ -716,16 +719,14 @@ bool OatWriter::VisitDexMethods(DexMethodVisitor* visitor) {
 }
 
 size_t OatWriter::InitOatHeader() {
-  // create the OatHeader
-  oat_header_ = new OatHeader(compiler_driver_->GetInstructionSet(),
-                              compiler_driver_->GetInstructionSetFeatures(),
-                              dex_files_,
-                              image_file_location_oat_checksum_,
-                              image_file_location_oat_begin_,
-                              image_file_location_);
-  size_t offset = sizeof(*oat_header_);
-  offset += image_file_location_.size();
-  return offset;
+  oat_header_ = OatHeader::Create(compiler_driver_->GetInstructionSet(),
+                                  compiler_driver_->GetInstructionSetFeatures(),
+                                  dex_files_,
+                                  image_file_location_oat_checksum_,
+                                  image_file_location_oat_begin_,
+                                  key_value_store_);
+
+  return oat_header_->GetHeaderSize();
 }
 
 size_t OatWriter::InitOatDexFiles(size_t offset) {
@@ -864,17 +865,13 @@ size_t OatWriter::InitOatCodeDexFiles(size_t offset) {
 bool OatWriter::Write(OutputStream* out) {
   const size_t file_offset = out->Seek(0, kSeekCurrent);
 
-  if (!out->WriteFully(oat_header_, sizeof(*oat_header_))) {
+  size_t header_size = oat_header_->GetHeaderSize();
+  if (!out->WriteFully(oat_header_, header_size)) {
     PLOG(ERROR) << "Failed to write oat header to " << out->GetLocation();
     return false;
   }
-  size_oat_header_ += sizeof(*oat_header_);
-
-  if (!out->WriteFully(image_file_location_.data(), image_file_location_.size())) {
-    PLOG(ERROR) << "Failed to write oat header image file location to " << out->GetLocation();
-    return false;
-  }
-  size_oat_header_image_file_location_ += image_file_location_.size();
+  size_oat_header_ += sizeof(OatHeader);
+  size_oat_header_key_value_store_ += oat_header_->GetHeaderSize() - sizeof(OatHeader);
 
   if (!WriteTables(out, file_offset)) {
     LOG(ERROR) << "Failed to write oat tables to " << out->GetLocation();
@@ -909,7 +906,7 @@ bool OatWriter::Write(OutputStream* out) {
     DO_STAT(size_dex_file_alignment_);
     DO_STAT(size_executable_offset_alignment_);
     DO_STAT(size_oat_header_);
-    DO_STAT(size_oat_header_image_file_location_);
+    DO_STAT(size_oat_header_key_value_store_);
     DO_STAT(size_dex_file_);
     DO_STAT(size_interpreter_to_interpreter_bridge_);
     DO_STAT(size_interpreter_to_compiled_code_bridge_);
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index dbecb95362..3d34956651 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -79,9 +79,9 @@ class OatWriter {
   OatWriter(const std::vector<const DexFile*>& dex_files,
             uint32_t image_file_location_oat_checksum,
             uintptr_t image_file_location_oat_begin,
-            const std::string& image_file_location,
             const CompilerDriver* compiler,
-            TimingLogger* timings);
+            TimingLogger* timings,
+            SafeMap<std::string, std::string>* key_value_store);
 
   const OatHeader& GetOatHeader() const {
     return *oat_header_;
@@ -253,9 +253,9 @@ class OatWriter {
   // dependencies on the image.
   uint32_t image_file_location_oat_checksum_;
   uintptr_t image_file_location_oat_begin_;
-  std::string image_file_location_;
 
   // data to write
+  SafeMap<std::string, std::string>* key_value_store_;
   OatHeader* oat_header_;
   std::vector<OatDexFile*> oat_dex_files_;
   std::vector<OatClass*> oat_classes_;
@@ -274,7 +274,7 @@ class OatWriter {
   uint32_t size_dex_file_alignment_;
   uint32_t size_executable_offset_alignment_;
   uint32_t size_oat_header_;
-  uint32_t size_oat_header_image_file_location_;
+  uint32_t size_oat_header_key_value_store_;
   uint32_t size_dex_file_;
   uint32_t size_interpreter_to_interpreter_bridge_;
   uint32_t size_interpreter_to_compiled_code_bridge_;
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 80e77245fc..7f6c752e88 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -48,6 +48,7 @@
 #include "gc/space/image_space.h"
 #include "gc/space/space-inl.h"
 #include "image_writer.h"
+#include "implicit_check_options.h"
 #include "leb128.h"
 #include "mirror/art_method-inl.h"
 #include "mirror/class-inl.h"
@@ -336,7 +337,10 @@ class Dex2Oat {
                                       bool dump_passes,
                                       TimingLogger& timings,
                                       CumulativeLogger& compiler_phases_timings,
-                                      std::string profile_file) {
+                                      std::string profile_file,
+                                      SafeMap<std::string, std::string>* key_value_store) {
+    CHECK(key_value_store != nullptr);
+
     // Handle and ClassLoader creation needs to come after Runtime::Create
     jobject class_loader = nullptr;
     Thread* self = Thread::Current();
@@ -356,18 +360,18 @@ class Dex2Oat {
     }
 
     std::unique_ptr<CompilerDriver> driver(new CompilerDriver(compiler_options_,
-                                                        verification_results_,
-                                                        method_inliner_map_,
-                                                        compiler_kind_,
-                                                        instruction_set_,
-                                                        instruction_set_features_,
-                                                        image,
-                                                        image_classes.release(),
-                                                        thread_count_,
-                                                        dump_stats,
-                                                        dump_passes,
-                                                        &compiler_phases_timings,
-                                                        profile_file));
+                                                              verification_results_,
+                                                              method_inliner_map_,
+                                                              compiler_kind_,
+                                                              instruction_set_,
+                                                              instruction_set_features_,
+                                                              image,
+                                                              image_classes.release(),
+                                                              thread_count_,
+                                                              dump_stats,
+                                                              dump_passes,
+                                                              &compiler_phases_timings,
+                                                              profile_file));
 
     driver->GetCompiler()->SetBitcodeFileName(*driver.get(), bitcode_filename);
 
@@ -386,11 +390,15 @@ class Dex2Oat {
       image_file_location = image_space->GetImageFilename();
     }
 
+    if (!image_file_location.empty()) {
+      key_value_store->Put(OatHeader::kImageLocationKey, image_file_location);
+    }
+
     OatWriter oat_writer(dex_files, image_file_location_oat_checksum,
                          image_file_location_oat_data_begin,
-                         image_file_location,
                          driver.get(),
-                         &timings);
+                         &timings,
+                         key_value_store);
 
     t2.NewTiming("Writing ELF");
     if (!driver->WriteElf(android_root, is_host, dex_files, &oat_writer, oat_file)) {
@@ -1167,8 +1175,8 @@ static int dex2oat(int argc, char** argv) {
     Usage("Unknown --compiler-filter value %s", compiler_filter_string);
   }
 
-  CheckExplicitCheckOptions(instruction_set, &explicit_null_checks, &explicit_so_checks,
-                            &explicit_suspend_checks);
+  ImplicitCheckOptions::Check(instruction_set, &explicit_null_checks, &explicit_so_checks,
+                              &explicit_suspend_checks);
 
   if (!explicit_include_patch_information) {
     include_patch_information =
@@ -1262,24 +1270,15 @@ static int dex2oat(int argc, char** argv) {
   // TODO: Not sure whether it's a good idea to allow anything else but the runtime option in
   // this case at all, as we'll have to throw away produced code for a mismatch.
   if (!has_explicit_checks_options) {
-    bool cross_compiling = true;
-    switch (kRuntimeISA) {
-      case kArm:
-      case kThumb2:
-        cross_compiling = instruction_set != kArm && instruction_set != kThumb2;
-        break;
-      default:
-        cross_compiling = instruction_set != kRuntimeISA;
-        break;
-    }
-    if (!cross_compiling) {
-      Runtime* runtime = Runtime::Current();
-      compiler_options.SetExplicitNullChecks(runtime->ExplicitNullChecks());
-      compiler_options.SetExplicitStackOverflowChecks(runtime->ExplicitStackOverflowChecks());
-      compiler_options.SetExplicitSuspendChecks(runtime->ExplicitSuspendChecks());
+    if (ImplicitCheckOptions::CheckForCompiling(kRuntimeISA, instruction_set, &explicit_null_checks,
+                                                &explicit_so_checks, &explicit_suspend_checks)) {
+      compiler_options.SetExplicitNullChecks(explicit_null_checks);
+      compiler_options.SetExplicitStackOverflowChecks(explicit_so_checks);
+      compiler_options.SetExplicitSuspendChecks(explicit_suspend_checks);
     }
   }
 
+
   // Runtime::Create acquired the mutator_lock_ that is normally given away when we Runtime::Start,
   // give it away now so that we don't starve GC.
   Thread* self = Thread::Current();
@@ -1378,19 +1377,43 @@ static int dex2oat(int argc, char** argv) {
     }
   }
 
+  // Fill some values into the key-value store for the oat header.
+  SafeMap<std::string, std::string> key_value_store;
+
+  // Insert implicit check options.
+  key_value_store.Put(ImplicitCheckOptions::kImplicitChecksOatHeaderKey,
+                      ImplicitCheckOptions::Serialize(compiler_options.GetExplicitNullChecks(),
+                                                      compiler_options.
+                                                          GetExplicitStackOverflowChecks(),
+                                                      compiler_options.GetExplicitSuspendChecks()));
+
+  // Insert the dex2oat command line and the dex2oat host ISA (kRuntimeISA).
+  std::ostringstream oss;
+  for (int i = 0; i < argc; ++i) {
+    if (i > 0) {
+      oss << ' ';
+    }
+    oss << argv[i];
+  }
+  key_value_store.Put(OatHeader::kDex2OatCmdLineKey, oss.str());
+  oss.str("");  // Reset.
+  oss << kRuntimeISA;
+  key_value_store.Put(OatHeader::kDex2OatHostKey, oss.str());
+
   std::unique_ptr<const CompilerDriver> compiler(dex2oat->CreateOatFile(boot_image_option,
-                                                                  android_root,
-                                                                  is_host,
-                                                                  dex_files,
-                                                                  oat_file.get(),
-                                                                  bitcode_filename,
-                                                                  image,
-                                                                  image_classes,
-                                                                  dump_stats,
-                                                                  dump_passes,
-                                                                  timings,
-                                                                  compiler_phases_timings,
-                                                                  profile_file));
+                                                                        android_root,
+                                                                        is_host,
+                                                                        dex_files,
+                                                                        oat_file.get(),
+                                                                        bitcode_filename,
+                                                                        image,
+                                                                        image_classes,
+                                                                        dump_stats,
+                                                                        dump_passes,
+                                                                        timings,
+                                                                        compiler_phases_timings,
+                                                                        profile_file,
+                                                                        &key_value_store));
 
   if (compiler.get() == nullptr) {
     LOG(ERROR) << "Failed to create oat file: " << oat_location;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index e6cbf05744..a6f9a8a22b 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -201,7 +201,8 @@ size_t DisassemblerX86::DumpInstruction(std::ostream& os, const uint8_t* instr)
   std::ostringstream opcode;
   bool store = false;  // stores to memory (ie rm is on the left)
   bool load = false;  // loads from memory (ie rm is on the right)
-  bool byte_operand = false;
+  bool byte_operand = false;  // true when the opcode is dealing with byte operands
+  bool byte_second_operand = false;  // true when the source operand is a byte register but the target register isn't (ie movsxb/movzxb).
   bool target_specific = false;  // register name depends on target (64 vs 32 bits).
   bool ax = false;  // implicit use of ax
   bool cx = false;  // implicit use of cx
@@ -426,6 +427,20 @@ DISASSEMBLER_ENTRY(cmp,
         instr++;
         if (prefix[2] == 0x66) {
           switch (*instr) {
+            case 0x01:
+              opcode << "phaddw";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x02:
+              opcode << "phaddd";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
             case 0x40:
               opcode << "pmulld";
               prefix[2] = 0;
@@ -449,7 +464,7 @@ DISASSEMBLER_ENTRY(cmp,
               prefix[2] = 0;
               has_modrm = true;
               store = true;
-              dst_reg_file = SSE;
+              src_reg_file = SSE;
               immediate_bytes = 1;
               break;
             case 0x16:
@@ -457,7 +472,7 @@ DISASSEMBLER_ENTRY(cmp,
               prefix[2] = 0;
               has_modrm = true;
               store = true;
-              dst_reg_file = SSE;
+              src_reg_file = SSE;
               immediate_bytes = 1;
               break;
             default:
@@ -732,9 +747,9 @@ DISASSEMBLER_ENTRY(cmp,
         break;
       case 0xAF: opcode << "imul"; has_modrm = true; load = true; break;
       case 0xB1: opcode << "cmpxchg"; has_modrm = true; store = true; break;
-      case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; break;
+      case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; byte_second_operand = true; break;
       case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
-      case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break;
+      case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; byte_second_operand = true; rex |= (rex == 0 ? 0 : 0b1000); break;
       case 0xBF: opcode << "movsxw"; has_modrm = true; load = true; break;
       case 0xC5:
         if (prefix[2] == 0x66) {
@@ -742,7 +757,7 @@ DISASSEMBLER_ENTRY(cmp,
           prefix[2] = 0;
           has_modrm = true;
           store = true;
-          src_reg_file = dst_reg_file = SSE;
+          src_reg_file = SSE;
           immediate_bytes = 1;
         } else {
           opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
@@ -1124,7 +1139,8 @@ DISASSEMBLER_ENTRY(cmp,
     } else {
       if (mod == 3) {
         if (!no_ops) {
-          DumpRmReg(address, rex_w, rm, byte_operand, prefix[2], load ? src_reg_file : dst_reg_file);
+          DumpRmReg(address, rex_w, rm, byte_operand || byte_second_operand,
+                    prefix[2], load ? src_reg_file : dst_reg_file);
         }
       } else {
         address << "[";
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 12970fcaab..631d53825f 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -171,10 +171,18 @@ class OatDumper {
     os << "IMAGE FILE LOCATION OAT BEGIN:\n";
     os << StringPrintf("0x%08x\n\n", oat_header.GetImageFileLocationOatDataBegin());
 
-    os << "IMAGE FILE LOCATION:\n";
-    const std::string image_file_location(oat_header.GetImageFileLocation());
-    os << image_file_location;
-    os << "\n\n";
+    // Print the key-value store.
+    {
+      os << "KEY VALUE STORE:\n";
+      size_t index = 0;
+      const char* key;
+      const char* value;
+      while (oat_header.GetStoreKeyValuePairByIndex(index, &key, &value)) {
+        os << key << " = " << value << "\n";
+        index++;
+      }
+      os << "\n";
+    }
 
     os << "BEGIN:\n";
     os << reinterpret_cast<const void*>(oat_file_.Begin()) << "\n\n";
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index ea4b880b83..dcf8c70501 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -64,15 +64,14 @@ static InstructionSet ElfISAToInstructionSet(Elf32_Word isa) {
 
 bool PatchOat::Patch(const std::string& image_location, off_t delta,
                      File* output_image, InstructionSet isa,
-                     TimingLogger& timings) {
-  std::string error_msg;
+                     TimingLogger* timings) {
   CHECK(Runtime::Current() == nullptr);
   CHECK(output_image != nullptr);
   CHECK_GE(output_image->Fd(), 0);
   CHECK(!image_location.empty()) << "image file must have a filename.";
   CHECK_NE(isa, kNone);
 
-  TimingLogger::ScopedTiming t("Runtime Setup", &timings);
+  TimingLogger::ScopedTiming t("Runtime Setup", timings);
   const char *isa_name = GetInstructionSetString(isa);
   std::string image_filename(GetSystemImageFilename(image_location.c_str(), isa));
   std::unique_ptr<File> input_image(OS::OpenFileForReading(image_filename.c_str()));
@@ -110,6 +109,7 @@ bool PatchOat::Patch(const std::string& image_location, off_t delta,
 
   t.NewTiming("Image and oat Patching setup");
   // Create the map where we will write the image patches to.
+  std::string error_msg;
   std::unique_ptr<MemMap> image(MemMap::MapFile(image_len, PROT_READ | PROT_WRITE, MAP_PRIVATE,
                                                 input_image->Fd(), 0,
                                                 input_image->GetPath().c_str(),
@@ -137,8 +137,7 @@ bool PatchOat::Patch(const std::string& image_location, off_t delta,
 
 bool PatchOat::Patch(const File* input_oat, const std::string& image_location, off_t delta,
                      File* output_oat, File* output_image, InstructionSet isa,
-                     TimingLogger& timings) {
-  std::string error_msg;
+                     TimingLogger* timings) {
   CHECK(Runtime::Current() == nullptr);
   CHECK(output_image != nullptr);
   CHECK_GE(output_image->Fd(), 0);
@@ -148,7 +147,7 @@ bool PatchOat::Patch(const File* input_oat, const std::string& image_location, o
   CHECK_GE(output_oat->Fd(), 0);
   CHECK(!image_location.empty()) << "image file must have a filename.";
 
-  TimingLogger::ScopedTiming t("Runtime Setup", &timings);
+  TimingLogger::ScopedTiming t("Runtime Setup", timings);
 
   if (isa == kNone) {
     Elf32_Ehdr elf_hdr;
@@ -194,6 +193,7 @@ bool PatchOat::Patch(const File* input_oat, const std::string& image_location, o
 
   t.NewTiming("Image and oat Patching setup");
   // Create the map where we will write the image patches to.
+  std::string error_msg;
   std::unique_ptr<MemMap> image(MemMap::MapFile(image_len, PROT_READ | PROT_WRITE, MAP_PRIVATE,
                                                 input_image->Fd(), 0,
                                                 input_image->GetPath().c_str(),
@@ -234,7 +234,7 @@ bool PatchOat::Patch(const File* input_oat, const std::string& image_location, o
 }
 
 bool PatchOat::WriteElf(File* out) {
-  TimingLogger::ScopedTiming t("Writing Elf File", &timings_);
+  TimingLogger::ScopedTiming t("Writing Elf File", timings_);
   CHECK(oat_file_.get() != nullptr);
   CHECK(out != nullptr);
   size_t expect = oat_file_->Size();
@@ -248,7 +248,7 @@ bool PatchOat::WriteElf(File* out) {
 }
 
 bool PatchOat::WriteImage(File* out) {
-  TimingLogger::ScopedTiming t("Writing image File", &timings_);
+  TimingLogger::ScopedTiming t("Writing image File", timings_);
   CHECK(image_ != nullptr);
   CHECK(out != nullptr);
   size_t expect = image_->Size();
@@ -275,7 +275,7 @@ bool PatchOat::PatchImage() {
   }
 
   {
-    TimingLogger::ScopedTiming t("Walk Bitmap", &timings_);
+    TimingLogger::ScopedTiming t("Walk Bitmap", timings_);
     // Walk the bitmap.
     WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
     bitmap_->Walk(PatchOat::BitmapCallback, this);
@@ -348,7 +348,7 @@ void PatchOat::VisitObject(mirror::Object* object) {
 
 void PatchOat::FixupMethod(mirror::ArtMethod* object, mirror::ArtMethod* copy) {
   // Just update the entry points if it looks like we should.
-  // TODO sanity check all the pointers' values
+  // TODO: sanity check all the pointers' values
   uintptr_t portable = reinterpret_cast<uintptr_t>(
       object->GetEntryPointFromPortableCompiledCode<kVerifyNone>());
   if (portable != 0) {
@@ -377,12 +377,12 @@ void PatchOat::FixupMethod(mirror::ArtMethod* object, mirror::ArtMethod* copy) {
   }
 }
 
-bool PatchOat::Patch(File* input_oat, off_t delta, File* output_oat, TimingLogger& timings) {
+bool PatchOat::Patch(File* input_oat, off_t delta, File* output_oat, TimingLogger* timings) {
   CHECK(input_oat != nullptr);
   CHECK(output_oat != nullptr);
   CHECK_GE(input_oat->Fd(), 0);
   CHECK_GE(output_oat->Fd(), 0);
-  TimingLogger::ScopedTiming t("Setup Oat File Patching", &timings);
+  TimingLogger::ScopedTiming t("Setup Oat File Patching", timings);
 
   std::string error_msg;
   std::unique_ptr<ElfFile> elf(ElfFile::Open(const_cast<File*>(input_oat),
@@ -437,7 +437,7 @@ bool PatchOat::CheckOatFile() {
 }
 
 bool PatchOat::PatchElf() {
-  TimingLogger::ScopedTiming t("Fixup Elf Headers", &timings_);
+  TimingLogger::ScopedTiming t("Fixup Elf Headers", timings_);
   // Fixup Phdr's
   for (unsigned int i = 0; i < oat_file_->GetProgramHeaderNum(); i++) {
     Elf32_Phdr& hdr = oat_file_->GetProgramHeader(i);
@@ -623,28 +623,28 @@ static void Usage(const char *fmt, ...) {
   exit(EXIT_FAILURE);
 }
 
-static bool ReadBaseDelta(const char* name, off_t* delta, std::string& error_msg) {
+static bool ReadBaseDelta(const char* name, off_t* delta, std::string* error_msg) {
   CHECK(name != nullptr);
   CHECK(delta != nullptr);
   std::unique_ptr<File> file;
   if (OS::FileExists(name)) {
     file.reset(OS::OpenFileForReading(name));
     if (file.get() == nullptr) {
-      error_msg = "Failed to open file %s for reading";
+      *error_msg = "Failed to open file %s for reading";
       return false;
     }
   } else {
-    error_msg = "File %s does not exist";
+    *error_msg = "File %s does not exist";
     return false;
   }
   CHECK(file.get() != nullptr);
   ImageHeader hdr;
   if (sizeof(hdr) != file->Read(reinterpret_cast<char*>(&hdr), sizeof(hdr), 0)) {
-    error_msg = "Failed to read file %s";
+    *error_msg = "Failed to read file %s";
     return false;
   }
   if (!hdr.IsValid()) {
-    error_msg = "%s does not contain a valid image header.";
+    *error_msg = "%s does not contain a valid image header.";
     return false;
   }
   *delta = hdr.GetPatchDelta();
@@ -661,7 +661,7 @@ static File* CreateOrOpen(const char* name, bool* created) {
   }
 }
 
-int patchoat(int argc, char **argv) {
+static int patchoat(int argc, char **argv) {
   InitLogging(argv);
   const bool debug = kIsDebugBuild;
   orig_argc = argc;
@@ -712,7 +712,7 @@ int patchoat(int argc, char **argv) {
     if (log_options) {
       LOG(INFO) << "patchoat: option[" << i << "]=" << argv[i];
     }
-    // TODO GetInstructionSetFromString shouldn't LOG(FATAL).
+    // TODO: GetInstructionSetFromString shouldn't LOG(FATAL).
     if (option.starts_with("--instruction-set=")) {
       isa_set = true;
       const char* isa_str = option.substr(strlen("--instruction-set=")).data();
@@ -921,7 +921,7 @@ int patchoat(int argc, char **argv) {
     } else if (!patched_image_filename.empty()) {
       base_delta_set = true;
       std::string error_msg;
-      if (!ReadBaseDelta(patched_image_filename.c_str(), &base_delta, error_msg)) {
+      if (!ReadBaseDelta(patched_image_filename.c_str(), &base_delta, &error_msg)) {
         Usage(error_msg.c_str(), patched_image_filename.c_str());
       }
     } else {
@@ -1000,14 +1000,14 @@ int patchoat(int argc, char **argv) {
   if (have_image_files && have_oat_files) {
     TimingLogger::ScopedTiming pt("patch image and oat", &timings);
     ret = PatchOat::Patch(input_oat.get(), input_image_location, base_delta,
-                          output_oat.get(), output_image.get(), isa, timings);
+                          output_oat.get(), output_image.get(), isa, &timings);
   } else if (have_oat_files) {
     TimingLogger::ScopedTiming pt("patch oat", &timings);
-    ret = PatchOat::Patch(input_oat.get(), base_delta, output_oat.get(), timings);
+    ret = PatchOat::Patch(input_oat.get(), base_delta, output_oat.get(), &timings);
   } else {
     TimingLogger::ScopedTiming pt("patch image", &timings);
     CHECK(have_image_files);
-    ret = PatchOat::Patch(input_image_location, base_delta, output_image.get(), isa, timings);
+    ret = PatchOat::Patch(input_image_location, base_delta, output_image.get(), isa, &timings);
   }
   cleanup(ret);
   return (ret) ? EXIT_SUCCESS : EXIT_FAILURE;
diff --git a/patchoat/patchoat.h b/patchoat/patchoat.h
index b9f36d4f8d..a63e6f44b8 100644
--- a/patchoat/patchoat.h
+++ b/patchoat/patchoat.h
@@ -36,38 +36,29 @@ class Object;
 class Reference;
 class Class;
 class ArtMethod;
-};
-
-int patchoat(int argc, char** argv);
+};  // namespace mirror
 
 class PatchOat {
  public:
-  static bool Patch(File* oat_in, off_t delta, File* oat_out, TimingLogger& timings);
+  static bool Patch(File* oat_in, off_t delta, File* oat_out, TimingLogger* timings);
 
   static bool Patch(const std::string& art_location, off_t delta, File* art_out, InstructionSet isa,
-                    TimingLogger& timings);
+                    TimingLogger* timings);
 
   static bool Patch(const File* oat_in, const std::string& art_location,
                     off_t delta, File* oat_out, File* art_out, InstructionSet isa,
-                    TimingLogger& timings);
+                    TimingLogger* timings);
 
  private:
-  std::unique_ptr<ElfFile> oat_file_;
-  MemMap* image_;
-  gc::accounting::ContinuousSpaceBitmap* bitmap_;
-  MemMap* heap_;
-  off_t delta_;
-  TimingLogger& timings_;
-
   // Takes ownership only of the ElfFile. All other pointers are only borrowed.
-  PatchOat(ElfFile* oat_file, off_t delta, TimingLogger& timings)
+  PatchOat(ElfFile* oat_file, off_t delta, TimingLogger* timings)
       : oat_file_(oat_file), delta_(delta), timings_(timings) {}
   PatchOat(MemMap* image, gc::accounting::ContinuousSpaceBitmap* bitmap,
-           MemMap* heap, off_t delta, TimingLogger& timings)
+           MemMap* heap, off_t delta, TimingLogger* timings)
       : image_(image), bitmap_(bitmap), heap_(heap),
         delta_(delta), timings_(timings) {}
   PatchOat(ElfFile* oat_file, MemMap* image, gc::accounting::ContinuousSpaceBitmap* bitmap,
-           MemMap* heap, off_t delta, TimingLogger& timings)
+           MemMap* heap, off_t delta, TimingLogger* timings)
       : oat_file_(oat_file), image_(image), bitmap_(bitmap), heap_(heap),
         delta_(delta), timings_(timings) {}
   ~PatchOat() {}
@@ -98,6 +89,8 @@ class PatchOat {
   mirror::Object* RelocatedCopyOf(mirror::Object*);
   mirror::Object* RelocatedAddressOf(mirror::Object* obj);
 
+  // Walks through the old image and patches the mmap'd copy of it to the new offset. It does not
+  // change the heap.
   class PatchVisitor {
   public:
     PatchVisitor(PatchOat* patcher, mirror::Object* copy) : patcher_(patcher), copy_(copy) {}
@@ -112,6 +105,18 @@ class PatchOat {
     mirror::Object* copy_;
   };
 
+  // The elf file we are patching.
+  std::unique_ptr<ElfFile> oat_file_;
+  // A mmap of the image we are patching. The mapped memory (not the MemMap itself) is modified.
+  const MemMap* image_;
+  // The bitmap of the heap we are patching. This is not modified.
+  gc::accounting::ContinuousSpaceBitmap* bitmap_;
+  // The heap we are patching. This is not modified.
+  const MemMap* heap_;
+  // The amount we are changing the offset by.
+  off_t delta_;
+  TimingLogger* timings_;
+
   DISALLOW_IMPLICIT_CONSTRUCTORS(PatchOat);
 };
 
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 60453c3cc1..860cbd207e 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -581,10 +581,6 @@ bool ClassLinker::GenerateOatFile(const char* dex_filename,
   std::vector<std::string> argv;
   argv.push_back(dex2oat);
   argv.push_back("--runtime-arg");
-  argv.push_back("-Xms64m");
-  argv.push_back("--runtime-arg");
-  argv.push_back("-Xmx64m");
-  argv.push_back("--runtime-arg");
   argv.push_back("-classpath");
   argv.push_back("--runtime-arg");
   argv.push_back(Runtime::Current()->GetClassPathString());
@@ -1163,7 +1159,9 @@ void ClassLinker::InitFromImage() {
   OatFile& oat_file = GetImageOatFile(space);
   CHECK_EQ(oat_file.GetOatHeader().GetImageFileLocationOatChecksum(), 0U);
   CHECK_EQ(oat_file.GetOatHeader().GetImageFileLocationOatDataBegin(), 0U);
-  CHECK(oat_file.GetOatHeader().GetImageFileLocation().empty());
+  const char* image_file_location = oat_file.GetOatHeader().
+      GetStoreValueByKey(OatHeader::kImageLocationKey);
+  CHECK(image_file_location == nullptr || *image_file_location == 0);
   portable_resolution_trampoline_ = oat_file.GetOatHeader().GetPortableResolutionTrampoline();
   quick_resolution_trampoline_ = oat_file.GetOatHeader().GetQuickResolutionTrampoline();
   portable_imt_conflict_trampoline_ = oat_file.GetOatHeader().GetPortableImtConflictTrampoline();
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 11415b49e0..6161aff647 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -170,27 +170,47 @@ class AllocRecord {
   AllocRecordStackTraceElement stack_[kMaxAllocRecordStackDepth];  // Unused entries have NULL method.
 };
 
-struct Breakpoint {
-  // The location of this breakpoint.
-  mirror::ArtMethod* method;
-  uint32_t dex_pc;
+class Breakpoint {
+ public:
+  Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc, bool need_full_deoptimization)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+    : method_(nullptr), dex_pc_(dex_pc), need_full_deoptimization_(need_full_deoptimization) {
+    ScopedObjectAccessUnchecked soa(Thread::Current());
+    method_ = soa.EncodeMethod(method);
+  }
 
-  // Indicates whether breakpoint needs full deoptimization or selective deoptimization.
-  bool need_full_deoptimization;
+  Breakpoint(const Breakpoint& other) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+    : method_(nullptr), dex_pc_(other.dex_pc_),
+      need_full_deoptimization_(other.need_full_deoptimization_) {
+    ScopedObjectAccessUnchecked soa(Thread::Current());
+    method_ = soa.EncodeMethod(other.Method());
+  }
 
-  Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc, bool need_full_deoptimization)
-    : method(method), dex_pc(dex_pc), need_full_deoptimization(need_full_deoptimization) {}
+  mirror::ArtMethod* Method() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    ScopedObjectAccessUnchecked soa(Thread::Current());
+    return soa.DecodeMethod(method_);
+  }
 
-  void VisitRoots(RootCallback* callback, void* arg) {
-    if (method != nullptr) {
-      callback(reinterpret_cast<mirror::Object**>(&method), arg, 0, kRootDebugger);
-    }
+  uint32_t DexPc() const {
+    return dex_pc_;
+  }
+
+  bool NeedFullDeoptimization() const {
+    return need_full_deoptimization_;
   }
+
+ private:
+  // The location of this breakpoint.
+  jmethodID method_;
+  uint32_t dex_pc_;
+
+  // Indicates whether breakpoint needs full deoptimization or selective deoptimization.
+  bool need_full_deoptimization_;
 };
 
-static std::ostream& operator<<(std::ostream& os, const Breakpoint& rhs)
+static std::ostream& operator<<(std::ostream& os, Breakpoint& rhs)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  os << StringPrintf("Breakpoint[%s @%#x]", PrettyMethod(rhs.method).c_str(), rhs.dex_pc);
+  os << StringPrintf("Breakpoint[%s @%#x]", PrettyMethod(rhs.Method()).c_str(), rhs.DexPc());
   return os;
 }
 
@@ -349,18 +369,12 @@ void SingleStepControl::Clear() {
   dex_pcs.clear();
 }
 
-void DeoptimizationRequest::VisitRoots(RootCallback* callback, void* arg) {
-  if (method != nullptr) {
-    callback(reinterpret_cast<mirror::Object**>(&method), arg, 0, kRootDebugger);
-  }
-}
-
 static bool IsBreakpoint(const mirror::ArtMethod* m, uint32_t dex_pc)
     LOCKS_EXCLUDED(Locks::breakpoint_lock_)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_);
   for (size_t i = 0, e = gBreakpoints.size(); i < e; ++i) {
-    if (gBreakpoints[i].method == m && gBreakpoints[i].dex_pc == dex_pc) {
+    if (gBreakpoints[i].DexPc() == dex_pc && gBreakpoints[i].Method() == m) {
       VLOG(jdwp) << "Hit breakpoint #" << i << ": " << gBreakpoints[i];
       return true;
     }
@@ -641,21 +655,6 @@ void Dbg::StartJdwp() {
   }
 }
 
-void Dbg::VisitRoots(RootCallback* callback, void* arg) {
-  {
-    MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_);
-    for (Breakpoint& bp : gBreakpoints) {
-      bp.VisitRoots(callback, arg);
-    }
-  }
-  if (deoptimization_lock_ != nullptr) {  // only true if the debugger is started.
-    MutexLock mu(Thread::Current(), *deoptimization_lock_);
-    for (DeoptimizationRequest& req : deoptimization_requests_) {
-      req.VisitRoots(callback, arg);
-    }
-  }
-}
-
 void Dbg::StopJdwp() {
   // Prevent the JDWP thread from processing JDWP incoming packets after we close the connection.
   Disposed();
@@ -2844,22 +2843,22 @@ size_t* Dbg::GetReferenceCounterForEvent(uint32_t instrumentation_event) {
 // Process request while all mutator threads are suspended.
 void Dbg::ProcessDeoptimizationRequest(const DeoptimizationRequest& request) {
   instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation();
-  switch (request.kind) {
+  switch (request.GetKind()) {
     case DeoptimizationRequest::kNothing:
       LOG(WARNING) << "Ignoring empty deoptimization request.";
       break;
     case DeoptimizationRequest::kRegisterForEvent:
       VLOG(jdwp) << StringPrintf("Add debugger as listener for instrumentation event 0x%x",
-                                 request.instrumentation_event);
-      instrumentation->AddListener(&gDebugInstrumentationListener, request.instrumentation_event);
-      instrumentation_events_ |= request.instrumentation_event;
+                                 request.InstrumentationEvent());
+      instrumentation->AddListener(&gDebugInstrumentationListener, request.InstrumentationEvent());
+      instrumentation_events_ |= request.InstrumentationEvent();
       break;
     case DeoptimizationRequest::kUnregisterForEvent:
       VLOG(jdwp) << StringPrintf("Remove debugger as listener for instrumentation event 0x%x",
-                                 request.instrumentation_event);
+                                 request.InstrumentationEvent());
       instrumentation->RemoveListener(&gDebugInstrumentationListener,
-                                      request.instrumentation_event);
-      instrumentation_events_ &= ~request.instrumentation_event;
+                                      request.InstrumentationEvent());
+      instrumentation_events_ &= ~request.InstrumentationEvent();
       break;
     case DeoptimizationRequest::kFullDeoptimization:
       VLOG(jdwp) << "Deoptimize the world ...";
@@ -2872,17 +2871,17 @@ void Dbg::ProcessDeoptimizationRequest(const DeoptimizationRequest& request) {
       VLOG(jdwp) << "Undeoptimize the world DONE";
       break;
     case DeoptimizationRequest::kSelectiveDeoptimization:
-      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " ...";
-      instrumentation->Deoptimize(request.method);
-      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " DONE";
+      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.Method()) << " ...";
+      instrumentation->Deoptimize(request.Method());
+      VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.Method()) << " DONE";
       break;
     case DeoptimizationRequest::kSelectiveUndeoptimization:
-      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " ...";
-      instrumentation->Undeoptimize(request.method);
-      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " DONE";
+      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.Method()) << " ...";
+      instrumentation->Undeoptimize(request.Method());
+      VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.Method()) << " DONE";
       break;
     default:
-      LOG(FATAL) << "Unsupported deoptimization request kind " << request.kind;
+      LOG(FATAL) << "Unsupported deoptimization request kind " << request.GetKind();
       break;
   }
 }
@@ -2899,8 +2898,8 @@ void Dbg::ProcessDelayedFullUndeoptimizations() {
     MutexLock mu(Thread::Current(), *deoptimization_lock_);
     while (delayed_full_undeoptimization_count_ > 0) {
       DeoptimizationRequest req;
-      req.kind = DeoptimizationRequest::kFullUndeoptimization;
-      req.method = nullptr;
+      req.SetKind(DeoptimizationRequest::kFullUndeoptimization);
+      req.SetMethod(nullptr);
       RequestDeoptimizationLocked(req);
       --delayed_full_undeoptimization_count_;
     }
@@ -2909,7 +2908,7 @@ void Dbg::ProcessDelayedFullUndeoptimizations() {
 }
 
 void Dbg::RequestDeoptimization(const DeoptimizationRequest& req) {
-  if (req.kind == DeoptimizationRequest::kNothing) {
+  if (req.GetKind() == DeoptimizationRequest::kNothing) {
     // Nothing to do.
     return;
   }
@@ -2918,35 +2917,35 @@ void Dbg::RequestDeoptimization(const DeoptimizationRequest& req) {
 }
 
 void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) {
-  switch (req.kind) {
+  switch (req.GetKind()) {
     case DeoptimizationRequest::kRegisterForEvent: {
-      DCHECK_NE(req.instrumentation_event, 0u);
-      size_t* counter = GetReferenceCounterForEvent(req.instrumentation_event);
+      DCHECK_NE(req.InstrumentationEvent(), 0u);
+      size_t* counter = GetReferenceCounterForEvent(req.InstrumentationEvent());
       CHECK(counter != nullptr) << StringPrintf("No counter for instrumentation event 0x%x",
-                                                req.instrumentation_event);
+                                                req.InstrumentationEvent());
       if (*counter == 0) {
         VLOG(jdwp) << StringPrintf("Queue request #%zd to start listening to instrumentation event 0x%x",
-                                   deoptimization_requests_.size(), req.instrumentation_event);
+                                   deoptimization_requests_.size(), req.InstrumentationEvent());
         deoptimization_requests_.push_back(req);
       }
       *counter = *counter + 1;
       break;
     }
     case DeoptimizationRequest::kUnregisterForEvent: {
-      DCHECK_NE(req.instrumentation_event, 0u);
-      size_t* counter = GetReferenceCounterForEvent(req.instrumentation_event);
+      DCHECK_NE(req.InstrumentationEvent(), 0u);
+      size_t* counter = GetReferenceCounterForEvent(req.InstrumentationEvent());
       CHECK(counter != nullptr) << StringPrintf("No counter for instrumentation event 0x%x",
-                                                req.instrumentation_event);
+                                                req.InstrumentationEvent());
       *counter = *counter - 1;
       if (*counter == 0) {
         VLOG(jdwp) << StringPrintf("Queue request #%zd to stop listening to instrumentation event 0x%x",
-                                   deoptimization_requests_.size(), req.instrumentation_event);
+                                   deoptimization_requests_.size(), req.InstrumentationEvent());
         deoptimization_requests_.push_back(req);
       }
       break;
     }
     case DeoptimizationRequest::kFullDeoptimization: {
-      DCHECK(req.method == nullptr);
+      DCHECK(req.Method() == nullptr);
       if (full_deoptimization_event_count_ == 0) {
         VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
                    << " for full deoptimization";
@@ -2956,7 +2955,7 @@ void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) {
       break;
     }
     case DeoptimizationRequest::kFullUndeoptimization: {
-      DCHECK(req.method == nullptr);
+      DCHECK(req.Method() == nullptr);
       DCHECK_GT(full_deoptimization_event_count_, 0U);
       --full_deoptimization_event_count_;
       if (full_deoptimization_event_count_ == 0) {
@@ -2967,21 +2966,21 @@ void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) {
       break;
     }
     case DeoptimizationRequest::kSelectiveDeoptimization: {
-      DCHECK(req.method != nullptr);
+      DCHECK(req.Method() != nullptr);
       VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
-                 << " for deoptimization of " << PrettyMethod(req.method);
+                 << " for deoptimization of " << PrettyMethod(req.Method());
       deoptimization_requests_.push_back(req);
       break;
     }
     case DeoptimizationRequest::kSelectiveUndeoptimization: {
-      DCHECK(req.method != nullptr);
+      DCHECK(req.Method() != nullptr);
       VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size()
-                 << " for undeoptimization of " << PrettyMethod(req.method);
+                 << " for undeoptimization of " << PrettyMethod(req.Method());
       deoptimization_requests_.push_back(req);
       break;
     }
     default: {
-      LOG(FATAL) << "Unknown deoptimization request kind " << req.kind;
+      LOG(FATAL) << "Unknown deoptimization request kind " << req.GetKind();
       break;
     }
   }
@@ -3005,7 +3004,7 @@ void Dbg::ManageDeoptimization() {
   {
     MutexLock mu(self, *deoptimization_lock_);
     size_t req_index = 0;
-    for (const DeoptimizationRequest& request : deoptimization_requests_) {
+    for (DeoptimizationRequest& request : deoptimization_requests_) {
       VLOG(jdwp) << "Process deoptimization request #" << req_index++;
       ProcessDeoptimizationRequest(request);
     }
@@ -3036,9 +3035,9 @@ static bool IsMethodPossiblyInlined(Thread* self, mirror::ArtMethod* m)
 }
 
 static const Breakpoint* FindFirstBreakpointForMethod(mirror::ArtMethod* m)
-    EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_) {
-  for (const Breakpoint& breakpoint : gBreakpoints) {
-    if (breakpoint.method == m) {
+    EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  for (Breakpoint& breakpoint : gBreakpoints) {
+    if (breakpoint.Method() == m) {
       return &breakpoint;
     }
   }
@@ -3050,7 +3049,7 @@ static void SanityCheckExistingBreakpoints(mirror::ArtMethod* m, bool need_full_
     EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_)  {
   if (kIsDebugBuild) {
     for (const Breakpoint& breakpoint : gBreakpoints) {
-      CHECK_EQ(need_full_deoptimization, breakpoint.need_full_deoptimization);
+      CHECK_EQ(need_full_deoptimization, breakpoint.NeedFullDeoptimization());
     }
     if (need_full_deoptimization) {
       // We should have deoptimized everything but not "selectively" deoptimized this method.
@@ -3080,18 +3079,18 @@ void Dbg::WatchLocation(const JDWP::JdwpLocation* location, DeoptimizationReques
     // inlined, we deoptimize everything; otherwise we deoptimize only this method.
     need_full_deoptimization = IsMethodPossiblyInlined(self, m);
     if (need_full_deoptimization) {
-      req->kind = DeoptimizationRequest::kFullDeoptimization;
-      req->method = nullptr;
+      req->SetKind(DeoptimizationRequest::kFullDeoptimization);
+      req->SetMethod(nullptr);
     } else {
-      req->kind = DeoptimizationRequest::kSelectiveDeoptimization;
-      req->method = m;
+      req->SetKind(DeoptimizationRequest::kSelectiveDeoptimization);
+      req->SetMethod(m);
     }
   } else {
     // There is at least one breakpoint for this method: we don't need to deoptimize.
-    req->kind = DeoptimizationRequest::kNothing;
-    req->method = nullptr;
+    req->SetKind(DeoptimizationRequest::kNothing);
+    req->SetMethod(nullptr);
 
-    need_full_deoptimization = existing_breakpoint->need_full_deoptimization;
+    need_full_deoptimization = existing_breakpoint->NeedFullDeoptimization();
     SanityCheckExistingBreakpoints(m, need_full_deoptimization);
   }
 
@@ -3103,15 +3102,14 @@ void Dbg::WatchLocation(const JDWP::JdwpLocation* location, DeoptimizationReques
 // Uninstalls a breakpoint at the specified location. Also indicates through the deoptimization
 // request if we need to undeoptimize.
 void Dbg::UnwatchLocation(const JDWP::JdwpLocation* location, DeoptimizationRequest* req) {
+  MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_);
   mirror::ArtMethod* m = FromMethodId(location->method_id);
   DCHECK(m != nullptr) << "No method for method id " << location->method_id;
-
-  MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_);
   bool need_full_deoptimization = false;
   for (size_t i = 0, e = gBreakpoints.size(); i < e; ++i) {
-    if (gBreakpoints[i].method == m && gBreakpoints[i].dex_pc == location->dex_pc) {
+    if (gBreakpoints[i].DexPc() == location->dex_pc && gBreakpoints[i].Method() == m) {
       VLOG(jdwp) << "Removed breakpoint #" << i << ": " << gBreakpoints[i];
-      need_full_deoptimization = gBreakpoints[i].need_full_deoptimization;
+      need_full_deoptimization = gBreakpoints[i].NeedFullDeoptimization();
       DCHECK_NE(need_full_deoptimization, Runtime::Current()->GetInstrumentation()->IsDeoptimized(m));
       gBreakpoints.erase(gBreakpoints.begin() + i);
       break;
@@ -3122,17 +3120,17 @@ void Dbg::UnwatchLocation(const JDWP::JdwpLocation* location, DeoptimizationRequ
     // There is no more breakpoint on this method: we need to undeoptimize.
     if (need_full_deoptimization) {
       // This method required full deoptimization: we need to undeoptimize everything.
-      req->kind = DeoptimizationRequest::kFullUndeoptimization;
-      req->method = nullptr;
+      req->SetKind(DeoptimizationRequest::kFullUndeoptimization);
+      req->SetMethod(nullptr);
     } else {
       // This method required selective deoptimization: we need to undeoptimize only that method.
-      req->kind = DeoptimizationRequest::kSelectiveUndeoptimization;
-      req->method = m;
+      req->SetKind(DeoptimizationRequest::kSelectiveUndeoptimization);
+      req->SetMethod(m);
     }
   } else {
     // There is at least one breakpoint for this method: we don't need to undeoptimize.
-    req->kind = DeoptimizationRequest::kNothing;
-    req->method = nullptr;
+    req->SetKind(DeoptimizationRequest::kNothing);
+    req->SetMethod(nullptr);
     SanityCheckExistingBreakpoints(m, need_full_deoptimization);
   }
 }
@@ -4590,4 +4588,14 @@ jbyteArray Dbg::GetRecentAllocations() {
   return result;
 }
 
+mirror::ArtMethod* DeoptimizationRequest::Method() const {
+  ScopedObjectAccessUnchecked soa(Thread::Current());
+  return soa.DecodeMethod(method_);
+}
+
+void DeoptimizationRequest::SetMethod(mirror::ArtMethod* m) {
+  ScopedObjectAccessUnchecked soa(Thread::Current());
+  method_ = soa.EncodeMethod(m);
+}
+
 }  // namespace art
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 2589638461..1d3668c1f6 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -131,7 +131,8 @@ struct SingleStepControl {
 };
 
 // TODO rename to InstrumentationRequest.
-struct DeoptimizationRequest {
+class DeoptimizationRequest {
+ public:
   enum Kind {
     kNothing,                   // no action.
     kRegisterForEvent,          // start listening for instrumentation event.
@@ -142,21 +143,48 @@ struct DeoptimizationRequest {
     kSelectiveUndeoptimization  // undeoptimize one method.
   };
 
-  DeoptimizationRequest() : kind(kNothing), instrumentation_event(0), method(nullptr) {}
+  DeoptimizationRequest() : kind_(kNothing), instrumentation_event_(0), method_(nullptr) {}
+
+  DeoptimizationRequest(const DeoptimizationRequest& other)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      : kind_(other.kind_), instrumentation_event_(other.instrumentation_event_) {
+    // Create a new JNI global reference for the method.
+    SetMethod(other.Method());
+  }
+
+  mirror::ArtMethod* Method() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  void SetMethod(mirror::ArtMethod* m) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void VisitRoots(RootCallback* callback, void* arg);
+  // Name 'Kind()' would collide with the above enum name.
+  Kind GetKind() const {
+    return kind_;
+  }
 
-  Kind kind;
+  void SetKind(Kind kind) {
+    kind_ = kind;
+  }
+
+  uint32_t InstrumentationEvent() const {
+    return instrumentation_event_;
+  }
+
+  void SetInstrumentationEvent(uint32_t instrumentation_event) {
+    instrumentation_event_ = instrumentation_event;
+  }
+
+ private:
+  Kind kind_;
 
   // TODO we could use a union to hold the instrumentation_event and the method since they
   // respectively have sense only for kRegisterForEvent/kUnregisterForEvent and
   // kSelectiveDeoptimization/kSelectiveUndeoptimization.
 
   // Event to start or stop listening to. Only for kRegisterForEvent and kUnregisterForEvent.
-  uint32_t instrumentation_event;
+  uint32_t instrumentation_event_;
 
   // Method for selective deoptimization.
-  mirror::ArtMethod* method;
+  jmethodID method_;
 };
 
 class Dbg {
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 61633cd489..a87aa890c8 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -61,13 +61,6 @@ static bool GenerateImage(const std::string& image_filename, std::string* error_
   image_option_string += image_filename;
   arg_vector.push_back(image_option_string);
 
-  arg_vector.push_back("--runtime-arg");
-  arg_vector.push_back("-Xms64m");
-
-  arg_vector.push_back("--runtime-arg");
-  arg_vector.push_back("-Xmx64m");
-
-
   for (size_t i = 0; i < boot_class_path.size(); i++) {
     arg_vector.push_back(std::string("--dex-file=") + boot_class_path[i]);
   }
@@ -348,6 +341,10 @@ bool ImageSpace::ValidateOatFile(std::string* error_msg) const {
   return true;
 }
 
+const OatFile* ImageSpace::GetOatFile() const {
+  return oat_file_.get();
+}
+
 OatFile* ImageSpace::ReleaseOatFile() {
   CHECK(oat_file_.get() != NULL);
   return oat_file_.release();
diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h
index 372db3a580..dd9b58084d 100644
--- a/runtime/gc/space/image_space.h
+++ b/runtime/gc/space/image_space.h
@@ -51,6 +51,9 @@ class ImageSpace : public MemMapSpace {
   static ImageHeader* ReadImageHeaderOrDie(const char* image_location,
                                            InstructionSet image_isa);
 
+  // Give access to the OatFile.
+  const OatFile* GetOatFile() const;
+
   // Releases the OatFile from the ImageSpace so it can be transfer to
   // the caller, presumably the ClassLinker.
   OatFile* ReleaseOatFile()
diff --git a/runtime/implicit_check_options.h b/runtime/implicit_check_options.h
new file mode 100644
index 0000000000..b9ff0ac5ab
--- /dev/null
+++ b/runtime/implicit_check_options.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_IMPLICIT_CHECK_OPTIONS_H_
+#define ART_RUNTIME_IMPLICIT_CHECK_OPTIONS_H_
+
+#include "gc/heap.h"
+#include "gc/space/image_space.h"
+#include "instruction_set.h"
+#include "runtime.h"
+
+#include <string>
+
+namespace art {
+
+// Encodes, decodes and selects the explicit-vs-implicit settings for null, stack overflow and
+// suspend checks. CheckForCompiling() reads the encoded form from the boot image's oat header.
+class ImplicitCheckOptions {
+ public:
+  static constexpr const char* kImplicitChecksOatHeaderKey = "implicit-checks";
+
+  // Encodes the three flags as the string "[Nn][Oo][Ss]": an upper-case letter means the
+  // corresponding flag is true, i.e. that check is explicit.
+  static std::string Serialize(bool explicit_null_checks, bool explicit_stack_overflow_checks,
+                               bool explicit_suspend_checks) {
+    char tmp[4];
+    tmp[0] = explicit_null_checks ? 'N' : 'n';
+    tmp[1] = explicit_stack_overflow_checks ? 'O' : 'o';
+    tmp[2] = explicit_suspend_checks ? 'S' : 's';
+    tmp[3] = 0;
+    return std::string(tmp);
+  }
+
+  // Decodes a string produced by Serialize() into the three out-parameters. Returns false, and
+  // leaves the out-parameters untouched, if str is null or does not start with "[Nn][Oo][Ss]".
+  static bool Parse(const char* str, bool* explicit_null_checks,
+                    bool* explicit_stack_overflow_checks, bool* explicit_suspend_checks) {
+    // Each letter test also rejects an early NUL, so no separate terminator checks are needed.
+    if (str == nullptr ||
+        (str[0] != 'n' && str[0] != 'N') ||
+        (str[1] != 'o' && str[1] != 'O') ||
+        (str[2] != 's' && str[2] != 'S')) {
+      return false;
+    }
+    *explicit_null_checks = str[0] == 'N';
+    *explicit_stack_overflow_checks = str[1] == 'O';
+    *explicit_suspend_checks = str[2] == 'S';
+    return true;
+  }
+
+  // Forces all three checks to explicit for every isa other than kArm and kThumb2.
+  static void Check(InstructionSet isa, bool* explicit_null_checks,
+                    bool* explicit_stack_overflow_checks, bool* explicit_suspend_checks) {
+    switch (isa) {
+      case kArm:
+      case kThumb2:
+        break;  // All checks implemented, leave as is.
+      default:  // No checks implemented, reset all to explicit checks.
+        *explicit_null_checks = true;
+        *explicit_stack_overflow_checks = true;
+        *explicit_suspend_checks = true;
+    }
+  }
+
+  // Picks the settings to compile with: the boot image oat header's values if the runtime has an
+  // image space with an oat file; otherwise the runtime's own settings, unless cross compiling.
+  // Returns false, writing nothing, if there is no runtime or in the cross-compiling case.
+  static bool CheckForCompiling(InstructionSet host, InstructionSet target,
+                                bool* explicit_null_checks, bool* explicit_stack_overflow_checks,
+                                bool* explicit_suspend_checks) {
+    Runtime* runtime = Runtime::Current();
+    if (runtime == nullptr) {
+      return false;  // Neither a boot image nor runtime settings are available.
+    }
+    // Check the boot image settings.
+    gc::space::ImageSpace* ispace = runtime->GetHeap()->GetImageSpace();
+    if (ispace != nullptr) {
+      const OatFile* oat_file = ispace->GetOatFile();
+      if (oat_file != nullptr) {
+        const char* v = oat_file->GetOatHeader().GetStoreValueByKey(kImplicitChecksOatHeaderKey);
+        if (!Parse(v, explicit_null_checks, explicit_stack_overflow_checks,
+                   explicit_suspend_checks)) {
+          LOG(FATAL) << "Should have been able to parse boot image implicit check values";
+        }
+        return true;
+      }
+    }
+
+    // Check the current runtime. kArm and kThumb2 are treated as the same target.
+    const bool host_is_arm = host == kArm || host == kThumb2;
+    const bool target_is_arm = target == kArm || target == kThumb2;
+    const bool cross_compiling = host_is_arm ? !target_is_arm : host != target;
+    if (cross_compiling) {
+      return false;  // Give up.
+    }
+    *explicit_null_checks = runtime->ExplicitNullChecks();
+    *explicit_stack_overflow_checks = runtime->ExplicitStackOverflowChecks();
+    *explicit_suspend_checks = runtime->ExplicitSuspendChecks();
+    return true;
+  }
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_IMPLICIT_CHECK_OPTIONS_H_
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 86c84e8b0f..36fbed4ea2 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -192,17 +192,17 @@ JdwpError JdwpState::RegisterEvent(JdwpEvent* pEvent) {
       }
     }
     if (NeedsFullDeoptimization(pEvent->eventKind)) {
-      CHECK_EQ(req.kind, DeoptimizationRequest::kNothing);
-      CHECK(req.method == nullptr);
-      req.kind = DeoptimizationRequest::kFullDeoptimization;
+      CHECK_EQ(req.GetKind(), DeoptimizationRequest::kNothing);
+      CHECK(req.Method() == nullptr);
+      req.SetKind(DeoptimizationRequest::kFullDeoptimization);
     }
     Dbg::RequestDeoptimization(req);
   }
   uint32_t instrumentation_event = GetInstrumentationEventFor(pEvent->eventKind);
   if (instrumentation_event != 0) {
     DeoptimizationRequest req;
-    req.kind = DeoptimizationRequest::kRegisterForEvent;
-    req.instrumentation_event = instrumentation_event;
+    req.SetKind(DeoptimizationRequest::kRegisterForEvent);
+    req.SetInstrumentationEvent(instrumentation_event);
     Dbg::RequestDeoptimization(req);
   }
 
@@ -274,17 +274,17 @@ void JdwpState::UnregisterEvent(JdwpEvent* pEvent) {
       // deoptimization and only the last single-step will trigger a full undeoptimization.
       Dbg::DelayFullUndeoptimization();
     } else if (NeedsFullDeoptimization(pEvent->eventKind)) {
-      CHECK_EQ(req.kind, DeoptimizationRequest::kNothing);
-      CHECK(req.method == nullptr);
-      req.kind = DeoptimizationRequest::kFullUndeoptimization;
+      CHECK_EQ(req.GetKind(), DeoptimizationRequest::kNothing);
+      CHECK(req.Method() == nullptr);
+      req.SetKind(DeoptimizationRequest::kFullUndeoptimization);
     }
     Dbg::RequestDeoptimization(req);
   }
   uint32_t instrumentation_event = GetInstrumentationEventFor(pEvent->eventKind);
   if (instrumentation_event != 0) {
     DeoptimizationRequest req;
-    req.kind = DeoptimizationRequest::kUnregisterForEvent;
-    req.instrumentation_event = instrumentation_event;
+    req.SetKind(DeoptimizationRequest::kUnregisterForEvent);
+    req.SetInstrumentationEvent(instrumentation_event);
     Dbg::RequestDeoptimization(req);
   }
 
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 999a9e504b..eb62a694e0 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -401,8 +401,8 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns,
 
   // Make sure that we hold the lock.
   if (owner_ != self) {
-    ThrowIllegalMonitorStateExceptionF("object not locked by thread before wait()");
     monitor_lock_.Unlock(self);
+    ThrowIllegalMonitorStateExceptionF("object not locked by thread before wait()");
     return;
   }
 
@@ -414,10 +414,10 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns,
 
   // Enforce the timeout range.
   if (ms < 0 || ns < 0 || ns > 999999) {
+    monitor_lock_.Unlock(self);
     ThrowLocation throw_location = self->GetCurrentLocationForThrow();
     self->ThrowNewExceptionF(throw_location, "Ljava/lang/IllegalArgumentException;",
                              "timeout arguments out of range: ms=%" PRId64 " ns=%d", ms, ns);
-    monitor_lock_.Unlock(self);
     return;
   }
 
@@ -512,6 +512,8 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns,
   --num_waiters_;
   RemoveFromWaitSet(self);
 
+  monitor_lock_.Unlock(self);
+
   if (was_interrupted) {
     /*
      * We were interrupted while waiting, or somebody interrupted an
@@ -529,7 +531,6 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns,
       self->ThrowNewException(throw_location, "Ljava/lang/InterruptedException;", NULL);
     }
   }
-  monitor_lock_.Unlock(self);
 }
 
 void Monitor::Notify(Thread* self) {
diff --git a/runtime/monitor_test.cc b/runtime/monitor_test.cc
new file mode 100644
index 0000000000..bdba494e14
--- /dev/null
+++ b/runtime/monitor_test.cc
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "barrier.h"
+#include "monitor.h"
+
+#include <string>
+
+#include "atomic.h"
+#include "common_runtime_test.h"
+#include "handle_scope-inl.h"
+#include "mirror/class-inl.h"
+#include "mirror/string-inl.h"  // Strings are easiest to allocate
+#include "thread_pool.h"
+#include "utils.h"
+
+namespace art {
+
+class MonitorTest : public CommonRuntimeTest {
+ protected:
+  void SetUpRuntimeOptions(Runtime::Options *options) OVERRIDE {
+    // Use a smaller heap
+    for (std::pair<std::string, const void*>& pair : *options) {
+      if (pair.first.find("-Xmx") == 0) {
+        pair.first = "-Xmx4M";  // Smallest we can go.
+      }
+    }
+    options->push_back(std::make_pair("-Xint", nullptr));
+  }
+ public:
+  std::unique_ptr<Monitor> monitor_;
+  Handle<mirror::String> object_;
+  Handle<mirror::String> second_object_;
+  Handle<mirror::String> watchdog_object_;
+  // One exception test waits on another Thread's lock. This field hands that Thread over to
+  // the other task without races or polling loops.
+  Thread* thread_;
+  std::unique_ptr<Barrier> barrier_;
+  std::unique_ptr<Barrier> complete_barrier_;
+  bool completed_;
+};
+
+// Fill the heap.
+static const size_t kMaxHandles = 1000000;  // Use arbitrary large amount for now.
+static void FillHeap(Thread* self, ClassLinker* class_linker,
+                     std::unique_ptr<StackHandleScope<kMaxHandles>>* hsp,
+                     std::vector<Handle<mirror::Object>>* handles)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  Runtime::Current()->GetHeap()->SetIdealFootprint(1 * GB);
+
+  hsp->reset(new StackHandleScope<kMaxHandles>(self));
+  // Class java.lang.Object.
+  Handle<mirror::Class> c((*hsp)->NewHandle(class_linker->FindSystemClass(self,
+                                                                       "Ljava/lang/Object;")));
+  // Array helps to fill memory faster.
+  Handle<mirror::Class> ca((*hsp)->NewHandle(class_linker->FindSystemClass(self,
+                                                                        "[Ljava/lang/Object;")));
+
+  // Start allocating with 128K
+  size_t length = 128 * KB / 4;
+  while (length > 10) {
+    Handle<mirror::Object> h((*hsp)->NewHandle<mirror::Object>(
+        mirror::ObjectArray<mirror::Object>::Alloc(self, ca.Get(), length / 4)));
+    if (self->IsExceptionPending() || h.Get() == nullptr) {
+      self->ClearException();
+
+      // Try a smaller length
+      length = length / 8;
+      // Cap the next attempt relative to the reported free space: length <= mem / 8.
+      size_t mem = Runtime::Current()->GetHeap()->GetFreeMemory();
+      if (length * 8 > mem) {
+        length = mem / 8;
+      }
+    } else {
+      handles->push_back(h);
+    }
+  }
+
+  // Allocate simple objects till it fails.
+  while (!self->IsExceptionPending()) {
+    Handle<mirror::Object> h = (*hsp)->NewHandle<mirror::Object>(c->AllocObject(self));
+    if (!self->IsExceptionPending() && h.Get() != nullptr) {
+      handles->push_back(h);
+    }
+  }
+  self->ClearException();
+}
+
+// Check that an exception can be thrown correctly.
+// This test is potentially racy, but the timeout is long enough that it should work.
+
+class CreateTask : public Task {
+ public:
+  explicit CreateTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis,
+                      bool expected) :
+      monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis),
+      expected_(expected) {}
+
+  void Run(Thread* self) {
+    {
+      ScopedObjectAccess soa(self);
+
+      monitor_test_->thread_ = self;        // Pass the Thread.
+      monitor_test_->object_.Get()->MonitorEnter(self);     // Lock the object. This should transition
+      LockWord lock_after = monitor_test_->object_.Get()->GetLockWord(false);     // it to thinLocked.
+      LockWord::LockState new_state = lock_after.GetState();
+
+      // Cannot use ASSERT only, as analysis thinks we'll keep holding the mutex.
+      if (LockWord::LockState::kThinLocked != new_state) {
+        monitor_test_->object_.Get()->MonitorExit(self);         // To appease analysis.
+        ASSERT_EQ(LockWord::LockState::kThinLocked, new_state);  // To fail the test.
+        return;
+      }
+
+      // Force a fat lock by running identity hashcode to fill up lock word.
+      monitor_test_->object_.Get()->IdentityHashCode();
+      LockWord lock_after2 = monitor_test_->object_.Get()->GetLockWord(false);
+      LockWord::LockState new_state2 = lock_after2.GetState();
+
+      // Cannot use ASSERT only, as analysis thinks we'll keep holding the mutex.
+      if (LockWord::LockState::kFatLocked != new_state2) {
+        monitor_test_->object_.Get()->MonitorExit(self);         // To appease analysis.
+        ASSERT_EQ(LockWord::LockState::kFatLocked, new_state2);  // To fail the test.
+        return;
+      }
+    }  // Need to drop the mutator lock to use the barrier.
+
+    monitor_test_->barrier_->Wait(self);           // Let the other thread know we're done.
+
+    {
+      ScopedObjectAccess soa(self);
+
+      // Give the other task a chance to do its thing.
+      NanoSleep(initial_sleep_ * 1000 * 1000);
+
+      // Now try to Wait on the Monitor.
+      Monitor::Wait(self, monitor_test_->object_.Get(), millis_, 0, true,
+                    ThreadState::kTimedWaiting);
+
+      // Check the exception status against what we expect.
+      EXPECT_EQ(expected_, self->IsExceptionPending());
+      if (expected_) {
+        self->ClearException();
+      }
+    }
+
+    monitor_test_->complete_barrier_->Wait(self);  // Wait for test completion.
+
+    {
+      ScopedObjectAccess soa(self);
+      monitor_test_->object_.Get()->MonitorExit(self);  // Release the object. Appeases analysis.
+    }
+  }
+
+  void Finalize() {
+    delete this;
+  }
+
+ private:
+  MonitorTest* monitor_test_;
+  uint64_t initial_sleep_;
+  int64_t millis_;
+  bool expected_;
+};
+
+
+class UseTask : public Task {
+ public:
+  UseTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis, bool expected) :
+      monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis),
+      expected_(expected) {}
+
+  void Run(Thread* self) {
+    monitor_test_->barrier_->Wait(self);  // Wait for the other thread to set up the monitor.
+
+    {
+      ScopedObjectAccess soa(self);
+
+      // Give the other task a chance to do its thing.
+      NanoSleep(initial_sleep_ * 1000 * 1000);
+
+      Monitor::Wait(self, monitor_test_->object_.Get(), millis_, 0, true,
+                    ThreadState::kTimedWaiting);
+
+      // Check the exception status against what we expect.
+      EXPECT_EQ(expected_, self->IsExceptionPending());
+      if (expected_) {
+        self->ClearException();
+      }
+    }
+
+    monitor_test_->complete_barrier_->Wait(self);  // Wait for test completion.
+  }
+
+  void Finalize() {
+    delete this;
+  }
+
+ private:
+  MonitorTest* monitor_test_;
+  uint64_t initial_sleep_;
+  int64_t millis_;
+  bool expected_;
+};
+
+class InterruptTask : public Task {
+ public:
+  InterruptTask(MonitorTest* monitor_test, uint64_t initial_sleep, uint64_t millis) :
+      monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis) {}
+
+  void Run(Thread* self) {
+    monitor_test_->barrier_->Wait(self);  // Wait for the other thread to set up the monitor.
+
+    {
+      ScopedObjectAccess soa(self);
+
+      // Give the other task a chance to do its thing.
+      NanoSleep(initial_sleep_ * 1000 * 1000);
+
+      // Interrupt the other thread.
+      monitor_test_->thread_->Interrupt(self);
+
+      // Give it some more time to get to the exception code.
+      NanoSleep(millis_ * 1000 * 1000);
+
+      // Now try to Wait.
+      Monitor::Wait(self, monitor_test_->object_.Get(), 10, 0, true,
+                    ThreadState::kTimedWaiting);
+
+      // No check here, as depending on scheduling we may or may not fail.
+      if (self->IsExceptionPending()) {
+        self->ClearException();
+      }
+    }
+
+    monitor_test_->complete_barrier_->Wait(self);  // Wait for test completion.
+  }
+
+  void Finalize() {
+    delete this;
+  }
+
+ private:
+  MonitorTest* monitor_test_;
+  uint64_t initial_sleep_;
+  uint64_t millis_;
+};
+
+class WatchdogTask : public Task {
+ public:
+  explicit WatchdogTask(MonitorTest* monitor_test) : monitor_test_(monitor_test) {}
+
+  void Run(Thread* self) {
+    ScopedObjectAccess soa(self);
+
+    monitor_test_->watchdog_object_.Get()->MonitorEnter(self);        // Lock the object.
+
+    monitor_test_->watchdog_object_.Get()->Wait(self, 30 * 1000, 0);  // Wait for 30s, or being
+                                                                      // woken up.
+
+    monitor_test_->watchdog_object_.Get()->MonitorExit(self);         // Release the lock.
+
+    if (!monitor_test_->completed_) {
+      LOG(FATAL) << "Watchdog timeout!";
+    }
+  }
+
+  void Finalize() {
+    delete this;
+  }
+
+ private:
+  MonitorTest* monitor_test_;
+};
+
+static void CommonWaitSetup(MonitorTest* test, ClassLinker* class_linker, uint64_t create_sleep,
+                            int64_t c_millis, bool c_expected, bool interrupt, uint64_t use_sleep,
+                            int64_t u_millis, bool u_expected, const char* pool_name) {
+  // First create the object we lock. String is easiest.
+  StackHandleScope<3> hs(Thread::Current());
+  {
+    ScopedObjectAccess soa(Thread::Current());
+    test->object_ = hs.NewHandle(mirror::String::AllocFromModifiedUtf8(Thread::Current(),
+                                                                       "hello, world!"));
+    test->watchdog_object_ = hs.NewHandle(mirror::String::AllocFromModifiedUtf8(Thread::Current(),
+                                                                                "hello, world!"));
+  }
+
+  // Create the barrier used to synchronize.
+  test->barrier_ = std::unique_ptr<Barrier>(new Barrier(2));
+  test->complete_barrier_ = std::unique_ptr<Barrier>(new Barrier(3));
+  test->completed_ = false;
+
+  // Fill the heap.
+  std::unique_ptr<StackHandleScope<kMaxHandles>> hsp;
+  std::vector<Handle<mirror::Object>> handles;
+  {
+    Thread* self = Thread::Current();
+    ScopedObjectAccess soa(self);
+
+    // Our job: Fill the heap, then try Wait.
+    FillHeap(self, class_linker, &hsp, &handles);
+
+    // Now release everything.
+    auto it = handles.begin();
+    auto end = handles.end();
+
+    for ( ; it != end; ++it) {
+      it->Assign(nullptr);
+    }
+  }  // Need to drop the mutator lock to allow barriers.
+
+  Thread* self = Thread::Current();
+  ThreadPool thread_pool(pool_name, 3);
+  thread_pool.AddTask(self, new CreateTask(test, create_sleep, c_millis, c_expected));
+  if (interrupt) {
+    thread_pool.AddTask(self, new InterruptTask(test, use_sleep, static_cast<uint64_t>(u_millis)));
+  } else {
+    thread_pool.AddTask(self, new UseTask(test, use_sleep, u_millis, u_expected));
+  }
+  thread_pool.AddTask(self, new WatchdogTask(test));
+  thread_pool.StartWorkers(self);
+
+  // Wait on completion barrier.
+  test->complete_barrier_->Wait(Thread::Current());
+  test->completed_ = true;
+
+  // Wake the watchdog.
+  {
+    Thread* self = Thread::Current();
+    ScopedObjectAccess soa(self);
+
+    test->watchdog_object_.Get()->MonitorEnter(self);     // Lock the object.
+    test->watchdog_object_.Get()->NotifyAll(self);        // Wake up waiting parties.
+    test->watchdog_object_.Get()->MonitorExit(self);      // Release the lock.
+  }
+
+  thread_pool.StopWorkers(self);
+}
+
+
+// First test: throwing an exception when trying to wait in Monitor with another thread.
+TEST_F(MonitorTest, CheckExceptionsWait1) {
+  // Make the CreateTask wait 10ms, the UseTask wait 2ms.
+  // => The use task will get the lock first and get to self == owner check.
+  CommonWaitSetup(this, class_linker_, 10, 50, false, false, 2, 50, true,
+                  "Monitor test thread pool 1");
+}
+
+// Second test: throwing an exception for invalid wait time.
+TEST_F(MonitorTest, CheckExceptionsWait2) {
+  // Make the CreateTask wait 0ms, the UseTask wait 10ms.
+  // => The create task will get the lock first and get to ms >= 0
+  CommonWaitSetup(this, class_linker_, 0, -1, true, false, 10, 50, true,
+                  "Monitor test thread pool 2");
+}
+
+// Third test: throwing an interrupted-exception.
+TEST_F(MonitorTest, CheckExceptionsWait3) {
+  // Make the CreateTask wait 0ms, then Wait for a long time. Make the InterruptTask wait 10ms,
+  // after which it will interrupt the create task and then wait another 50ms.
+  // => The create task will get to the interrupted-exception throw.
+  CommonWaitSetup(this, class_linker_, 0, 500, true, true, 10, 50, true,
+                  "Monitor test thread pool 3");
+}
+
+}  // namespace art
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 857c0a24e2..1421baffcf 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -17,15 +17,46 @@
 #include "oat.h"
 #include "utils.h"
 
+#include <string.h>
 #include <zlib.h>
 
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '3', '6', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '3', '7', '\0' };
+
+// Bytes needed for an OatHeader plus every key and value string with its terminating zero.
+static size_t ComputeOatHeaderSize(const SafeMap<std::string, std::string>* variable_data) {
+  size_t total = sizeof(OatHeader);
+  if (variable_data == nullptr) {
+    return total;
+  }
+  for (const auto& entry : *variable_data) {
+    total += entry.first.length() + 1;
+    total += entry.second.length() + 1;
+  }
+  return total;
+}
+
+OatHeader* OatHeader::Create(InstructionSet instruction_set,
+                             const InstructionSetFeatures& instruction_set_features,
+                             const std::vector<const DexFile*>* dex_files,
+                             uint32_t image_file_location_oat_checksum,
+                             uint32_t image_file_location_oat_data_begin,
+                             const SafeMap<std::string, std::string>* variable_data) {
+  // Estimate size of optional data.
+  size_t needed_size = ComputeOatHeaderSize(variable_data);
+
+  // Reserve enough memory.
+  void* memory = operator new (needed_size);
 
-OatHeader::OatHeader() {
-  memset(this, 0, sizeof(*this));
+  // Create the OatHeader in-place.
+  return new (memory) OatHeader(instruction_set,
+                                instruction_set_features,
+                                dex_files,
+                                image_file_location_oat_checksum,
+                                image_file_location_oat_data_begin,
+                                variable_data);
 }
 
 OatHeader::OatHeader(InstructionSet instruction_set,
@@ -33,7 +64,7 @@ OatHeader::OatHeader(InstructionSet instruction_set,
                      const std::vector<const DexFile*>* dex_files,
                      uint32_t image_file_location_oat_checksum,
                      uint32_t image_file_location_oat_data_begin,
-                     const std::string& image_file_location) {
+                     const SafeMap<std::string, std::string>* variable_data) {
   memcpy(magic_, kOatMagic, sizeof(kOatMagic));
   memcpy(version_, kOatVersion, sizeof(kOatVersion));
 
@@ -56,9 +87,16 @@ OatHeader::OatHeader(InstructionSet instruction_set,
   image_file_location_oat_data_begin_ = image_file_location_oat_data_begin;
   UpdateChecksum(&image_file_location_oat_data_begin_, sizeof(image_file_location_oat_data_begin_));
 
-  image_file_location_size_ = image_file_location.size();
-  UpdateChecksum(&image_file_location_size_, sizeof(image_file_location_size_));
-  UpdateChecksum(image_file_location.data(), image_file_location_size_);
+  // Flatten the map. Will also update key_value_store_size_.
+  Flatten(variable_data);
+
+  // Update checksum for variable data size.
+  UpdateChecksum(&key_value_store_size_, sizeof(key_value_store_size_));
+
+  // Update for data, if existing.
+  if (key_value_store_size_ > 0U) {
+    UpdateChecksum(&key_value_store_, key_value_store_size_);
+  }
 
   executable_offset_ = 0;
   interpreter_to_interpreter_bridge_offset_ = 0;
@@ -327,20 +365,97 @@ uint32_t OatHeader::GetImageFileLocationOatDataBegin() const {
   return image_file_location_oat_data_begin_;
 }
 
-uint32_t OatHeader::GetImageFileLocationSize() const {
+uint32_t OatHeader::GetKeyValueStoreSize() const {
   CHECK(IsValid());
-  return image_file_location_size_;
+  return key_value_store_size_;
 }
 
-const uint8_t* OatHeader::GetImageFileLocationData() const {
+const uint8_t* OatHeader::GetKeyValueStore() const {
   CHECK(IsValid());
-  return image_file_location_data_;
+  return key_value_store_;
 }
 
-std::string OatHeader::GetImageFileLocation() const {
-  CHECK(IsValid());
-  return std::string(reinterpret_cast<const char*>(GetImageFileLocationData()),
-                     GetImageFileLocationSize());
+// Returns the first position in [start, end) that holds \0, or end if there is none.
+static const char* ParseString(const char* start, const char* end) {
+  const char* cursor = start;
+  for ( ; cursor < end && *cursor != 0; ++cursor) {
+  }
+  return cursor;
+}
+
+const char* OatHeader::GetStoreValueByKey(const char* key) const {
+  // Returns the value stored for |key| in the flattened "key\0value\0..." store, or nullptr if
+  // the key is absent or an entry is truncated (no terminating zero before the store's end).
+  const char* ptr = reinterpret_cast<const char*>(&key_value_store_);
+  const char* end = ptr + key_value_store_size_;
+  while (ptr < end) {
+    // Scan for a closing zero.
+    const char* str_end = ParseString(ptr, end);
+    if (str_end >= end) {
+      break;
+    }
+    const char* value = str_end + 1;
+    const char* value_end = ParseString(value, end);
+    if (strcmp(key, ptr) == 0) {
+      // Same as key. Always return here: ptr is not advanced on a match, so looping again
+      // with an unterminated value would never terminate.
+      return (value_end < end) ? value : nullptr;
+    }
+    // Different from key. Advance over the value.
+    ptr = value_end + 1;
+  }
+  // Not found.
+  return nullptr;
+}
+
+bool OatHeader::GetStoreKeyValuePairByIndex(size_t index, const char** key,
+                                            const char** value) const {
+  const char* ptr = reinterpret_cast<const char*>(&key_value_store_);
+  const char* end = ptr + key_value_store_size_;
+  ssize_t counter = static_cast<ssize_t>(index);  // Complete pairs still to skip.
+  // On success *key and *value point at pair number |index| (zero-based) inside the store.
+  while (ptr < end && counter >= 0) {
+    // Scan for the zero closing the key.
+    const char* str_end = ParseString(ptr, end);
+    if (str_end < end) {
+      const char* maybe_key = ptr;
+      ptr = ParseString(str_end + 1, end) + 1;  // One past the value's closing zero, if any.
+      if (ptr <= end) {  // The value was closed inside the store.
+        if (counter == 0) {
+          *key = maybe_key;
+          *value = str_end + 1;
+          return true;
+        } else {
+          counter--;
+        }
+      } else {
+        return false;  // The value has no closing zero before end.
+      }
+    } else {
+      break;  // The key has no closing zero before end.
+    }
+  }
+  // Not found.
+  return false;
+}
+
+size_t OatHeader::GetHeaderSize() const {
+  return sizeof(OatHeader) + key_value_store_size_;
+}
+
+// Writes every pair as "key\0value\0" into key_value_store_ and records the bytes written.
+void OatHeader::Flatten(const SafeMap<std::string, std::string>* key_value_store) {
+  char* const store_begin = reinterpret_cast<char*>(&key_value_store_);
+  char* cursor = store_begin;
+  if (key_value_store != nullptr) {
+    for (const auto& entry : *key_value_store) {
+      strcpy(cursor, entry.first.c_str());
+      cursor += entry.first.length() + 1;
+      strcpy(cursor, entry.second.c_str());
+      cursor += entry.second.length() + 1;
+    }
+  }
+  key_value_store_size_ = cursor - store_begin;
+}
 
 OatMethodOffsets::OatMethodOffsets()
diff --git a/runtime/oat.h b/runtime/oat.h
index 7be768c5cb..fbed596d33 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -23,6 +23,7 @@
 #include "dex_file.h"
 #include "instruction_set.h"
 #include "quick/quick_method_frame_info.h"
+#include "safe_map.h"
 
 namespace art {
 
@@ -31,13 +32,16 @@ class PACKED(4) OatHeader {
   static const uint8_t kOatMagic[4];
   static const uint8_t kOatVersion[4];
 
-  OatHeader();
-  OatHeader(InstructionSet instruction_set,
-            const InstructionSetFeatures& instruction_set_features,
-            const std::vector<const DexFile*>* dex_files,
-            uint32_t image_file_location_oat_checksum,
-            uint32_t image_file_location_oat_data_begin,
-            const std::string& image_file_location);
+  static constexpr const char* kImageLocationKey = "image-location";
+  static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
+  static constexpr const char* kDex2OatHostKey = "dex2oat-host";
+
+  static OatHeader* Create(InstructionSet instruction_set,
+                           const InstructionSetFeatures& instruction_set_features,
+                           const std::vector<const DexFile*>* dex_files,
+                           uint32_t image_file_location_oat_checksum,
+                           uint32_t image_file_location_oat_data_begin,
+                           const SafeMap<std::string, std::string>* variable_data);
 
   bool IsValid() const;
   const char* GetMagic() const;
@@ -88,11 +92,24 @@ class PACKED(4) OatHeader {
   const InstructionSetFeatures& GetInstructionSetFeatures() const;
   uint32_t GetImageFileLocationOatChecksum() const;
   uint32_t GetImageFileLocationOatDataBegin() const;
-  uint32_t GetImageFileLocationSize() const;
-  const uint8_t* GetImageFileLocationData() const;
-  std::string GetImageFileLocation() const;
+
+  uint32_t GetKeyValueStoreSize() const;
+  const uint8_t* GetKeyValueStore() const;
+  const char* GetStoreValueByKey(const char* key) const;
+  bool GetStoreKeyValuePairByIndex(size_t index, const char** key, const char** value) const;
+
+  size_t GetHeaderSize() const;
 
  private:
+  OatHeader(InstructionSet instruction_set,
+            const InstructionSetFeatures& instruction_set_features,
+            const std::vector<const DexFile*>* dex_files,
+            uint32_t image_file_location_oat_checksum,
+            uint32_t image_file_location_oat_data_begin,
+            const SafeMap<std::string, std::string>* variable_data);
+
+  void Flatten(const SafeMap<std::string, std::string>* variable_data);
+
   uint8_t magic_[4];
   uint8_t version_[4];
   uint32_t adler32_checksum_;
@@ -114,8 +131,9 @@ class PACKED(4) OatHeader {
 
   uint32_t image_file_location_oat_checksum_;
   uint32_t image_file_location_oat_data_begin_;
-  uint32_t image_file_location_size_;
-  uint8_t image_file_location_data_[0];  // note variable width data at end
+
+  uint32_t key_value_store_size_;
+  uint8_t key_value_store_[0];  // note variable width data at end
 
   DISALLOW_COPY_AND_ASSIGN(OatHeader);
 };
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index 6c44aa91bd..bae1632fa2 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -17,17 +17,20 @@
 #include "oat_file.h"
 
 #include <dlfcn.h>
+#include <sstream>
 
 #include "base/bit_vector.h"
 #include "base/stl_util.h"
 #include "base/unix_file/fd_file.h"
 #include "elf_file.h"
+#include "implicit_check_options.h"
 #include "oat.h"
 #include "mirror/art_method.h"
 #include "mirror/art_method-inl.h"
 #include "mirror/class.h"
 #include "mirror/object-inl.h"
 #include "os.h"
+#include "runtime.h"
 #include "utils.h"
 #include "vmap_table.h"
 
@@ -55,28 +58,71 @@ OatFile* OatFile::Open(const std::string& filename,
                        std::string* error_msg) {
   CHECK(!filename.empty()) << location;
   CheckLocation(filename);
-  if (kUsePortableCompiler) {
+  std::unique_ptr<OatFile> ret;
+  if (kUsePortableCompiler && executable) {
     // If we are using PORTABLE, use dlopen to deal with relocations.
     //
     // We use our own ELF loader for Quick to deal with legacy apps that
     // open a generated dex file by name, remove the file, then open
     // another generated dex file with the same name. http://b/10614658
-    if (executable) {
-      return OpenDlopen(filename, location, requested_base, error_msg);
+    ret.reset(OpenDlopen(filename, location, requested_base, error_msg));
+  } else {
+    // If we aren't trying to execute, we just use our own ElfFile loader for a couple reasons:
+    //
+    // On target, dlopen may fail when compiling due to selinux restrictions on installd.
+    //
+    // On host, dlopen is expected to fail when cross compiling, so fall back to OpenElfFile.
+    // This won't work for portable runtime execution because it doesn't process relocations.
+    std::unique_ptr<File> file(OS::OpenFileForReading(filename.c_str()));
+    if (file.get() == NULL) {
+      *error_msg = StringPrintf("Failed to open oat filename for reading: %s", strerror(errno));
+      return nullptr;
     }
+    ret.reset(OpenElfFile(file.get(), location, requested_base, false, executable, error_msg));
+  }
+
+  if (ret.get() == nullptr) {
+    return nullptr;
+  }
+
+  // Embedded options check. Right now only implicit checks.
+  // TODO: Refactor to somewhere else?
+  const char* implicit_checks_value = ret->GetOatHeader().
+      GetStoreValueByKey(ImplicitCheckOptions::kImplicitChecksOatHeaderKey);
+
+  if (implicit_checks_value == nullptr) {
+    *error_msg = "Did not find implicit checks value.";
+    return nullptr;
   }
-  // If we aren't trying to execute, we just use our own ElfFile loader for a couple reasons:
-  //
-  // On target, dlopen may fail when compiling due to selinux restrictions on installd.
-  //
-  // On host, dlopen is expected to fail when cross compiling, so fall back to OpenElfFile.
-  // This won't work for portable runtime execution because it doesn't process relocations.
-  std::unique_ptr<File> file(OS::OpenFileForReading(filename.c_str()));
-  if (file.get() == NULL) {
-    *error_msg = StringPrintf("Failed to open oat filename for reading: %s", strerror(errno));
-    return NULL;
+
+  bool explicit_null_checks, explicit_so_checks, explicit_suspend_checks;
+  if (ImplicitCheckOptions::Parse(implicit_checks_value, &explicit_null_checks,
+                                  &explicit_so_checks, &explicit_suspend_checks)) {
+    if (!executable) {
+      // Not meant to be run, i.e., either we are compiling or dumping. Just accept.
+      return ret.release();
+    }
+
+    Runtime* runtime = Runtime::Current();
+    // We really should have a runtime.
+    DCHECK_NE(static_cast<Runtime*>(nullptr), runtime);
+
+    if (runtime->ExplicitNullChecks() != explicit_null_checks ||
+        runtime->ExplicitStackOverflowChecks() != explicit_so_checks ||
+        runtime->ExplicitSuspendChecks() != explicit_suspend_checks) {
+      std::ostringstream os;
+      os << "Explicit check options do not match runtime: " << implicit_checks_value << " -> ";
+      os << runtime->ExplicitNullChecks() << " vs " << explicit_null_checks << " | ";
+      os << runtime->ExplicitStackOverflowChecks() << " vs " << explicit_so_checks << " | ";
+      os << runtime->ExplicitSuspendChecks() << " vs " << explicit_suspend_checks;
+      *error_msg = os.str();
+      return nullptr;
+    }
+    return ret.release();
+  } else {
+    *error_msg = "Failed parsing implicit check options.";
+    return nullptr;
   }
-  return OpenElfFile(file.get(), location, requested_base, false, executable, error_msg);
 }
 
 OatFile* OatFile::OpenWritable(File* file, const std::string& location, std::string* error_msg) {
@@ -206,11 +252,11 @@ bool OatFile::Setup(std::string* error_msg) {
     return false;
   }
 
-  oat += GetOatHeader().GetImageFileLocationSize();
+  oat += GetOatHeader().GetKeyValueStoreSize();
   if (oat > End()) {
-    *error_msg = StringPrintf("In oat file '%s' found truncated image file location: "
+    *error_msg = StringPrintf("In oat file '%s' found truncated variable-size data: "
                               "%p + %zd + %ud <= %p", GetLocation().c_str(),
-                              Begin(), sizeof(OatHeader), GetOatHeader().GetImageFileLocationSize(),
+                              Begin(), sizeof(OatHeader), GetOatHeader().GetKeyValueStoreSize(),
                               End());
     return false;
   }
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 53ddcca469..3b14aaa767 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -930,7 +930,6 @@ void Runtime::VisitConstantRoots(RootCallback* callback, void* arg) {
 void Runtime::VisitConcurrentRoots(RootCallback* callback, void* arg, VisitRootFlags flags) {
   intern_table_->VisitRoots(callback, arg, flags);
   class_linker_->VisitRoots(callback, arg, flags);
-  Dbg::VisitRoots(callback, arg);
   if ((flags & kVisitRootFlagNewRoots) == 0) {
     // Guaranteed to have no new roots in the constant roots.
     VisitConstantRoots(callback, arg);
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 89cfcdd1de..eabb993879 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -46,7 +46,7 @@
 namespace art {
 namespace verifier {
 
-static const bool gDebugVerify = false;
+static constexpr bool gDebugVerify = false;
 // TODO: Add a constant to method_verifier to turn on verbose logging?
 
 void PcToRegisterLineTable::Init(RegisterTrackingMode mode, InstructionFlags* flags,
@@ -1329,8 +1329,7 @@ bool MethodVerifier::CodeFlowVerifyMethod() {
     work_insn_idx_ = insn_idx;
     if (insn_flags_[insn_idx].IsBranchTarget()) {
       work_line_->CopyFromLine(reg_table_.GetLine(insn_idx));
-    } else {
-#ifndef NDEBUG
+    } else if (kIsDebugBuild) {
       /*
        * Sanity check: retrieve the stored register line (assuming
        * a full table) and make sure it actually matches.
@@ -1346,7 +1345,6 @@ bool MethodVerifier::CodeFlowVerifyMethod() {
                      << "  expected=" << *register_line;
         }
       }
-#endif
     }
     if (!CodeFlowVerifyInstruction(&start_guess)) {
       std::string prepend(PrettyMethod(dex_method_idx_, *dex_file_));
@@ -1958,14 +1956,24 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) {
           (Instruction::INSTANCE_OF == instance_of_inst->Opcode()) &&
           (inst->VRegA_21t() == instance_of_inst->VRegA_22c()) &&
           (instance_of_inst->VRegA_22c() != instance_of_inst->VRegB_22c())) {
-        // Check that the we are not attempting conversion to interface types,
-        // which is not done because of the multiple inheritance implications.
-        // Also don't change the type if it would result in an upcast.
+        // Check the type of the instance-of is different than that of registers type, as if they
+        // are the same there is no work to be done here. Check that the conversion is not to or
+        // from an unresolved type as type information is imprecise. If the instance-of is to an
+        // interface then ignore the type information as interfaces can only be treated as Objects
+        // and we don't want to disallow field and other operations on the object. If the value
+        // being instance-of checked against is known null (zero) then allow the optimization as
+        // we didn't have type information. If the merge of the instance-of type with the original
+        // type is assignable to the original then allow optimization. This check is performed to
+        // ensure that subsequent merges don't lose type information - such as becoming an
+        // interface from a class that would lose information relevant to field checks.
         const RegType& orig_type = work_line_->GetRegisterType(instance_of_inst->VRegB_22c());
         const RegType& cast_type = ResolveClassAndCheckAccess(instance_of_inst->VRegC_22c());
 
-        if (!cast_type.IsUnresolvedTypes() && !orig_type.IsUnresolvedTypes() &&
-            !cast_type.GetClass()->IsInterface() && !cast_type.IsAssignableFrom(orig_type)) {
+        if (!orig_type.Equals(cast_type) &&
+            !cast_type.IsUnresolvedTypes() && !orig_type.IsUnresolvedTypes() &&
+            !cast_type.GetClass()->IsInterface() &&
+            (orig_type.IsZero() ||
+                orig_type.IsStrictlyAssignableFrom(cast_type.Merge(orig_type, &reg_types_)))) {
           RegisterLine* update_line = RegisterLine::Create(code_item_->registers_size_, this);
           if (inst->Opcode() == Instruction::IF_EQZ) {
             fallthrough_line.reset(update_line);
@@ -2699,11 +2707,11 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) {
     }
     /* update branch target, set "changed" if appropriate */
     if (NULL != branch_line.get()) {
-      if (!UpdateRegisters(work_insn_idx_ + branch_target, branch_line.get())) {
+      if (!UpdateRegisters(work_insn_idx_ + branch_target, branch_line.get(), false)) {
         return false;
       }
     } else {
-      if (!UpdateRegisters(work_insn_idx_ + branch_target, work_line_.get())) {
+      if (!UpdateRegisters(work_insn_idx_ + branch_target, work_line_.get(), false)) {
         return false;
       }
     }
@@ -2743,8 +2751,9 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) {
       if (!CheckNotMoveException(code_item_->insns_, abs_offset)) {
         return false;
       }
-      if (!UpdateRegisters(abs_offset, work_line_.get()))
+      if (!UpdateRegisters(abs_offset, work_line_.get(), false)) {
         return false;
+      }
     }
   }
 
@@ -2765,7 +2774,7 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) {
        * "work_regs", because at runtime the exception will be thrown before the instruction
        * modifies any registers.
        */
-      if (!UpdateRegisters(iterator.GetHandlerAddress(), saved_line_.get())) {
+      if (!UpdateRegisters(iterator.GetHandlerAddress(), saved_line_.get(), false)) {
         return false;
       }
     }
@@ -2824,9 +2833,10 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) {
     }
     RegisterLine* next_line = reg_table_.GetLine(next_insn_idx);
     if (next_line != NULL) {
-      // Merge registers into what we have for the next instruction,
-      // and set the "changed" flag if needed.
-      if (!UpdateRegisters(next_insn_idx, work_line_.get())) {
+      // Merge registers into what we have for the next instruction, and set the "changed" flag if
+      // needed. If the merge changes the state of the registers then the work line will be
+      // updated.
+      if (!UpdateRegisters(next_insn_idx, work_line_.get(), true)) {
         return false;
       }
     } else {
@@ -3890,7 +3900,8 @@ bool MethodVerifier::CheckNotMoveException(const uint16_t* insns, int insn_idx)
   return true;
 }
 
-bool MethodVerifier::UpdateRegisters(uint32_t next_insn, const RegisterLine* merge_line) {
+bool MethodVerifier::UpdateRegisters(uint32_t next_insn, RegisterLine* merge_line,
+                                     bool update_merge_line) {
   bool changed = true;
   RegisterLine* target_line = reg_table_.GetLine(next_insn);
   if (!insn_flags_[next_insn].IsVisitedOrChanged()) {
@@ -3939,6 +3950,9 @@ bool MethodVerifier::UpdateRegisters(uint32_t next_insn, const RegisterLine* mer
                       << *merge_line << "  ==\n"
                       << *target_line << "\n";
     }
+    if (update_merge_line && changed) {
+      merge_line->CopyFromLine(target_line);
+    }
   }
   if (changed) {
     insn_flags_[next_insn].SetChanged();
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index b6d5b351c3..757c41993c 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -595,9 +595,11 @@ class MethodVerifier {
   /*
   * Control can transfer to "next_insn". Merge the registers from merge_line into the table at
   * next_insn, and set the changed flag on the target address if any of the registers were changed.
+  * In the case of fall-through, update the merge line on a change, as it is the working line for
+  * the next instruction.
   * Returns "false" if an error is encountered.
   */
-  bool UpdateRegisters(uint32_t next_insn, const RegisterLine* merge_line)
+  bool UpdateRegisters(uint32_t next_insn, RegisterLine* merge_line, bool update_merge_line)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Is the method being verified a constructor?
diff --git a/runtime/verifier/reg_type.h b/runtime/verifier/reg_type.h
index 64001d36f3..e985f3a2de 100644
--- a/runtime/verifier/reg_type.h
+++ b/runtime/verifier/reg_type.h
@@ -209,9 +209,9 @@ class RegType {
                           !IsUnresolvedSuperClass()));
     return descriptor_;
   }
-  mirror::Class* GetClass() const {
+  mirror::Class* GetClass() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(!IsUnresolvedReference());
-    DCHECK(klass_ != NULL);
+    DCHECK(klass_ != NULL) << Dump();
     DCHECK(HasClass());
     return klass_;
   }
diff --git a/test/052-verifier-fun/expected.txt b/test/052-verifier-fun/expected.txt
index 566267534e..931aef30a1 100644
--- a/test/052-verifier-fun/expected.txt
+++ b/test/052-verifier-fun/expected.txt
@@ -1,2 +1,3 @@
 BlahOne
 Zorch.
+10 == 10
diff --git a/test/052-verifier-fun/src/Main.java b/test/052-verifier-fun/src/Main.java
index 0168412bab..3ffd14376c 100644
--- a/test/052-verifier-fun/src/Main.java
+++ b/test/052-verifier-fun/src/Main.java
@@ -24,6 +24,7 @@ public class Main {
         tryBlah(1);
 
         System.out.println("Zorch.");
+        System.out.println("10 == " + instanceOfTest(10));
     }
 
     /*
@@ -120,4 +121,15 @@ public class Main {
 
         feature.doStuff();
     }
+
+    static int instanceOfTest(Integer x) {
+      Object y = x;
+      if (y instanceof String) {
+        // Bug: 15808277
+        // Nonsensical instance-of to check that merging after the branch doesn't result in a
+        // verifier error.
+        ((String)y).charAt(0);
+      }
+      return x.intValue();
+    }
 }
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index 5b8134df1c..3b11879e08 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -49,6 +49,7 @@ public class Main {
     test_String_indexOf();
     test_String_isEmpty();
     test_String_length();
+    test_Thread_currentThread();
   }
 
   /*
@@ -70,6 +71,17 @@ public class Main {
       return (b - a) < maxDelta;
   }
 
+  /**
+   * Will test inlining Thread.currentThread().
+   */
+  public static void test_Thread_currentThread() {
+    // 1. Do not use result.
+    Thread.currentThread();
+
+    // 2. Result should not be null.
+    Assert.assertNotNull(Thread.currentThread());
+  }
+
   public static void test_String_length() {
     String str0 = "";
     String str1 = "x";
diff --git a/test/Android.oat.mk b/test/Android.oat.mk
index 13d452c7e8..fec2540e9c 100644
--- a/test/Android.oat.mk
+++ b/test/Android.oat.mk
@@ -193,7 +193,7 @@ $(3): PRIVATE_OAT_FILE := $$(oat_file)
 $(3): $$(ART_TEST_HOST_OAT_$(1)_DEX) $(ART_TEST_HOST_OAT_DEPENDENCIES)
 	$(hide) mkdir -p $(ART_HOST_TEST_DIR)/android-data-$$@/dalvik-cache/$$($(2)HOST_ARCH)
 	$(hide) cp $$(realpath $$<) $(ART_HOST_TEST_DIR)/android-data-$$@/oat-test-dex-$(1).jar
-	$(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg -Xms16m --runtime-arg -Xmx16m $(4) \
+	$(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) $(4) \
 	  --boot-image=$$(HOST_CORE_IMG_LOCATION) \
 	  --dex-file=$$(PRIVATE_DEX_FILE) --oat-file=$$(PRIVATE_OAT_FILE) \
 	  --instruction-set=$($(2)ART_HOST_ARCH) --host --android-root=$(HOST_OUT) \