32 files changed, 1219 insertions(+), 320 deletions(-)
diff --git a/Android.mk b/Android.mk index 9c206400fc..3f4ead60d1 100644 --- a/Android.mk +++ b/Android.mk @@ -324,7 +324,7 @@ oat-target-$(1): $$(OUT_OAT_FILE) $$(OUT_OAT_FILE): $(PRODUCT_OUT)/$(1) $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(DEX2OATD_DEPENDENCY) @mkdir -p $$(dir $$@) - $(DEX2OATD) --runtime-arg -Xms64m --runtime-arg -Xmx64m \ + $(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \ --boot-image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --dex-file=$(PRODUCT_OUT)/$(1) \ --dex-location=/$(1) --oat-file=$$@ \ --instruction-set=$(DEX2OAT_TARGET_ARCH) \ diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk index 5cfdbfd816..c39bc5dd63 100644 --- a/build/Android.common_build.mk +++ b/build/Android.common_build.mk @@ -207,6 +207,7 @@ ifndef LIBART_IMG_TARGET_BASE_ADDRESS $(error LIBART_IMG_TARGET_BASE_ADDRESS unset) endif ART_TARGET_CFLAGS := $(art_cflags) -DART_TARGET -DART_BASE_ADDRESS=$(LIBART_IMG_TARGET_BASE_ADDRESS) +ART_TARGET_LDFLAGS := ifeq ($(TARGET_CPU_SMP),true) ART_TARGET_CFLAGS += -DANDROID_SMP=1 else @@ -219,6 +220,14 @@ else endif endif ART_TARGET_CFLAGS += $(ART_DEFAULT_GC_TYPE_CFLAGS) +ifdef PGO_GEN + ART_TARGET_CFLAGS += -fprofile-generate=$(PGO_GEN) + ART_TARGET_LDFLAGS += -fprofile-generate=$(PGO_GEN) +endif +ifdef PGO_USE + ART_TARGET_CFLAGS += -fprofile-use=$(PGO_USE) + ART_TARGET_CFLAGS += -fprofile-correction -Wno-error +endif # DEX2OAT_TARGET_INSTRUCTION_SET_FEATURES is set in ../build/core/dex_preopt.mk based on # the TARGET_CPU_VARIANT @@ -277,6 +286,7 @@ ART_TARGET_DEBUG_CFLAGS := $(art_debug_cflags) define set-target-local-cflags-vars LOCAL_CFLAGS += $(ART_TARGET_CFLAGS) LOCAL_CFLAGS_x86 += $(ART_TARGET_CFLAGS_x86) + LOCAL_LDFLAGS += $(ART_TARGET_LDFLAGS) art_target_cflags_ndebug_or_debug := $(1) ifeq ($$(art_target_cflags_ndebug_or_debug),debug) LOCAL_CFLAGS += $(ART_TARGET_DEBUG_CFLAGS) diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk index 183ec37cd6..10cd1cc65a 100644 --- a/build/Android.gtest.mk +++ b/build/Android.gtest.mk @@ -110,6 +110,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \ runtime/mem_map_test.cc \ runtime/mirror/dex_cache_test.cc \ runtime/mirror/object_test.cc \ + runtime/monitor_test.cc \ runtime/parsed_options_test.cc \ runtime/reference_table_test.cc \ runtime/thread_pool_test.cc \ diff --git a/build/Android.oat.mk b/build/Android.oat.mk index d3c8d7ee40..dd87f4a58e 100644 --- a/build/Android.oat.mk +++ b/build/Android.oat.mk @@ -29,7 +29,7 @@ define create-core-oat-host-rules $$($(1)HOST_CORE_IMG_OUT): $$(HOST_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY) @echo "host dex2oat: $$@ ($$?)" @mkdir -p $$(dir $$@) - $$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m \ + $$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_IMAGE_XMS) --runtime-arg $(DEX2OAT_IMAGE_XMX) \ --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(HOST_CORE_DEX_FILES)) \ $$(addprefix --dex-location=,$$(HOST_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)HOST_CORE_OAT_OUT) \ --oat-location=$$($(1)HOST_CORE_OAT) --image=$$($(1)HOST_CORE_IMG_OUT) \ @@ -57,7 +57,7 @@ define create-core-oat-target-rules $$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY) @echo "target dex2oat: $$@ ($$?)" @mkdir -p $$(dir $$@) - $$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m \ + $$(hide) $$(DEX2OATD) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) \ --image-classes=$$(PRELOADED_CLASSES) $$(addprefix --dex-file=,$$(TARGET_CORE_DEX_FILES)) \ $$(addprefix 
--dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \ --oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) \ diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h index 596b3c9802..1f24849257 100644 --- a/compiler/dex/backend.h +++ b/compiler/dex/backend.h @@ -28,6 +28,25 @@ class Backend { virtual void Materialize() = 0; virtual CompiledMethod* GetCompiledMethod() = 0; + // Queries for backend support for vectors + /* + * Return the number of bits in a vector register. + * @return 0 if vector registers are not supported, or the + * number of bits in the vector register if supported. + */ + virtual int VectorRegisterSize() { return 0; } + + /* + * Return the number of reservable vector registers supported + * @param fp_used ‘true’ if floating point computations will be + * executed while vector registers are reserved. + * @return the number of vector registers that are available + * @note The backend should ensure that sufficient vector registers + * are held back to generate scalar code without exhausting vector + * registers, if scalar code also uses the vector registers. + */ + virtual int NumReservableVectorRegisters(bool fp_used) { return 0; } + protected: explicit Backend(ArenaAllocator* arena) : arena_(arena) {} ArenaAllocator* const arena_; diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h index caecb7a48e..799a742032 100644 --- a/compiler/dex/compiler_enums.h +++ b/compiler/dex/compiler_enums.h @@ -133,91 +133,101 @@ enum ExtendedMIROpcode { // could be supported by using a bit in TypeSize and arg[0] where needed. // @brief MIR to move constant data to a vector register - // vA: number of bits in register - // vB: destination + // vA: destination + // vB: number of bits in register // args[0]~args[3]: up to 128 bits of data for initialization kMirOpConstVector, // @brief MIR to move a vectorized register to another - // vA: TypeSize - // vB: destination - // vC: source + // vA: destination + // vB: source + // vC: TypeSize kMirOpMoveVector, // @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedMultiply, // @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedAddition, // @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedSubtract, // @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: immediate + // vA: destination and source + // vB: amount to shift + // vC: TypeSize kMirOpPackedShiftLeft, // @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector. 
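The new Backend queries above (VectorRegisterSize, NumReservableVectorRegisters) are what a middle-end vectorizer would consult before emitting the packed opcodes that follow. A minimal sketch under that assumption; CanVectorize and its parameters are illustrative, not part of this change:

    // Hypothetical vectorizer gate, not part of the patch; assumes the
    // Backend declaration from compiler/dex/backend.h is in scope.
    static bool CanVectorize(art::Backend* backend, bool fp_used,
                             int temps_needed, int elem_bits) {
      const int vector_bits = backend->VectorRegisterSize();
      if (vector_bits == 0) {
        return false;  // Backend reports no vector register support.
      }
      const int lanes = vector_bits / elem_bits;  // 128/8 = 16 byte lanes on x86.
      return lanes > 1 &&
             backend->NumReservableVectorRegisters(fp_used) >= temps_needed;
    }
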
- // vA: TypeSize - // vB: destination and source - // vC: immediate + // vA: destination and source + // vB: amount to shift + // vC: TypeSize kMirOpPackedSignedShiftRight, // @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: immediate + // vA: destination and source + // vB: amount to shift + // vC: TypeSize kMirOpPackedUnsignedShiftRight, // @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedAnd, // @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedOr, // @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector. - // vA: TypeSize - // vB: destination and source - // vC: source + // vA: destination and source + // vB: source + // vC: TypeSize kMirOpPackedXor, // @brief Reduce a 128-bit packed element into a single VR by taking lower bits // @details Instruction does a horizontal addition of the packed elements and then adds it to VR - // vA: TypeSize - // vB: destination and source VR (not vector register) - // vC: source (vector register) + // vA: destination and source VR (not vector register) + // vB: source (vector register) + // vC: TypeSize kMirOpPackedAddReduce, // @brief Extract a packed element into a single VR. - // vA: TypeSize - // vB: destination VR (not vector register) - // vC: source (vector register) + // vA: destination VR (not vector register) + // vB: source (vector register) + // vC: TypeSize // arg[0]: The index to use for extraction from vector register (which packed element) kMirOpPackedReduce, // @brief Create a vector value, with all TypeSize values equal to vC - // vA: TypeSize - // vB: destination vector register - // vC: source VR (not vector register) + // vA: destination vector register + // vB: source VR (not vector register) + // vC: TypeSize kMirOpPackedSet, + // @brief Reserve N vector registers (named 0..N-1) + // vA: Number of registers + // @note: The backend may choose to map vector numbers used in vector opcodes. + // Reserved registers are removed from the list of backend temporary pool. + kMirOpReserveVectorRegisters, + + // @brief Free Reserved vector registers + // @note: All currently reserved vector registers are returned to the temporary pool. 
+ kMirOpReturnVectorRegisters, + kMirOpLast, }; diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc index 9fea709568..bc99a272a6 100644 --- a/compiler/dex/mir_dataflow.cc +++ b/compiler/dex/mir_dataflow.cc @@ -840,6 +840,54 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = { // 113 MIR_SELECT DF_DA | DF_UB, + + // 114 MirOpConstVector + DF_DA, + + // 115 MirOpMoveVector + 0, + + // 116 MirOpPackedMultiply + 0, + + // 117 MirOpPackedAddition + 0, + + // 118 MirOpPackedSubtract + 0, + + // 119 MirOpPackedShiftLeft + 0, + + // 120 MirOpPackedSignedShiftRight + 0, + + // 121 MirOpPackedUnsignedShiftRight + 0, + + // 122 MirOpPackedAnd + 0, + + // 123 MirOpPackedOr + 0, + + // 124 MirOpPackedXor + 0, + + // 125 MirOpPackedAddReduce + DF_DA | DF_UA, + + // 126 MirOpPackedReduce + DF_DA, + + // 127 MirOpPackedSet + DF_UB, + + // 128 MirOpReserveVectorRegisters + 0, + + // 129 MirOpReturnVectorRegisters + 0, }; /* Return the base virtual register for a SSA name */ diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index baa46d61bd..8ce0389520 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -62,6 +62,8 @@ const char* MIRGraph::extended_mir_op_names_[kMirOpLast - kMirOpFirst] = { "PackedAddReduce", "PackedReduce", "PackedSet", + "ReserveVectorRegisters", + "ReturnVectorRegisters", }; MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena) @@ -836,12 +838,13 @@ void MIRGraph::DumpCFG(const char* dir_prefix, bool all_blocks, const char *suff mir->next ? " | " : " "); } } else { - fprintf(file, " {%04x %s %s %s\\l}%s\\\n", mir->offset, + fprintf(file, " {%04x %s %s %s %s\\l}%s\\\n", mir->offset, mir->ssa_rep ? GetDalvikDisassembly(mir) : !IsPseudoMirOp(opcode) ? Instruction::Name(mir->dalvikInsn.opcode) : extended_mir_op_names_[opcode - kMirOpFirst], (mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ", (mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ", + (mir->optimization_flags & MIR_IGNORE_SUSPEND_CHECK) != 0 ? " no_suspendcheck" : " ", mir->next ? " | " : " "); } } diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc index 02f39ac180..6c0dfe80a6 100755 --- a/compiler/dex/quick/gen_invoke.cc +++ b/compiler/dex/quick/gen_invoke.cc @@ -1638,6 +1638,12 @@ bool Mir2Lir::GenInlinedStringCompareTo(CallInfo* info) { bool Mir2Lir::GenInlinedCurrentThread(CallInfo* info) { RegLocation rl_dest = InlineTarget(info); + + // Early exit if the result is unused. + if (rl_dest.orig_sreg < 0) { + return true; + } + RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); switch (cu_->instruction_set) { diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc index 13bd4432d7..e8fc919d5f 100644 --- a/compiler/dex/quick/ralloc_util.cc +++ b/compiler/dex/quick/ralloc_util.cc @@ -1276,7 +1276,7 @@ void Mir2Lir::DoPromotion() { if (cu_->instruction_set == kThumb2) { bool wide = fp_regs[i].s_reg & STARTING_WIDE_SREG; if (wide) { - if (promotion_map_[p_map_idx + 1].fp_location == kLocPhysReg) { + if (promotion_map_[p_map_idx + 1].fp_location != kLocPhysReg) { // Ignore result - if can't alloc double may still be able to alloc singles. 
AllocPreservedDouble(low_sreg); } diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc index 7baf2d9663..4e973d8b48 100644 --- a/compiler/dex/quick/x86/assemble_x86.cc +++ b/compiler/dex/quick/x86/assemble_x86.cc @@ -327,21 +327,11 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } -#define EXT_0F_REX_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \ -{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ -{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ -{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } - #define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \ { kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } -#define EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \ -{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ -{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ -{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } - #define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \ { kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ @@ -405,9 +395,12 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, EXT_0F_ENCODING_MAP(Haddpd, 0x66, 0x7C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Haddps, 0xF2, 0x7C, REG_DEF0_USE0), - { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" }, - { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" }, - { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" }, + { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" }, + { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, 
false }, "PextwRRI", "!0r,!1r,!2d" }, + { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" }, + { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrbMRI", "[!0r+!1d],!2r,!3d" }, + { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrwMRI", "[!0r+!1d],!2r,!3d" }, + { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrdMRI", "[!0r+!1d],!2r,!3d" }, { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" }, { kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuffRRI", "!0r,!1r,!2d" }, @@ -499,10 +492,10 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0), EXT_0F_ENCODING_MAP(Movsx8, 0x00, 0xBE, REG_DEF0), EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0), - EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movzx8q, 0xB6, REG_DEF0), - EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movzx16q, 0xB7, REG_DEF0), - EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movsx8q, 0xBE, REG_DEF0), - EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movsx16q, 0xBF, REG_DEF0), + EXT_0F_ENCODING_MAP(Movzx8q, REX_W, 0xB6, REG_DEF0), + EXT_0F_ENCODING_MAP(Movzx16q, REX_W, 0xB7, REG_DEF0), + EXT_0F_ENCODING_MAP(Movsx8q, REX, 0xBE, REG_DEF0), + EXT_0F_ENCODING_MAP(Movsx16q, REX_W, 0xBF, REG_DEF0), #undef EXT_0F_ENCODING_MAP { kX86Jcc8, kJcc, IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP | USES_CCODES, { 0, 0, 0x70, 0, 0, 0, 0, 0, false }, "Jcc8", "!1c !0t" }, @@ -627,7 +620,8 @@ size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int if (registers_need_rex_prefix) { DCHECK(cu_->target64) << "Attempt to use a 64-bit only addressable register " << RegStorage::RegNum(raw_reg) << " with instruction " << entry->name; - if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) { + if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W + && entry->skeleton.prefix1 != REX && entry->skeleton.prefix2 != REX) { ++size; // rex } } @@ -906,7 +900,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry, // 64 bit addresses by GS, not FS. 
code_buffer_.push_back(THREAD_PREFIX_GS); } else { - if (entry->skeleton.prefix1 == REX_W) { + if (entry->skeleton.prefix1 == REX_W || entry->skeleton.prefix1 == REX) { + DCHECK(cu_->target64); rex |= entry->skeleton.prefix1; code_buffer_.push_back(rex); rex = 0; @@ -915,7 +910,8 @@ void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry, } } if (entry->skeleton.prefix2 != 0) { - if (entry->skeleton.prefix2 == REX_W) { + if (entry->skeleton.prefix2 == REX_W || entry->skeleton.prefix1 == REX) { + DCHECK(cu_->target64); rex |= entry->skeleton.prefix2; code_buffer_.push_back(rex); rex = 0; diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc index 6ca220cb2e..9000514856 100644 --- a/compiler/dex/quick/x86/call_x86.cc +++ b/compiler/dex/quick/x86/call_x86.cc @@ -94,13 +94,10 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, start_of_method_reg = rl_method.reg; store_method_addr_used_ = true; } else { - if (cu_->target64) { - start_of_method_reg = AllocTempWide(); - } else { - start_of_method_reg = AllocTemp(); - } + start_of_method_reg = AllocTempRef(); NewLIR1(kX86StartOfMethod, start_of_method_reg.GetReg()); } + DCHECK_EQ(start_of_method_reg.Is64Bit(), cu_->target64); int low_key = s4FromSwitchData(&table[2]); RegStorage keyReg; // Remove the bias, if necessary @@ -111,7 +108,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, OpRegRegImm(kOpSub, keyReg, rl_src.reg, low_key); } // Bounds check - if < 0 or >= size continue following switch - OpRegImm(kOpCmp, keyReg, size-1); + OpRegImm(kOpCmp, keyReg, size - 1); LIR* branch_over = OpCondBranch(kCondHi, NULL); // Load the displacement from the switch table @@ -119,11 +116,7 @@ void X86Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, NewLIR5(kX86PcRelLoadRA, disp_reg.GetReg(), start_of_method_reg.GetReg(), keyReg.GetReg(), 2, WrapPointer(tab_rec)); // Add displacement to start of method - if (cu_->target64) { - NewLIR2(kX86Add64RR, start_of_method_reg.GetReg(), disp_reg.GetReg()); - } else { - OpRegReg(kOpAdd, start_of_method_reg, disp_reg); - } + OpRegReg(kOpAdd, start_of_method_reg, cu_->target64 ? As64BitReg(disp_reg) : disp_reg); // ..and go! LIR* switch_branch = NewLIR1(kX86JmpR, start_of_method_reg.GetReg()); tab_rec->anchor = switch_branch; @@ -174,7 +167,6 @@ void X86Mir2Lir::GenFillArrayData(DexOffset table_offset, RegLocation rl_src) { } store_method_addr_used_ = true; } else { - // TODO(64) force to be 64-bit NewLIR1(kX86StartOfMethod, method_start.GetReg()); } NewLIR2(kX86PcRelAdr, payload.GetReg(), WrapPointer(tab_rec)); @@ -193,8 +185,8 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) { Thread::ExceptionOffset<8>().Int32Value() : Thread::ExceptionOffset<4>().Int32Value(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); - NewLIR2(kX86Mov32RT, rl_result.reg.GetReg(), ex_offset); - NewLIR2(kX86Mov32TI, ex_offset, 0); + NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, rl_result.reg.GetReg(), ex_offset); + NewLIR2(cu_->target64 ? kX86Mov64TI : kX86Mov32TI, ex_offset, 0); StoreValue(rl_dest, rl_result); } @@ -202,17 +194,15 @@ void X86Mir2Lir::GenMoveException(RegLocation rl_dest) { * Mark garbage collection card. Skip if the value we're storing is null. 
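A scalar model of the MarkGCCard code just below; the card shift is an assumption (128-byte cards, a shift of 7, is the usual ART configuration):

    #include <cstdint>

    // Scalar model of the card marking emitted below: skip null stores, then
    // dirty the card covering tgt_addr. The dirty value is the low byte of
    // the card-table base itself, matching the StoreBaseIndexed trick where
    // reg_card_base is both the base and the stored data.
    static void MarkCardScalar(uint8_t* card_base, const void* stored_value,
                               const void* tgt_addr, int card_shift /* assumed 7 */) {
      if (stored_value == nullptr) {
        return;  // Null stores never need a card mark.
      }
      const uintptr_t card_no = reinterpret_cast<uintptr_t>(tgt_addr) >> card_shift;
      card_base[card_no] = static_cast<uint8_t>(reinterpret_cast<uintptr_t>(card_base));
    }
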
*/ void X86Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) { - RegStorage reg_card_base = AllocTemp(); - RegStorage reg_card_no = AllocTemp(); + DCHECK_EQ(tgt_addr_reg.Is64Bit(), cu_->target64); + DCHECK_EQ(val_reg.Is64Bit(), cu_->target64); + RegStorage reg_card_base = AllocTempRef(); + RegStorage reg_card_no = AllocTempRef(); LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL); int ct_offset = cu_->target64 ? Thread::CardTableOffset<8>().Int32Value() : Thread::CardTableOffset<4>().Int32Value(); - if (cu_->target64) { - NewLIR2(kX86Mov64RT, reg_card_base.GetReg(), ct_offset); - } else { - NewLIR2(kX86Mov32RT, reg_card_base.GetReg(), ct_offset); - } + NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, reg_card_base.GetReg(), ct_offset); OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift); StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte); LIR* target = NewLIR0(kPseudoTargetLabel); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index 55e5993dce..ff7b30eeec 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -28,7 +28,7 @@ class X86Mir2Lir : public Mir2Lir { protected: class InToRegStorageMapper { public: - virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide) = 0; + virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) = 0; virtual ~InToRegStorageMapper() {} }; @@ -36,7 +36,7 @@ class X86Mir2Lir : public Mir2Lir { public: explicit InToRegStorageX86_64Mapper(Mir2Lir* ml) : ml_(ml), cur_core_reg_(0), cur_fp_reg_(0) {} virtual ~InToRegStorageX86_64Mapper() {} - virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide); + virtual RegStorage GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref); protected: Mir2Lir* ml_; private: @@ -118,6 +118,8 @@ class X86Mir2Lir : public Mir2Lir { void FreeCallTemps(); void LockCallTemps(); void CompilerInitializeRegAlloc(); + int VectorRegisterSize(); + int NumReservableVectorRegisters(bool fp_used); // Required for target - miscellaneous. void AssembleLIR(); @@ -503,6 +505,11 @@ class X86Mir2Lir : public Mir2Lir { void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1, int64_t val, ConditionCode ccode); void GenConstWide(RegLocation rl_dest, int64_t value); + void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir); + void GenShiftByteVector(BasicBlock *bb, MIR *mir); + void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4); + void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4); + void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir); static bool ProvidesFullMemoryBarrier(X86OpCode opcode); @@ -513,6 +520,12 @@ class X86Mir2Lir : public Mir2Lir { virtual RegStorage AllocateByteRegister(); /* + * @brief Use a wide temporary as a 128-bit register + * @returns a 128-bit temporary register. + */ + virtual RegStorage Get128BitRegister(RegStorage reg); + + /* * @brief Check if a register is byte addressable. * @returns true if a register is byte addressable. */ @@ -528,6 +541,22 @@ class X86Mir2Lir : public Mir2Lir { */ bool GenInlinedIndexOf(CallInfo* info, bool zero_based); + /** + * @brief Reserve a fixed number of vector registers from the register pool + * @details The mir->dalvikInsn.vA specifies an N such that vector registers + * [0..N-1] are removed from the temporary pool. 
The caller must call + * ReturnVectorRegisters before calling ReserveVectorRegisters again. + * Also sets the num_reserved_vector_regs_ to the specified value + * @param mir whose vA specifies the number of registers to reserve + */ + void ReserveVectorRegisters(MIR* mir); + + /** + * @brief Return all the reserved vector registers to the temp pool + * @details Returns [0..num_reserved_vector_regs_] + */ + void ReturnVectorRegisters(); + /* * @brief Load 128 bit constant into vector register. * @param bb The basic block in which the MIR is from. @@ -901,6 +930,10 @@ class X86Mir2Lir : public Mir2Lir { LIR *AddVectorLiteral(MIR *mir); InToRegStorageMapping in_to_reg_storage_mapping_; + + private: + // The number of vector registers [0..N] reserved by a call to ReserveVectorRegisters + int num_reserved_vector_regs_; }; } // namespace art diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 43882c2e02..e81f505f2f 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -427,6 +427,10 @@ RegStorage X86Mir2Lir::AllocateByteRegister() { return reg; } +RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) { + return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg(); +} + bool X86Mir2Lir::IsByteRegister(RegStorage reg) { return cu_->target64 || reg.GetRegNum() < rs_rX86_SP.GetRegNum(); } @@ -646,6 +650,14 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() { reg_pool_->next_dp_reg_ = 1; } +int X86Mir2Lir::VectorRegisterSize() { + return 128; +} + +int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) { + return fp_used ? 5 : 7; +} + void X86Mir2Lir::SpillCoreRegs() { if (num_core_spills_ == 0) { return; @@ -790,6 +802,9 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* rX86_RET1 = rDX; rX86_INVOKE_TGT = rAX; rX86_COUNT = rCX; + + // Initialize the number of reserved vector registers + num_reserved_vector_regs_ = -1; } Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph, @@ -1475,6 +1490,12 @@ std::vector<uint8_t>* X86Mir2Lir::ReturnCallFrameInformation() { void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) { + case kMirOpReserveVectorRegisters: + ReserveVectorRegisters(mir); + break; + case kMirOpReturnVectorRegisters: + ReturnVectorRegisters(); + break; case kMirOpConstVector: GenConst128(bb, mir); break; @@ -1522,11 +1543,57 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { } } +void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) { + // We should not try to reserve twice without returning the registers + DCHECK_NE(num_reserved_vector_regs_, -1); + + int num_vector_reg = mir->dalvikInsn.vA; + for (int i = 0; i < num_vector_reg; i++) { + RegStorage xp_reg = RegStorage::Solo128(i); + RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); + Clobber(xp_reg); + + for (RegisterInfo *info = xp_reg_info->GetAliasChain(); + info != nullptr; + info = info->GetAliasChain()) { + if (info->GetReg().IsSingle()) { + reg_pool_->sp_regs_.Delete(info); + } else { + reg_pool_->dp_regs_.Delete(info); + } + } + } + + num_reserved_vector_regs_ = num_vector_reg; +} + +void X86Mir2Lir::ReturnVectorRegisters() { + // Return all the reserved registers + for (int i = 0; i < num_reserved_vector_regs_; i++) { + RegStorage xp_reg = RegStorage::Solo128(i); + RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); + + for (RegisterInfo *info = 
xp_reg_info->GetAliasChain(); + info != nullptr; + info = info->GetAliasChain()) { + if (info->GetReg().IsSingle()) { + reg_pool_->sp_regs_.Insert(info); + } else { + reg_pool_->dp_regs_.Insert(info); + } + } + } + + // We don't have anymore reserved vector registers + num_reserved_vector_regs_ = -1; +} + void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { - int type_size = mir->dalvikInsn.vA; + store_method_addr_used_ = true; + int type_size = mir->dalvikInsn.vB; // We support 128 bit vectors. DCHECK_EQ(type_size & 0xFFFF, 128); - RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB); + RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); uint32_t *args = mir->dalvikInsn.arg; int reg = rs_dest.GetReg(); // Check for all 0 case. @@ -1534,6 +1601,12 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { NewLIR2(kX86XorpsRR, reg, reg); return; } + + // Append the mov const vector to reg opcode. + AppendOpcodeWithConst(kX86MovupsRM, reg, mir); +} + +void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) { // Okay, load it from the constant vector area. LIR *data_target = ScanVectorLiteral(mir); if (data_target == nullptr) { @@ -1553,24 +1626,66 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { // 4 byte offset. We will fix this up in the assembler later to have the right // value. ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral); - LIR *load = NewLIR3(kX86Mova128RM, reg, rl_method.reg.GetReg(), 256 /* bogus */); + LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg()); load->flags.fixup = kFixupLoad; load->target = data_target; } void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg()); } +void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) { + const int BYTE_SIZE = 8; + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); + RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide()); + + /* + * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM + * and multiplying 8 at a time before recombining back into one XMM register. + * + * let xmm1, xmm2 be real srcs (keep low bits of 16bit lanes) + * xmm3 is tmp (operate on high bits of 16bit lanes) + * + * xmm3 = xmm1 + * xmm1 = xmm1 .* xmm2 + * xmm1 = xmm1 & 0x00ff00ff00ff00ff00ff00ff00ff00ff // xmm1 now has low bits + * xmm3 = xmm3 .>> 8 + * xmm2 = xmm2 & 0xff00ff00ff00ff00ff00ff00ff00ff00 + * xmm2 = xmm2 .* xmm3 // xmm2 now has high bits + * xmm1 = xmm1 | xmm2 // combine results + */ + + // Copy xmm1. + NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg()); + + // Multiply low bits. + NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); + + // xmm1 now has low bits. + AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); + + // Prepare high bits for multiplication. 
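The comment block above spells out the emulation; per 16-bit lane (two packed bytes), the arithmetic reduces to this scalar model:

    #include <cstdint>

    // Scalar model of the byte-multiply emulation above: SSE has no 8-bit
    // multiply, so each 16-bit lane (holding two independent byte lanes) is
    // computed as a low-byte product and a high-byte product, then recombined.
    static uint16_t MulPackedBytes(uint16_t a, uint16_t b) {
      // Low lane: (a*b) mod 256 equals (a_lo*b_lo) mod 256, since the cross
      // terms are multiples of 256; this is the pmullw + 0x00FF mask step.
      const uint16_t lo = static_cast<uint16_t>((a * b) & 0x00FF);
      // High lane: (a >> 8) * (b & 0xFF00) leaves (a_hi*b_hi mod 256) << 8;
      // this is the psrlw-by-8, 0xFF00 mask, and second pmullw steps.
      const uint16_t hi = static_cast<uint16_t>((a >> 8) * (b & 0xFF00));
      return static_cast<uint16_t>(hi | lo);
    }
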
+ NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE); + AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); + + // Multiply high bits and xmm2 now has high bits. + NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg()); + + // Combine back into dest XMM register. + NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); +} + void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: @@ -1585,6 +1700,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { case kDouble: opcode = kX86MulpdRR; break; + case kSignedByte: + // HW doesn't support 16x16 byte multiplication so emulate it. + GenMultiplyVectorSignedByte(bb, mir); + return; default: LOG(FATAL) << "Unsupported vector multiply " << opsize; break; @@ -1593,10 +1712,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: @@ -1624,10 +1743,10 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: @@ -1654,11 +1773,60 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg()); } +void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) { + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_tmp = Get128BitRegister(AllocTempWide()); + + int opcode = 0; + int imm = mir->dalvikInsn.vB; + + switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) { + case kMirOpPackedShiftLeft: + opcode = kX86PsllwRI; + break; + case kMirOpPackedSignedShiftRight: + opcode = kX86PsrawRI; + break; + case kMirOpPackedUnsignedShiftRight: + opcode = kX86PsrlwRI; + break; + default: + LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode; + break; + } + + /* + * xmm1 will have low bits + * xmm2 will have high bits + * + * xmm2 = xmm1 + * xmm1 = xmm1 .<< N + * 
xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00 + * xmm2 = xmm2 .<< N + * xmm1 = xmm1 | xmm2 + */ + + // Copy xmm1. + NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg()); + + // Shift lower values. + NewLIR2(opcode, rs_dest_src1.GetReg(), imm); + + // Mask bottom bits. + AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); + + // Shift higher values. + NewLIR2(opcode, rs_tmp.GetReg(), imm); + + // Combine back into dest XMM register. + NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg()); +} + void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - int imm = mir->dalvikInsn.vC; + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { case k32: @@ -1671,6 +1839,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) { case kUnsignedHalf: opcode = kX86PsllwRI; break; + case kSignedByte: + case kUnsignedByte: + GenShiftByteVector(bb, mir); + return; default: LOG(FATAL) << "Unsupported vector shift left " << opsize; break; @@ -1679,10 +1851,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - int imm = mir->dalvikInsn.vC; + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { case k32: @@ -1692,6 +1864,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { case kUnsignedHalf: opcode = kX86PsrawRI; break; + case kSignedByte: + case kUnsignedByte: + GenShiftByteVector(bb, mir); + return; default: LOG(FATAL) << "Unsupported vector signed shift right " << opsize; break; @@ -1700,10 +1876,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - int imm = mir->dalvikInsn.vC; + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { case k32: @@ -1716,6 +1892,10 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) { case kUnsignedHalf: opcode = kX86PsrlwRI; break; + case kSignedByte: + case kUnsignedByte: + GenShiftByteVector(bb, mir); + return; default: LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize; break; @@ -1725,91 +1905,209 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) { void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. 
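For the byte cases routed to GenShiftByteVector above, the mask-and-combine idea can be modeled per 16-bit lane; the left-shift case is shown, with the low-lane mask made explicit (helper name illustrative):

    #include <cstdint>

    // Scalar model of GenShiftByteVector for a left shift: a plain 16-bit
    // shift would leak low-byte bits into the high byte, so the two byte
    // lanes are shifted separately and masked before being OR-ed together.
    static uint16_t ShlPackedBytes(uint16_t lane, int n) {
      const uint16_t lo = static_cast<uint16_t>((lane << n) & 0x00FF);
      const uint16_t hi = static_cast<uint16_t>(((lane & 0xFF00) << n) & 0xFF00);
      return static_cast<uint16_t>(hi | lo);
    }
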
- DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC); + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } +void X86Mir2Lir::AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4) { + MaskVectorRegister(kX86PandRM, rs_src1, m1, m2, m3, m4); +} + +void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3) { + // Create temporary MIR as container for 128-bit binary mask. + MIR const_mir; + MIR* const_mirp = &const_mir; + const_mirp->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpConstVector); + const_mirp->dalvikInsn.arg[0] = m0; + const_mirp->dalvikInsn.arg[1] = m1; + const_mirp->dalvikInsn.arg[2] = m2; + const_mirp->dalvikInsn.arg[3] = m3; + + // Mask vector with const from literal pool. 
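What AndMaskVectorRegister and MaskVectorRegister amount to at runtime, as a scalar model; in the generated code the four mask words form a 16-byte literal-pool constant reached through a method-relative pand from memory:

    #include <cstdint>

    // Scalar model of AndMaskVectorRegister: AND each 32-bit chunk of the
    // 128-bit register with one of the four mask words.
    static void AndMask128(uint32_t reg[4],
                           uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3) {
      reg[0] &= m0;
      reg[1] &= m1;
      reg[2] &= m2;
      reg[3] &= m3;
    }
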
+ AppendOpcodeWithConst(opcode, rs_src1.GetReg(), const_mirp); +} + void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - int imm = mir->dalvikInsn.vC; + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); + RegLocation rl_dest = mir_graph_->GetDest(mir); + RegStorage rs_tmp; + + int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8; + int vec_unit_size = 0; int opcode = 0; + int extr_opcode = 0; + RegLocation rl_result; + switch (opsize) { case k32: + extr_opcode = kX86PextrdRRI; opcode = kX86PhadddRR; + vec_unit_size = 4; + break; + case kSignedByte: + case kUnsignedByte: + extr_opcode = kX86PextrbRRI; + opcode = kX86PhaddwRR; + vec_unit_size = 2; break; case kSignedHalf: case kUnsignedHalf: + extr_opcode = kX86PextrwRRI; opcode = kX86PhaddwRR; + vec_unit_size = 2; break; + case kSingle: + rl_result = EvalLoc(rl_dest, kFPReg, true); + vec_unit_size = 4; + for (int i = 0; i < 3; i++) { + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); + NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39); + } + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); + StoreValue(rl_dest, rl_result); + + // For single-precision floats, we are done here + return; default: LOG(FATAL) << "Unsupported vector add reduce " << opsize; break; } - NewLIR2(opcode, rs_dest_src1.GetReg(), imm); + + int elems = vec_bytes / vec_unit_size; + + // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again + // TODO is overflow handled correctly? + if (opsize == kSignedByte || opsize == kUnsignedByte) { + rs_tmp = Get128BitRegister(AllocTempWide()); + + // tmp = xmm1 .>> 8. + NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg()); + NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8); + + // Zero extend low bits in xmm1. + AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); + } + + while (elems > 1) { + if (opsize == kSignedByte || opsize == kUnsignedByte) { + NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg()); + } + NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg()); + elems >>= 1; + } + + // Combine the results if we separated them. + if (opsize == kSignedByte || opsize == kUnsignedByte) { + NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg()); + } + + // We need to extract to a GPR. + RegStorage temp = AllocTemp(); + NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0); + + // Can we do this directly into memory? 
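The while (elems > 1) loop above halves the element count each pass, the same shape as this scalar model of a phaddw/phaddd-style reduction (lane count assumed to be a power of two):

    #include <cstdint>

    // Scalar model of the reduction loop above: each pass adds adjacent
    // pairs, as phaddw/phaddd do, so log2(count) passes collapse the vector
    // to a single sum.
    static int32_t PairwiseReduceAdd(int32_t* elems, int count) {
      while (count > 1) {
        for (int i = 0; i < count / 2; ++i) {
          elems[i] = elems[2 * i] + elems[2 * i + 1];
        }
        count /= 2;
      }
      return elems[0];
    }
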
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg); + if (rl_result.location == kLocPhysReg) { + // Ensure res is in a core reg + rl_result = EvalLoc(rl_dest, kCoreReg, true); + OpRegReg(kOpAdd, rl_result.reg, temp); + StoreFinalValue(rl_dest, rl_result); + } else { + OpMemReg(kOpAdd, rl_result, temp.GetReg()); + } + + FreeTemp(temp); } void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB); - int index = mir->dalvikInsn.arg[0]; - int opcode = 0; + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegLocation rl_dest = mir_graph_->GetDest(mir); + RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); + int extract_index = mir->dalvikInsn.arg[0]; + int extr_opcode = 0; + RegLocation rl_result; + bool is_wide = false; + switch (opsize) { case k32: - opcode = kX86PextrdRRI; + rl_result = UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI; break; case kSignedHalf: case kUnsignedHalf: - opcode = kX86PextrwRRI; - break; - case kUnsignedByte: - case kSignedByte: - opcode = kX86PextrbRRI; + rl_result= UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI; break; default: - LOG(FATAL) << "Unsupported vector reduce " << opsize; + LOG(FATAL) << "Unsupported vector add reduce " << opsize; + return; break; } - // We need to extract to a GPR. - RegStorage temp = AllocTemp(); - NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index); - // Assume that the destination VR is in the def for the mir. - RegLocation rl_dest = mir_graph_->GetDest(mir); - RegLocation rl_temp = - {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG}; - StoreValue(rl_dest, rl_temp); + if (rl_result.location == kLocPhysReg) { + NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index); + if (is_wide == true) { + StoreFinalValue(rl_dest, rl_result); + } else { + StoreFinalValueWide(rl_dest, rl_result); + } + } else { + int displacement = SRegOffset(rl_result.s_reg_low); + LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg()); + AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */); + AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */); + } } void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) { - DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U); - OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16); - RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB); - int op_low = 0, op_high = 0; + DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); + OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); + RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR; + RegisterClass reg_type = kCoreReg; + switch (opsize) { case k32: op_low = kX86PshufdRRI; break; + case kSingle: + op_low = kX86PshufdRRI; + op_mov = kX86Mova128RR; + reg_type = kFPReg; + break; + case k64: + op_low = kX86PshufdRRI; + imm = 0x44; + break; + case kDouble: + op_low = kX86PshufdRRI; + op_mov = kX86Mova128RR; + reg_type = kFPReg; + imm = 0x44; + break; + case kSignedByte: + case kUnsignedByte: + // Shuffle 8 bit value into 16 bit word. + // We set val = val + (val << 8) below and use 16 bit shuffle. 
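The byte path of GenSetVector widens the value before shuffling, as the comment above notes; modeled in scalar form:

    #include <cstdint>

    // Scalar model of the byte broadcast in GenSetVector: duplicate the byte
    // into both halves of a 16-bit word (val = val + (val << 8)), after which
    // the 16-bit shuffles (pshuflw then pshufd) replicate that word 8 times
    // across the 128-bit register.
    static uint16_t DupByteIntoWord(uint8_t v) {
      const uint32_t val = static_cast<uint32_t>(v);
      return static_cast<uint16_t>(val | (val << 8));
    }
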
case kSignedHalf: case kUnsignedHalf: // Handles low quadword. @@ -1822,23 +2120,37 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) { break; } - // Load the value from the VR into a GPR. RegLocation rl_src = mir_graph_->GetSrc(mir, 0); - rl_src = LoadValue(rl_src, kCoreReg); + + // Load the value from the VR into the reg. + if (rl_src.wide == 0) { + rl_src = LoadValue(rl_src, reg_type); + } else { + rl_src = LoadValueWide(rl_src, reg_type); + } + + // If opsize is 8 bits wide then double value and use 16 bit shuffle instead. + if (opsize == kSignedByte || opsize == kUnsignedByte) { + RegStorage temp = AllocTemp(); + // val = val + (val << 8). + NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg()); + NewLIR2(kX86Sal32RI, temp.GetReg(), 8); + NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg()); + FreeTemp(temp); + } // Load the value into the XMM register. - NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg()); + NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg()); // Now shuffle the value across the destination. - NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0); + NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm); // And then repeat as needed. if (op_high != 0) { - NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0); + NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm); } } - LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) { int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg); for (LIR *p = const_vectors_; p != nullptr; p = p->next) { @@ -1867,7 +2179,7 @@ LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) { } // ------------ ABI support: mapping of args to physical registers ------------- -RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide) { +RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_float, bool is_wide, bool is_ref) { const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3, kArg4, kArg5}; const int coreArgMappingToPhysicalRegSize = sizeof(coreArgMappingToPhysicalReg) / sizeof(SpecialTargetRegister); const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3, @@ -1880,7 +2192,8 @@ RegStorage X86Mir2Lir::InToRegStorageX86_64Mapper::GetNextReg(bool is_double_or_ } } else { if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) { - return ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide); + return is_ref ? 
ml_->TargetRefReg(coreArgMappingToPhysicalReg[cur_core_reg_++]) : + ml_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], is_wide); } } return RegStorage::InvalidReg(); @@ -1897,11 +2210,12 @@ void X86Mir2Lir::InToRegStorageMapping::Initialize(RegLocation* arg_locs, int co max_mapped_in_ = -1; is_there_stack_mapped_ = false; for (int in_position = 0; in_position < count; in_position++) { - RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp, arg_locs[in_position].wide); + RegStorage reg = mapper->GetNextReg(arg_locs[in_position].fp, + arg_locs[in_position].wide, arg_locs[in_position].ref); if (reg.Valid()) { mapping_[in_position] = reg; max_mapped_in_ = std::max(max_mapped_in_, in_position); - if (reg.Is64BitSolo()) { + if (arg_locs[in_position].wide) { // We covered 2 args, so skip the next one in_position++; } diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h index e271e9d100..2789923bb9 100644 --- a/compiler/dex/quick/x86/x86_lir.h +++ b/compiler/dex/quick/x86/x86_lir.h @@ -569,6 +569,9 @@ enum X86OpCode { kX86PextrbRRI, // Extract 8 bits from XMM into GPR kX86PextrwRRI, // Extract 16 bits from XMM into GPR kX86PextrdRRI, // Extract 32 bits from XMM into GPR + kX86PextrbMRI, // Extract 8 bits from XMM into memory + kX86PextrwMRI, // Extract 16 bits from XMM into memory + kX86PextrdMRI, // Extract 32 bits from XMM into memory kX86PshuflwRRI, // Shuffle 16 bits in lower 64 bits of XMM. kX86PshufdRRI, // Shuffle 32 bits in XMM. kX86ShufpsRRI, // FP Shuffle 32 bits in XMM. @@ -723,7 +726,7 @@ struct X86EncodingMap { #define REX_X 0x42 // Extension of the ModR/M r/m field, SIB base field, or Opcode reg field #define REX_B 0x41 -// Extended register set +// An empty REX prefix used to normalize the byte operations so that they apply to R4 through R15 #define REX 0x40 // Mask extracting the least 3 bits of r0..r15 #define kRegNumMask32 0x07 diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index 770ae89ca2..9bf51359cf 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -1466,8 +1466,12 @@ static void CheckAndClearResolveException(Thread* self) CHECK(self->IsExceptionPending()); mirror::Throwable* exception = self->GetException(nullptr); std::string descriptor = exception->GetClass()->GetDescriptor(); - if (descriptor != "Ljava/lang/IncompatibleClassChangeError;" && - descriptor != "Ljava/lang/NoClassDefFoundError;") { + if (descriptor != "Ljava/lang/IllegalAccessError;" && + descriptor != "Ljava/lang/IncompatibleClassChangeError;" && + descriptor != "Ljava/lang/InstantiationError;" && + descriptor != "Ljava/lang/NoClassDefFoundError;" && + descriptor != "Ljava/lang/NoSuchFieldError;" && + descriptor != "Ljava/lang/NoSuchMethodError;") { LOG(FATAL) << "Unexpected exeption " << exception->Dump(); } self->ClearException(); diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index e6cbf05744..a6f9a8a22b 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -201,7 +201,8 @@ size_t DisassemblerX86::DumpInstruction(std::ostream& os, const uint8_t* instr) std::ostringstream opcode; bool store = false; // stores to memory (ie rm is on the left) bool load = false; // loads from memory (ie rm is on the right) - bool byte_operand = false; + bool byte_operand = false; // true when the opcode is dealing with byte operands + bool byte_second_operand = false; // true when the source operand is a byte register but the target 
register isn't (ie movsxb/movzxb). bool target_specific = false; // register name depends on target (64 vs 32 bits). bool ax = false; // implicit use of ax bool cx = false; // implicit use of cx @@ -426,6 +427,20 @@ DISASSEMBLER_ENTRY(cmp, instr++; if (prefix[2] == 0x66) { switch (*instr) { + case 0x01: + opcode << "phaddw"; + prefix[2] = 0; + has_modrm = true; + load = true; + src_reg_file = dst_reg_file = SSE; + break; + case 0x02: + opcode << "phaddd"; + prefix[2] = 0; + has_modrm = true; + load = true; + src_reg_file = dst_reg_file = SSE; + break; case 0x40: opcode << "pmulld"; prefix[2] = 0; @@ -449,7 +464,7 @@ DISASSEMBLER_ENTRY(cmp, prefix[2] = 0; has_modrm = true; store = true; - dst_reg_file = SSE; + src_reg_file = SSE; immediate_bytes = 1; break; case 0x16: @@ -457,7 +472,7 @@ DISASSEMBLER_ENTRY(cmp, prefix[2] = 0; has_modrm = true; store = true; - dst_reg_file = SSE; + src_reg_file = SSE; immediate_bytes = 1; break; default: @@ -732,9 +747,9 @@ DISASSEMBLER_ENTRY(cmp, break; case 0xAF: opcode << "imul"; has_modrm = true; load = true; break; case 0xB1: opcode << "cmpxchg"; has_modrm = true; store = true; break; - case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; break; + case 0xB6: opcode << "movzxb"; has_modrm = true; load = true; byte_second_operand = true; break; case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break; - case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break; + case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; byte_second_operand = true; rex |= (rex == 0 ? 0 : 0b1000); break; case 0xBF: opcode << "movsxw"; has_modrm = true; load = true; break; case 0xC5: if (prefix[2] == 0x66) { @@ -742,7 +757,7 @@ DISASSEMBLER_ENTRY(cmp, prefix[2] = 0; has_modrm = true; store = true; - src_reg_file = dst_reg_file = SSE; + src_reg_file = SSE; immediate_bytes = 1; } else { opcode << StringPrintf("unknown opcode '0F %02X'", *instr); @@ -1124,7 +1139,8 @@ DISASSEMBLER_ENTRY(cmp, } else { if (mod == 3) { if (!no_ops) { - DumpRmReg(address, rex_w, rm, byte_operand, prefix[2], load ? src_reg_file : dst_reg_file); + DumpRmReg(address, rex_w, rm, byte_operand || byte_second_operand, + prefix[2], load ? src_reg_file : dst_reg_file); } } else { address << "["; diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index 60453c3cc1..5180e34313 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -581,10 +581,6 @@ bool ClassLinker::GenerateOatFile(const char* dex_filename, std::vector<std::string> argv; argv.push_back(dex2oat); argv.push_back("--runtime-arg"); - argv.push_back("-Xms64m"); - argv.push_back("--runtime-arg"); - argv.push_back("-Xmx64m"); - argv.push_back("--runtime-arg"); argv.push_back("-classpath"); argv.push_back("--runtime-arg"); argv.push_back(Runtime::Current()->GetClassPathString()); diff --git a/runtime/debugger.cc b/runtime/debugger.cc index 11415b49e0..6161aff647 100644 --- a/runtime/debugger.cc +++ b/runtime/debugger.cc @@ -170,27 +170,47 @@ class AllocRecord { AllocRecordStackTraceElement stack_[kMaxAllocRecordStackDepth]; // Unused entries have NULL method. }; -struct Breakpoint { - // The location of this breakpoint. 
- mirror::ArtMethod* method; - uint32_t dex_pc; +class Breakpoint { + public: + Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc, bool need_full_deoptimization) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + : method_(nullptr), dex_pc_(dex_pc), need_full_deoptimization_(need_full_deoptimization) { + ScopedObjectAccessUnchecked soa(Thread::Current()); + method_ = soa.EncodeMethod(method); + } - // Indicates whether breakpoint needs full deoptimization or selective deoptimization. - bool need_full_deoptimization; + Breakpoint(const Breakpoint& other) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + : method_(nullptr), dex_pc_(other.dex_pc_), + need_full_deoptimization_(other.need_full_deoptimization_) { + ScopedObjectAccessUnchecked soa(Thread::Current()); + method_ = soa.EncodeMethod(other.Method()); + } - Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc, bool need_full_deoptimization) - : method(method), dex_pc(dex_pc), need_full_deoptimization(need_full_deoptimization) {} + mirror::ArtMethod* Method() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + ScopedObjectAccessUnchecked soa(Thread::Current()); + return soa.DecodeMethod(method_); + } - void VisitRoots(RootCallback* callback, void* arg) { - if (method != nullptr) { - callback(reinterpret_cast<mirror::Object**>(&method), arg, 0, kRootDebugger); - } + uint32_t DexPc() const { + return dex_pc_; + } + + bool NeedFullDeoptimization() const { + return need_full_deoptimization_; } + + private: + // The location of this breakpoint. + jmethodID method_; + uint32_t dex_pc_; + + // Indicates whether breakpoint needs full deoptimization or selective deoptimization. + bool need_full_deoptimization_; }; -static std::ostream& operator<<(std::ostream& os, const Breakpoint& rhs) +static std::ostream& operator<<(std::ostream& os, Breakpoint& rhs) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - os << StringPrintf("Breakpoint[%s @%#x]", PrettyMethod(rhs.method).c_str(), rhs.dex_pc); + os << StringPrintf("Breakpoint[%s @%#x]", PrettyMethod(rhs.Method()).c_str(), rhs.DexPc()); return os; } @@ -349,18 +369,12 @@ void SingleStepControl::Clear() { dex_pcs.clear(); } -void DeoptimizationRequest::VisitRoots(RootCallback* callback, void* arg) { - if (method != nullptr) { - callback(reinterpret_cast<mirror::Object**>(&method), arg, 0, kRootDebugger); - } -} - static bool IsBreakpoint(const mirror::ArtMethod* m, uint32_t dex_pc) LOCKS_EXCLUDED(Locks::breakpoint_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_); for (size_t i = 0, e = gBreakpoints.size(); i < e; ++i) { - if (gBreakpoints[i].method == m && gBreakpoints[i].dex_pc == dex_pc) { + if (gBreakpoints[i].DexPc() == dex_pc && gBreakpoints[i].Method() == m) { VLOG(jdwp) << "Hit breakpoint #" << i << ": " << gBreakpoints[i]; return true; } @@ -641,21 +655,6 @@ void Dbg::StartJdwp() { } } -void Dbg::VisitRoots(RootCallback* callback, void* arg) { - { - MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_); - for (Breakpoint& bp : gBreakpoints) { - bp.VisitRoots(callback, arg); - } - } - if (deoptimization_lock_ != nullptr) { // only true if the debugger is started. - MutexLock mu(Thread::Current(), *deoptimization_lock_); - for (DeoptimizationRequest& req : deoptimization_requests_) { - req.VisitRoots(callback, arg); - } - } -} - void Dbg::StopJdwp() { // Prevent the JDWP thread from processing JDWP incoming packets after we close the connection. 
Disposed(); @@ -2844,22 +2843,22 @@ size_t* Dbg::GetReferenceCounterForEvent(uint32_t instrumentation_event) { // Process request while all mutator threads are suspended. void Dbg::ProcessDeoptimizationRequest(const DeoptimizationRequest& request) { instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation(); - switch (request.kind) { + switch (request.GetKind()) { case DeoptimizationRequest::kNothing: LOG(WARNING) << "Ignoring empty deoptimization request."; break; case DeoptimizationRequest::kRegisterForEvent: VLOG(jdwp) << StringPrintf("Add debugger as listener for instrumentation event 0x%x", - request.instrumentation_event); - instrumentation->AddListener(&gDebugInstrumentationListener, request.instrumentation_event); - instrumentation_events_ |= request.instrumentation_event; + request.InstrumentationEvent()); + instrumentation->AddListener(&gDebugInstrumentationListener, request.InstrumentationEvent()); + instrumentation_events_ |= request.InstrumentationEvent(); break; case DeoptimizationRequest::kUnregisterForEvent: VLOG(jdwp) << StringPrintf("Remove debugger as listener for instrumentation event 0x%x", - request.instrumentation_event); + request.InstrumentationEvent()); instrumentation->RemoveListener(&gDebugInstrumentationListener, - request.instrumentation_event); - instrumentation_events_ &= ~request.instrumentation_event; + request.InstrumentationEvent()); + instrumentation_events_ &= ~request.InstrumentationEvent(); break; case DeoptimizationRequest::kFullDeoptimization: VLOG(jdwp) << "Deoptimize the world ..."; @@ -2872,17 +2871,17 @@ void Dbg::ProcessDeoptimizationRequest(const DeoptimizationRequest& request) { VLOG(jdwp) << "Undeoptimize the world DONE"; break; case DeoptimizationRequest::kSelectiveDeoptimization: - VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " ..."; - instrumentation->Deoptimize(request.method); - VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.method) << " DONE"; + VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.Method()) << " ..."; + instrumentation->Deoptimize(request.Method()); + VLOG(jdwp) << "Deoptimize method " << PrettyMethod(request.Method()) << " DONE"; break; case DeoptimizationRequest::kSelectiveUndeoptimization: - VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " ..."; - instrumentation->Undeoptimize(request.method); - VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.method) << " DONE"; + VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.Method()) << " ..."; + instrumentation->Undeoptimize(request.Method()); + VLOG(jdwp) << "Undeoptimize method " << PrettyMethod(request.Method()) << " DONE"; break; default: - LOG(FATAL) << "Unsupported deoptimization request kind " << request.kind; + LOG(FATAL) << "Unsupported deoptimization request kind " << request.GetKind(); break; } } @@ -2899,8 +2898,8 @@ void Dbg::ProcessDelayedFullUndeoptimizations() { MutexLock mu(Thread::Current(), *deoptimization_lock_); while (delayed_full_undeoptimization_count_ > 0) { DeoptimizationRequest req; - req.kind = DeoptimizationRequest::kFullUndeoptimization; - req.method = nullptr; + req.SetKind(DeoptimizationRequest::kFullUndeoptimization); + req.SetMethod(nullptr); RequestDeoptimizationLocked(req); --delayed_full_undeoptimization_count_; } @@ -2909,7 +2908,7 @@ void Dbg::ProcessDelayedFullUndeoptimizations() { } void Dbg::RequestDeoptimization(const DeoptimizationRequest& req) { - if (req.kind == DeoptimizationRequest::kNothing) { + if 
(req.GetKind() == DeoptimizationRequest::kNothing) { // Nothing to do. return; } @@ -2918,35 +2917,35 @@ void Dbg::RequestDeoptimization(const DeoptimizationRequest& req) { } void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) { - switch (req.kind) { + switch (req.GetKind()) { case DeoptimizationRequest::kRegisterForEvent: { - DCHECK_NE(req.instrumentation_event, 0u); - size_t* counter = GetReferenceCounterForEvent(req.instrumentation_event); + DCHECK_NE(req.InstrumentationEvent(), 0u); + size_t* counter = GetReferenceCounterForEvent(req.InstrumentationEvent()); CHECK(counter != nullptr) << StringPrintf("No counter for instrumentation event 0x%x", - req.instrumentation_event); + req.InstrumentationEvent()); if (*counter == 0) { VLOG(jdwp) << StringPrintf("Queue request #%zd to start listening to instrumentation event 0x%x", - deoptimization_requests_.size(), req.instrumentation_event); + deoptimization_requests_.size(), req.InstrumentationEvent()); deoptimization_requests_.push_back(req); } *counter = *counter + 1; break; } case DeoptimizationRequest::kUnregisterForEvent: { - DCHECK_NE(req.instrumentation_event, 0u); - size_t* counter = GetReferenceCounterForEvent(req.instrumentation_event); + DCHECK_NE(req.InstrumentationEvent(), 0u); + size_t* counter = GetReferenceCounterForEvent(req.InstrumentationEvent()); CHECK(counter != nullptr) << StringPrintf("No counter for instrumentation event 0x%x", - req.instrumentation_event); + req.InstrumentationEvent()); *counter = *counter - 1; if (*counter == 0) { VLOG(jdwp) << StringPrintf("Queue request #%zd to stop listening to instrumentation event 0x%x", - deoptimization_requests_.size(), req.instrumentation_event); + deoptimization_requests_.size(), req.InstrumentationEvent()); deoptimization_requests_.push_back(req); } break; } case DeoptimizationRequest::kFullDeoptimization: { - DCHECK(req.method == nullptr); + DCHECK(req.Method() == nullptr); if (full_deoptimization_event_count_ == 0) { VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size() << " for full deoptimization"; @@ -2956,7 +2955,7 @@ void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) { break; } case DeoptimizationRequest::kFullUndeoptimization: { - DCHECK(req.method == nullptr); + DCHECK(req.Method() == nullptr); DCHECK_GT(full_deoptimization_event_count_, 0U); --full_deoptimization_event_count_; if (full_deoptimization_event_count_ == 0) { @@ -2967,21 +2966,21 @@ void Dbg::RequestDeoptimizationLocked(const DeoptimizationRequest& req) { break; } case DeoptimizationRequest::kSelectiveDeoptimization: { - DCHECK(req.method != nullptr); + DCHECK(req.Method() != nullptr); VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size() - << " for deoptimization of " << PrettyMethod(req.method); + << " for deoptimization of " << PrettyMethod(req.Method()); deoptimization_requests_.push_back(req); break; } case DeoptimizationRequest::kSelectiveUndeoptimization: { - DCHECK(req.method != nullptr); + DCHECK(req.Method() != nullptr); VLOG(jdwp) << "Queue request #" << deoptimization_requests_.size() - << " for undeoptimization of " << PrettyMethod(req.method); + << " for undeoptimization of " << PrettyMethod(req.Method()); deoptimization_requests_.push_back(req); break; } default: { - LOG(FATAL) << "Unknown deoptimization request kind " << req.kind; + LOG(FATAL) << "Unknown deoptimization request kind " << req.GetKind(); break; } } @@ -3005,7 +3004,7 @@ void Dbg::ManageDeoptimization() { { MutexLock mu(self, *deoptimization_lock_); size_t 
req_index = 0; - for (const DeoptimizationRequest& request : deoptimization_requests_) { + for (DeoptimizationRequest& request : deoptimization_requests_) { VLOG(jdwp) << "Process deoptimization request #" << req_index++; ProcessDeoptimizationRequest(request); } @@ -3036,9 +3035,9 @@ static bool IsMethodPossiblyInlined(Thread* self, mirror::ArtMethod* m) } static const Breakpoint* FindFirstBreakpointForMethod(mirror::ArtMethod* m) - EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_) { - for (const Breakpoint& breakpoint : gBreakpoints) { - if (breakpoint.method == m) { + EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + for (Breakpoint& breakpoint : gBreakpoints) { + if (breakpoint.Method() == m) { return &breakpoint; } } @@ -3050,7 +3049,7 @@ static void SanityCheckExistingBreakpoints(mirror::ArtMethod* m, bool need_full_ EXCLUSIVE_LOCKS_REQUIRED(Locks::breakpoint_lock_) { if (kIsDebugBuild) { for (const Breakpoint& breakpoint : gBreakpoints) { - CHECK_EQ(need_full_deoptimization, breakpoint.need_full_deoptimization); + CHECK_EQ(need_full_deoptimization, breakpoint.NeedFullDeoptimization()); } if (need_full_deoptimization) { // We should have deoptimized everything but not "selectively" deoptimized this method. @@ -3080,18 +3079,18 @@ void Dbg::WatchLocation(const JDWP::JdwpLocation* location, DeoptimizationReques // inlined, we deoptimize everything; otherwise we deoptimize only this method. need_full_deoptimization = IsMethodPossiblyInlined(self, m); if (need_full_deoptimization) { - req->kind = DeoptimizationRequest::kFullDeoptimization; - req->method = nullptr; + req->SetKind(DeoptimizationRequest::kFullDeoptimization); + req->SetMethod(nullptr); } else { - req->kind = DeoptimizationRequest::kSelectiveDeoptimization; - req->method = m; + req->SetKind(DeoptimizationRequest::kSelectiveDeoptimization); + req->SetMethod(m); } } else { // There is at least one breakpoint for this method: we don't need to deoptimize. - req->kind = DeoptimizationRequest::kNothing; - req->method = nullptr; + req->SetKind(DeoptimizationRequest::kNothing); + req->SetMethod(nullptr); - need_full_deoptimization = existing_breakpoint->need_full_deoptimization; + need_full_deoptimization = existing_breakpoint->NeedFullDeoptimization(); SanityCheckExistingBreakpoints(m, need_full_deoptimization); } @@ -3103,15 +3102,14 @@ void Dbg::WatchLocation(const JDWP::JdwpLocation* location, DeoptimizationReques // Uninstalls a breakpoint at the specified location. Also indicates through the deoptimization // request if we need to undeoptimize. 
void Dbg::UnwatchLocation(const JDWP::JdwpLocation* location, DeoptimizationRequest* req) { + MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_); mirror::ArtMethod* m = FromMethodId(location->method_id); DCHECK(m != nullptr) << "No method for method id " << location->method_id; - - MutexLock mu(Thread::Current(), *Locks::breakpoint_lock_); bool need_full_deoptimization = false; for (size_t i = 0, e = gBreakpoints.size(); i < e; ++i) { - if (gBreakpoints[i].method == m && gBreakpoints[i].dex_pc == location->dex_pc) { + if (gBreakpoints[i].DexPc() == location->dex_pc && gBreakpoints[i].Method() == m) { VLOG(jdwp) << "Removed breakpoint #" << i << ": " << gBreakpoints[i]; - need_full_deoptimization = gBreakpoints[i].need_full_deoptimization; + need_full_deoptimization = gBreakpoints[i].NeedFullDeoptimization(); DCHECK_NE(need_full_deoptimization, Runtime::Current()->GetInstrumentation()->IsDeoptimized(m)); gBreakpoints.erase(gBreakpoints.begin() + i); break; @@ -3122,17 +3120,17 @@ void Dbg::UnwatchLocation(const JDWP::JdwpLocation* location, DeoptimizationRequ // There is no more breakpoint on this method: we need to undeoptimize. if (need_full_deoptimization) { // This method required full deoptimization: we need to undeoptimize everything. - req->kind = DeoptimizationRequest::kFullUndeoptimization; - req->method = nullptr; + req->SetKind(DeoptimizationRequest::kFullUndeoptimization); + req->SetMethod(nullptr); } else { // This method required selective deoptimization: we need to undeoptimize only that method. - req->kind = DeoptimizationRequest::kSelectiveUndeoptimization; - req->method = m; + req->SetKind(DeoptimizationRequest::kSelectiveUndeoptimization); + req->SetMethod(m); } } else { // There is at least one breakpoint for this method: we don't need to undeoptimize. - req->kind = DeoptimizationRequest::kNothing; - req->method = nullptr; + req->SetKind(DeoptimizationRequest::kNothing); + req->SetMethod(nullptr); SanityCheckExistingBreakpoints(m, need_full_deoptimization); } } @@ -4590,4 +4588,14 @@ jbyteArray Dbg::GetRecentAllocations() { return result; } +mirror::ArtMethod* DeoptimizationRequest::Method() const { + ScopedObjectAccessUnchecked soa(Thread::Current()); + return soa.DecodeMethod(method_); +} + +void DeoptimizationRequest::SetMethod(mirror::ArtMethod* m) { + ScopedObjectAccessUnchecked soa(Thread::Current()); + method_ = soa.EncodeMethod(m); +} + } // namespace art diff --git a/runtime/debugger.h b/runtime/debugger.h index 2589638461..1d3668c1f6 100644 --- a/runtime/debugger.h +++ b/runtime/debugger.h @@ -131,7 +131,8 @@ struct SingleStepControl { }; // TODO rename to InstrumentationRequest. -struct DeoptimizationRequest { +class DeoptimizationRequest { + public: enum Kind { kNothing, // no action. kRegisterForEvent, // start listening for instrumentation event. @@ -142,21 +143,48 @@ struct DeoptimizationRequest { kSelectiveUndeoptimization // undeoptimize one method. }; - DeoptimizationRequest() : kind(kNothing), instrumentation_event(0), method(nullptr) {} + DeoptimizationRequest() : kind_(kNothing), instrumentation_event_(0), method_(nullptr) {} + + DeoptimizationRequest(const DeoptimizationRequest& other) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) + : kind_(other.kind_), instrumentation_event_(other.instrumentation_event_) { + // Create a new JNI global reference for the method. 
+ SetMethod(other.Method()); + } + + mirror::ArtMethod* Method() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + + void SetMethod(mirror::ArtMethod* m) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void VisitRoots(RootCallback* callback, void* arg); + // Name 'Kind()' would collide with the above enum name. + Kind GetKind() const { + return kind_; + } - Kind kind; + void SetKind(Kind kind) { + kind_ = kind; + } + + uint32_t InstrumentationEvent() const { + return instrumentation_event_; + } + + void SetInstrumentationEvent(uint32_t instrumentation_event) { + instrumentation_event_ = instrumentation_event; + } + + private: + Kind kind_; // TODO we could use a union to hold the instrumentation_event and the method since they // are respectively only meaningful for kRegisterForEvent/kUnregisterForEvent and // kSelectiveDeoptimization/kSelectiveUndeoptimization. // Event to start or stop listening to. Only for kRegisterForEvent and kUnregisterForEvent. - uint32_t instrumentation_event; + uint32_t instrumentation_event_; // Method for selective deoptimization. - mirror::ArtMethod* method; + jmethodID method_; }; class Dbg { diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc index 61633cd489..d534bcb74d 100644 --- a/runtime/gc/space/image_space.cc +++ b/runtime/gc/space/image_space.cc @@ -61,13 +61,6 @@ static bool GenerateImage(const std::string& image_filename, std::string* error_ image_option_string += image_filename; arg_vector.push_back(image_option_string); - arg_vector.push_back("--runtime-arg"); - arg_vector.push_back("-Xms64m"); - - arg_vector.push_back("--runtime-arg"); - arg_vector.push_back("-Xmx64m"); - - for (size_t i = 0; i < boot_class_path.size(); i++) { arg_vector.push_back(std::string("--dex-file=") + boot_class_path[i]); } diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc index 86c84e8b0f..36fbed4ea2 100644 --- a/runtime/jdwp/jdwp_event.cc +++ b/runtime/jdwp/jdwp_event.cc @@ -192,17 +192,17 @@ JdwpError JdwpState::RegisterEvent(JdwpEvent* pEvent) { } } if (NeedsFullDeoptimization(pEvent->eventKind)) { - CHECK_EQ(req.kind, DeoptimizationRequest::kNothing); - CHECK(req.method == nullptr); - req.kind = DeoptimizationRequest::kFullDeoptimization; + CHECK_EQ(req.GetKind(), DeoptimizationRequest::kNothing); + CHECK(req.Method() == nullptr); + req.SetKind(DeoptimizationRequest::kFullDeoptimization); } Dbg::RequestDeoptimization(req); } uint32_t instrumentation_event = GetInstrumentationEventFor(pEvent->eventKind); if (instrumentation_event != 0) { DeoptimizationRequest req; - req.kind = DeoptimizationRequest::kRegisterForEvent; - req.instrumentation_event = instrumentation_event; + req.SetKind(DeoptimizationRequest::kRegisterForEvent); + req.SetInstrumentationEvent(instrumentation_event); Dbg::RequestDeoptimization(req); } @@ -274,17 +274,17 @@ void JdwpState::UnregisterEvent(JdwpEvent* pEvent) { // deoptimization and only the last single-step will trigger a full undeoptimization.
Dbg::DelayFullUndeoptimization(); } else if (NeedsFullDeoptimization(pEvent->eventKind)) { - CHECK_EQ(req.kind, DeoptimizationRequest::kNothing); - CHECK(req.method == nullptr); - req.kind = DeoptimizationRequest::kFullUndeoptimization; + CHECK_EQ(req.GetKind(), DeoptimizationRequest::kNothing); + CHECK(req.Method() == nullptr); + req.SetKind(DeoptimizationRequest::kFullUndeoptimization); } Dbg::RequestDeoptimization(req); } uint32_t instrumentation_event = GetInstrumentationEventFor(pEvent->eventKind); if (instrumentation_event != 0) { DeoptimizationRequest req; - req.kind = DeoptimizationRequest::kUnregisterForEvent; - req.instrumentation_event = instrumentation_event; + req.SetKind(DeoptimizationRequest::kUnregisterForEvent); + req.SetInstrumentationEvent(instrumentation_event); Dbg::RequestDeoptimization(req); } diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 999a9e504b..eb62a694e0 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -401,8 +401,8 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns, // Make sure that we hold the lock. if (owner_ != self) { - ThrowIllegalMonitorStateExceptionF("object not locked by thread before wait()"); monitor_lock_.Unlock(self); + ThrowIllegalMonitorStateExceptionF("object not locked by thread before wait()"); return; } @@ -414,10 +414,10 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns, // Enforce the timeout range. if (ms < 0 || ns < 0 || ns > 999999) { + monitor_lock_.Unlock(self); ThrowLocation throw_location = self->GetCurrentLocationForThrow(); self->ThrowNewExceptionF(throw_location, "Ljava/lang/IllegalArgumentException;", "timeout arguments out of range: ms=%" PRId64 " ns=%d", ms, ns); - monitor_lock_.Unlock(self); return; } @@ -512,6 +512,8 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns, --num_waiters_; RemoveFromWaitSet(self); + monitor_lock_.Unlock(self); + if (was_interrupted) { /* * We were interrupted while waiting, or somebody interrupted an @@ -529,7 +531,6 @@ void Monitor::Wait(Thread* self, int64_t ms, int32_t ns, self->ThrowNewException(throw_location, "Ljava/lang/InterruptedException;", NULL); } } - monitor_lock_.Unlock(self); } void Monitor::Notify(Thread* self) { diff --git a/runtime/monitor_test.cc b/runtime/monitor_test.cc new file mode 100644 index 0000000000..bdba494e14 --- /dev/null +++ b/runtime/monitor_test.cc @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "barrier.h" +#include "monitor.h" + +#include <string> + +#include "atomic.h" +#include "common_runtime_test.h" +#include "handle_scope-inl.h" +#include "mirror/class-inl.h" +#include "mirror/string-inl.h" // Strings are easiest to allocate +#include "thread_pool.h" +#include "utils.h" + +namespace art { + +class MonitorTest : public CommonRuntimeTest { + protected: + void SetUpRuntimeOptions(Runtime::Options *options) OVERRIDE { + // Use a smaller heap + for (std::pair<std::string, const void*>& pair : *options) { + if (pair.first.find("-Xmx") == 0) { + pair.first = "-Xmx4M"; // Smallest we can go. + } + } + options->push_back(std::make_pair("-Xint", nullptr)); + } + public: + std::unique_ptr<Monitor> monitor_; + Handle<mirror::String> object_; + Handle<mirror::String> second_object_; + Handle<mirror::String> watchdog_object_; + // One exception test is for waiting on another Thread's lock. This field is used to pass + // that thread in a race-free and loop-free way. + Thread* thread_; + std::unique_ptr<Barrier> barrier_; + std::unique_ptr<Barrier> complete_barrier_; + bool completed_; +}; + +// Fill the heap. +static const size_t kMaxHandles = 1000000; // Use arbitrary large amount for now. +static void FillHeap(Thread* self, ClassLinker* class_linker, + std::unique_ptr<StackHandleScope<kMaxHandles>>* hsp, + std::vector<Handle<mirror::Object>>* handles) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + Runtime::Current()->GetHeap()->SetIdealFootprint(1 * GB); + + hsp->reset(new StackHandleScope<kMaxHandles>(self)); + // Class java.lang.Object. + Handle<mirror::Class> c((*hsp)->NewHandle(class_linker->FindSystemClass(self, + "Ljava/lang/Object;"))); + // Array helps to fill memory faster. + Handle<mirror::Class> ca((*hsp)->NewHandle(class_linker->FindSystemClass(self, + "[Ljava/lang/Object;"))); + + // Start allocating with 128K + size_t length = 128 * KB / 4; + while (length > 10) { + Handle<mirror::Object> h((*hsp)->NewHandle<mirror::Object>( + mirror::ObjectArray<mirror::Object>::Alloc(self, ca.Get(), length / 4))); + if (self->IsExceptionPending() || h.Get() == nullptr) { + self->ClearException(); + + // Try a smaller length + length = length / 8; + // Use at most half the reported free space. + size_t mem = Runtime::Current()->GetHeap()->GetFreeMemory(); + if (length * 8 > mem) { + length = mem / 8; + } + } else { + handles->push_back(h); + } + } + + // Allocate simple objects till it fails. + while (!self->IsExceptionPending()) { + Handle<mirror::Object> h = (*hsp)->NewHandle<mirror::Object>(c->AllocObject(self)); + if (!self->IsExceptionPending() && h.Get() != nullptr) { + handles->push_back(h); + } + } + self->ClearException(); +} + +// Check that an exception can be thrown correctly. +// This test is potentially racy, but the timeout is long enough that it should work. + +class CreateTask : public Task { + public: + explicit CreateTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis, + bool expected) : + monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis), + expected_(expected) {} + + void Run(Thread* self) { + { + ScopedObjectAccess soa(self); + + monitor_test_->thread_ = self; // Pass the Thread. + monitor_test_->object_.Get()->MonitorEnter(self); // Lock the object. This should transition + LockWord lock_after = monitor_test_->object_.Get()->GetLockWord(false); // it to thinLocked. + LockWord::LockState new_state = lock_after.GetState(); + + // Cannot use ASSERT only, as analysis thinks we'll keep holding the mutex.
+ if (LockWord::LockState::kThinLocked != new_state) { + monitor_test_->object_.Get()->MonitorExit(self); // To appease analysis. + ASSERT_EQ(LockWord::LockState::kThinLocked, new_state); // To fail the test. + return; + } + + // Force a fat lock by running identity hashcode to fill up lock word. + monitor_test_->object_.Get()->IdentityHashCode(); + LockWord lock_after2 = monitor_test_->object_.Get()->GetLockWord(false); + LockWord::LockState new_state2 = lock_after2.GetState(); + + // Cannot use ASSERT only, as analysis thinks we'll keep holding the mutex. + if (LockWord::LockState::kFatLocked != new_state2) { + monitor_test_->object_.Get()->MonitorExit(self); // To appease analysis. + ASSERT_EQ(LockWord::LockState::kFatLocked, new_state2); // To fail the test. + return; + } + } // Need to drop the mutator lock to use the barrier. + + monitor_test_->barrier_->Wait(self); // Let the other thread know we're done. + + { + ScopedObjectAccess soa(self); + + // Give the other task a chance to do its thing. + NanoSleep(initial_sleep_ * 1000 * 1000); + + // Now try to Wait on the Monitor. + Monitor::Wait(self, monitor_test_->object_.Get(), millis_, 0, true, + ThreadState::kTimedWaiting); + + // Check the exception status against what we expect. + EXPECT_EQ(expected_, self->IsExceptionPending()); + if (expected_) { + self->ClearException(); + } + } + + monitor_test_->complete_barrier_->Wait(self); // Wait for test completion. + + { + ScopedObjectAccess soa(self); + monitor_test_->object_.Get()->MonitorExit(self); // Release the object. Appeases analysis. + } + } + + void Finalize() { + delete this; + } + + private: + MonitorTest* monitor_test_; + uint64_t initial_sleep_; + int64_t millis_; + bool expected_; +}; + + +class UseTask : public Task { + public: + UseTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis, bool expected) : + monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis), + expected_(expected) {} + + void Run(Thread* self) { + monitor_test_->barrier_->Wait(self); // Wait for the other thread to set up the monitor. + + { + ScopedObjectAccess soa(self); + + // Give the other task a chance to do its thing. + NanoSleep(initial_sleep_ * 1000 * 1000); + + Monitor::Wait(self, monitor_test_->object_.Get(), millis_, 0, true, + ThreadState::kTimedWaiting); + + // Check the exception status against what we expect. + EXPECT_EQ(expected_, self->IsExceptionPending()); + if (expected_) { + self->ClearException(); + } + } + + monitor_test_->complete_barrier_->Wait(self); // Wait for test completion. + } + + void Finalize() { + delete this; + } + + private: + MonitorTest* monitor_test_; + uint64_t initial_sleep_; + int64_t millis_; + bool expected_; +}; + +class InterruptTask : public Task { + public: + InterruptTask(MonitorTest* monitor_test, uint64_t initial_sleep, uint64_t millis) : + monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis) {} + + void Run(Thread* self) { + monitor_test_->barrier_->Wait(self); // Wait for the other thread to set up the monitor. + + { + ScopedObjectAccess soa(self); + + // Give the other task a chance to do its thing. + NanoSleep(initial_sleep_ * 1000 * 1000); + + // Interrupt the other thread. + monitor_test_->thread_->Interrupt(self); + + // Give it some more time to get to the exception code. + NanoSleep(millis_ * 1000 * 1000); + + // Now try to Wait. 
+ Monitor::Wait(self, monitor_test_->object_.Get(), 10, 0, true, + ThreadState::kTimedWaiting); + + // No check here, as depending on scheduling we may or may not fail. + if (self->IsExceptionPending()) { + self->ClearException(); + } + } + + monitor_test_->complete_barrier_->Wait(self); // Wait for test completion. + } + + void Finalize() { + delete this; + } + + private: + MonitorTest* monitor_test_; + uint64_t initial_sleep_; + uint64_t millis_; +}; + +class WatchdogTask : public Task { + public: + explicit WatchdogTask(MonitorTest* monitor_test) : monitor_test_(monitor_test) {} + + void Run(Thread* self) { + ScopedObjectAccess soa(self); + + monitor_test_->watchdog_object_.Get()->MonitorEnter(self); // Lock the object. + + monitor_test_->watchdog_object_.Get()->Wait(self, 30 * 1000, 0); // Wait for 30s, or until + // woken up. + + monitor_test_->watchdog_object_.Get()->MonitorExit(self); // Release the lock. + + if (!monitor_test_->completed_) { + LOG(FATAL) << "Watchdog timeout!"; + } + } + + void Finalize() { + delete this; + } + + private: + MonitorTest* monitor_test_; +}; + +static void CommonWaitSetup(MonitorTest* test, ClassLinker* class_linker, uint64_t create_sleep, + int64_t c_millis, bool c_expected, bool interrupt, uint64_t use_sleep, + int64_t u_millis, bool u_expected, const char* pool_name) { + // First create the object we lock. String is easiest. + StackHandleScope<3> hs(Thread::Current()); + { + ScopedObjectAccess soa(Thread::Current()); + test->object_ = hs.NewHandle(mirror::String::AllocFromModifiedUtf8(Thread::Current(), + "hello, world!")); + test->watchdog_object_ = hs.NewHandle(mirror::String::AllocFromModifiedUtf8(Thread::Current(), + "hello, world!")); + } + + // Create the barrier used to synchronize. + test->barrier_ = std::unique_ptr<Barrier>(new Barrier(2)); + test->complete_barrier_ = std::unique_ptr<Barrier>(new Barrier(3)); + test->completed_ = false; + + // Fill the heap. + std::unique_ptr<StackHandleScope<kMaxHandles>> hsp; + std::vector<Handle<mirror::Object>> handles; + { + Thread* self = Thread::Current(); + ScopedObjectAccess soa(self); + + // Our job: Fill the heap, then try Wait. + FillHeap(self, class_linker, &hsp, &handles); + + // Now release everything. + auto it = handles.begin(); + auto end = handles.end(); + + for ( ; it != end; ++it) { + it->Assign(nullptr); + } + } // Need to drop the mutator lock to allow barriers. + + Thread* self = Thread::Current(); + ThreadPool thread_pool(pool_name, 3); + thread_pool.AddTask(self, new CreateTask(test, create_sleep, c_millis, c_expected)); + if (interrupt) { + thread_pool.AddTask(self, new InterruptTask(test, use_sleep, static_cast<uint64_t>(u_millis))); + } else { + thread_pool.AddTask(self, new UseTask(test, use_sleep, u_millis, u_expected)); + } + thread_pool.AddTask(self, new WatchdogTask(test)); + thread_pool.StartWorkers(self); + + // Wait on completion barrier. + test->complete_barrier_->Wait(Thread::Current()); + test->completed_ = true; + + // Wake the watchdog. + { + Thread* self = Thread::Current(); + ScopedObjectAccess soa(self); + + test->watchdog_object_.Get()->MonitorEnter(self); // Lock the object. + test->watchdog_object_.Get()->NotifyAll(self); // Wake up waiting parties. + test->watchdog_object_.Get()->MonitorExit(self); // Release the lock. + } + + thread_pool.StopWorkers(self); +} + + +// First test: throwing an exception when trying to wait in Monitor with another thread. +TEST_F(MonitorTest, CheckExceptionsWait1) { + // Make the CreateTask wait 10ms, the UseTask wait 2ms.
+ // => The use task will get the lock first and get to the self == owner check. + CommonWaitSetup(this, class_linker_, 10, 50, false, false, 2, 50, true, + "Monitor test thread pool 1"); +} + +// Second test: throwing an exception for invalid wait time. +TEST_F(MonitorTest, CheckExceptionsWait2) { + // Make the CreateTask wait 0ms, the UseTask wait 10ms. + // => The create task will get the lock first and get to the ms >= 0 check. + CommonWaitSetup(this, class_linker_, 0, -1, true, false, 10, 50, true, + "Monitor test thread pool 2"); +} + +// Third test: throwing an interrupted-exception. +TEST_F(MonitorTest, CheckExceptionsWait3) { + // Make the CreateTask wait 0ms, then Wait for a long time. Make the InterruptTask wait 10ms, + // after which it will interrupt the create task and then wait another 10ms. + // => The create task will get to the interrupted-exception throw. + CommonWaitSetup(this, class_linker_, 0, 500, true, true, 10, 50, true, + "Monitor test thread pool 3"); +} + +} // namespace art diff --git a/runtime/runtime.cc b/runtime/runtime.cc index 53ddcca469..3b14aaa767 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -930,7 +930,6 @@ void Runtime::VisitConstantRoots(RootCallback* callback, void* arg) { void Runtime::VisitConcurrentRoots(RootCallback* callback, void* arg, VisitRootFlags flags) { intern_table_->VisitRoots(callback, arg, flags); class_linker_->VisitRoots(callback, arg, flags); - Dbg::VisitRoots(callback, arg); if ((flags & kVisitRootFlagNewRoots) == 0) { // Guaranteed to have no new roots in the constant roots. VisitConstantRoots(callback, arg); diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc index 89cfcdd1de..eabb993879 100644 --- a/runtime/verifier/method_verifier.cc +++ b/runtime/verifier/method_verifier.cc @@ -46,7 +46,7 @@ namespace art { namespace verifier { -static const bool gDebugVerify = false; +static constexpr bool gDebugVerify = false; // TODO: Add a constant to method_verifier to turn on verbose logging? void PcToRegisterLineTable::Init(RegisterTrackingMode mode, InstructionFlags* flags, @@ -1329,8 +1329,7 @@ bool MethodVerifier::CodeFlowVerifyMethod() { work_insn_idx_ = insn_idx; if (insn_flags_[insn_idx].IsBranchTarget()) { work_line_->CopyFromLine(reg_table_.GetLine(insn_idx)); - } else { -#ifndef NDEBUG + } else if (kIsDebugBuild) { /* * Sanity check: retrieve the stored register line (assuming * a full table) and make sure it actually matches. @@ -1346,7 +1345,6 @@ bool MethodVerifier::CodeFlowVerifyMethod() { << " expected=" << *register_line; } } -#endif } if (!CodeFlowVerifyInstruction(&start_guess)) { std::string prepend(PrettyMethod(dex_method_idx_, *dex_file_)); @@ -1958,14 +1956,24 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) { (Instruction::INSTANCE_OF == instance_of_inst->Opcode()) && (inst->VRegA_21t() == instance_of_inst->VRegA_22c()) && (instance_of_inst->VRegA_22c() != instance_of_inst->VRegB_22c())) { - // Check that the we are not attempting conversion to interface types, - // which is not done because of the multiple inheritance implications. - // Also don't change the type if it would result in an upcast. + // Check that the type of the instance-of is different from the register's type, as if they + // are the same there is no work to be done here. Check that the conversion is not to or + // from an unresolved type as type information is imprecise.
If the instance-of is to an + // interface then ignore the type information as interfaces can only be treated as Objects + // and we don't want to disallow field and other operations on the object. If the value + // being instance-of checked against is known null (zero) then allow the optimization as + // we didn't have type information. If the merge of the instance-of type with the original + // type is assignable to the original then allow optimization. This check is performed to + // ensure that subsequent merges don't lose type information - such as becoming an + // interface from a class that would lose information relevant to field checks. const RegType& orig_type = work_line_->GetRegisterType(instance_of_inst->VRegB_22c()); const RegType& cast_type = ResolveClassAndCheckAccess(instance_of_inst->VRegC_22c()); - if (!cast_type.IsUnresolvedTypes() && !orig_type.IsUnresolvedTypes() && - !cast_type.GetClass()->IsInterface() && !cast_type.IsAssignableFrom(orig_type)) { + if (!orig_type.Equals(cast_type) && + !cast_type.IsUnresolvedTypes() && !orig_type.IsUnresolvedTypes() && + !cast_type.GetClass()->IsInterface() && + (orig_type.IsZero() || + orig_type.IsStrictlyAssignableFrom(cast_type.Merge(orig_type, ®_types_)))) { RegisterLine* update_line = RegisterLine::Create(code_item_->registers_size_, this); if (inst->Opcode() == Instruction::IF_EQZ) { fallthrough_line.reset(update_line); @@ -2699,11 +2707,11 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) { } /* update branch target, set "changed" if appropriate */ if (NULL != branch_line.get()) { - if (!UpdateRegisters(work_insn_idx_ + branch_target, branch_line.get())) { + if (!UpdateRegisters(work_insn_idx_ + branch_target, branch_line.get(), false)) { return false; } } else { - if (!UpdateRegisters(work_insn_idx_ + branch_target, work_line_.get())) { + if (!UpdateRegisters(work_insn_idx_ + branch_target, work_line_.get(), false)) { return false; } } @@ -2743,8 +2751,9 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) { if (!CheckNotMoveException(code_item_->insns_, abs_offset)) { return false; } - if (!UpdateRegisters(abs_offset, work_line_.get())) + if (!UpdateRegisters(abs_offset, work_line_.get(), false)) { return false; + } } } @@ -2765,7 +2774,7 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) { * "work_regs", because at runtime the exception will be thrown before the instruction * modifies any registers. */ - if (!UpdateRegisters(iterator.GetHandlerAddress(), saved_line_.get())) { + if (!UpdateRegisters(iterator.GetHandlerAddress(), saved_line_.get(), false)) { return false; } } @@ -2824,9 +2833,10 @@ bool MethodVerifier::CodeFlowVerifyInstruction(uint32_t* start_guess) { } RegisterLine* next_line = reg_table_.GetLine(next_insn_idx); if (next_line != NULL) { - // Merge registers into what we have for the next instruction, - // and set the "changed" flag if needed. - if (!UpdateRegisters(next_insn_idx, work_line_.get())) { + // Merge registers into what we have for the next instruction, and set the "changed" flag if + // needed. If the merge changes the state of the registers then the work line will be + // updated. 
+ if (!UpdateRegisters(next_insn_idx, work_line_.get(), true)) { return false; } } else { @@ -3890,7 +3900,8 @@ bool MethodVerifier::CheckNotMoveException(const uint16_t* insns, int insn_idx) return true; } -bool MethodVerifier::UpdateRegisters(uint32_t next_insn, const RegisterLine* merge_line) { +bool MethodVerifier::UpdateRegisters(uint32_t next_insn, RegisterLine* merge_line, + bool update_merge_line) { bool changed = true; RegisterLine* target_line = reg_table_.GetLine(next_insn); if (!insn_flags_[next_insn].IsVisitedOrChanged()) { @@ -3939,6 +3950,9 @@ bool MethodVerifier::UpdateRegisters(uint32_t next_insn, const RegisterLine* mer << *merge_line << " ==\n" << *target_line << "\n"; } + if (update_merge_line && changed) { + merge_line->CopyFromLine(target_line); + } } if (changed) { insn_flags_[next_insn].SetChanged(); diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h index b6d5b351c3..757c41993c 100644 --- a/runtime/verifier/method_verifier.h +++ b/runtime/verifier/method_verifier.h @@ -595,9 +595,11 @@ class MethodVerifier { /* * Control can transfer to "next_insn". Merge the registers from merge_line into the table at * next_insn, and set the changed flag on the target address if any of the registers were changed. + * In the case of fall-through, update the merge line on a change, as it is the working line for + * the next instruction. * Returns "false" if an error is encountered. */ - bool UpdateRegisters(uint32_t next_insn, const RegisterLine* merge_line) + bool UpdateRegisters(uint32_t next_insn, RegisterLine* merge_line, bool update_merge_line) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); // Is the method being verified a constructor? diff --git a/runtime/verifier/reg_type.h b/runtime/verifier/reg_type.h index 64001d36f3..e985f3a2de 100644 --- a/runtime/verifier/reg_type.h +++ b/runtime/verifier/reg_type.h @@ -209,9 +209,9 @@ class RegType { !IsUnresolvedSuperClass())); return descriptor_; } - mirror::Class* GetClass() const { + mirror::Class* GetClass() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { DCHECK(!IsUnresolvedReference()); - DCHECK(klass_ != NULL); + DCHECK(klass_ != NULL) << Dump(); DCHECK(HasClass()); return klass_; } diff --git a/test/052-verifier-fun/expected.txt b/test/052-verifier-fun/expected.txt index 566267534e..931aef30a1 100644 --- a/test/052-verifier-fun/expected.txt +++ b/test/052-verifier-fun/expected.txt @@ -1,2 +1,3 @@ BlahOne Zorch. +10 == 10 diff --git a/test/052-verifier-fun/src/Main.java b/test/052-verifier-fun/src/Main.java index 0168412bab..3ffd14376c 100644 --- a/test/052-verifier-fun/src/Main.java +++ b/test/052-verifier-fun/src/Main.java @@ -24,6 +24,7 @@ public class Main { tryBlah(1); System.out.println("Zorch."); + System.out.println("10 == " + instanceOfTest(10)); } /* @@ -120,4 +121,15 @@ public class Main { feature.doStuff(); } + + static int instanceOfTest(Integer x) { + Object y = x; + if (y instanceof String) { + // Bug: 15808277 + // Non-sensical instance-of, to check that merging after the branch doesn't result in a + // verifier error.
+ ((String)y).charAt(0); + } + return x.intValue(); + } } diff --git a/test/082-inline-execute/src/Main.java index 5b8134df1c..3b11879e08 100644 --- a/test/082-inline-execute/src/Main.java +++ b/test/082-inline-execute/src/Main.java @@ -49,6 +49,7 @@ public class Main { test_String_indexOf(); test_String_isEmpty(); test_String_length(); + test_Thread_currentThread(); } /* @@ -70,6 +71,17 @@ public class Main { return (b - a) < maxDelta; } + /** + * Tests inlining of Thread.currentThread(). + */ + public static void test_Thread_currentThread() { + // 1. Do not use result. + Thread.currentThread(); + + // 2. Result should not be null. + Assert.assertNotNull(Thread.currentThread()); + } + public static void test_String_length() { String str0 = ""; String str1 = "x"; diff --git a/test/Android.oat.mk b/test/Android.oat.mk index 13d452c7e8..fec2540e9c 100644 --- a/test/Android.oat.mk +++ b/test/Android.oat.mk @@ -193,7 +193,7 @@ $(3): PRIVATE_OAT_FILE := $$(oat_file) $(3): $$(ART_TEST_HOST_OAT_$(1)_DEX) $(ART_TEST_HOST_OAT_DEPENDENCIES) $(hide) mkdir -p $(ART_HOST_TEST_DIR)/android-data-$$@/dalvik-cache/$$($(2)HOST_ARCH) $(hide) cp $$(realpath $$<) $(ART_HOST_TEST_DIR)/android-data-$$@/oat-test-dex-$(1).jar - $(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg -Xms16m --runtime-arg -Xmx16m $(4) \ + $(hide) $(DEX2OATD) $(DEX2OAT_FLAGS) --runtime-arg $(DEX2OAT_XMS) --runtime-arg $(DEX2OAT_XMX) $(4) \ --boot-image=$$(HOST_CORE_IMG_LOCATION) \ --dex-file=$$(PRIVATE_DEX_FILE) --oat-file=$$(PRIVATE_OAT_FILE) \ --instruction-set=$($(2)ART_HOST_ARCH) --host --android-root=$(HOST_OUT) \
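The debugger refactoring above (Breakpoint and DeoptimizationRequest store a jmethodID instead of a rooted mirror::ArtMethod*) is what lets Dbg::VisitRoots and DeoptimizationRequest::VisitRoots be deleted from runtime.cc and debugger.cc: the method pointer now lives behind the JNI method-ID indirection, which the runtime already keeps current, so the debugger no longer holds a raw GC root of its own. Below is a minimal standalone sketch of that pattern; MethodIdTable, Encode, Decode, and MovedTo are illustrative stand-ins, not ART's real types or API.

    #include <cstdint>
    #include <iostream>
    #include <unordered_map>

    // Illustrative stand-in for mirror::ArtMethod; not the real ART type.
    struct ArtMethod {
      const char* name;
    };

    // Models the jmethodID indirection: holders keep an opaque id and re-resolve
    // it on every access (as EncodeMethod()/DecodeMethod() do in the patch), so a
    // moving collector fixes up this one table instead of visiting a root inside
    // every Breakpoint and DeoptimizationRequest.
    class MethodIdTable {
     public:
      uint64_t Encode(ArtMethod* m) {
        table_[next_id_] = m;
        return next_id_++;
      }
      ArtMethod* Decode(uint64_t id) const {
        return table_.at(id);
      }
      void MovedTo(uint64_t id, ArtMethod* new_location) {  // the "GC fix-up" hook
        table_[id] = new_location;
      }

     private:
      std::unordered_map<uint64_t, ArtMethod*> table_;
      uint64_t next_id_ = 1;
    };

    int main() {
      MethodIdTable table;
      ArtMethod old_slot{"java.lang.Object.wait"};
      uint64_t method_id = table.Encode(&old_slot);  // what SetMethod() stores

      ArtMethod new_slot = old_slot;        // a moving GC relocates the method...
      table.MovedTo(method_id, &new_slot);  // ...and updates only the table.

      // Decoding still yields the current location; no stale root ever existed.
      std::cout << table.Decode(method_id)->name << "\n";
      return 0;
    }

The same reasoning explains the SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) annotations the patch adds to Method() and SetMethod(): decoding the id back to a mirror::ArtMethod* touches the managed heap, so it has to happen under a scoped object access.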