54 files changed, 1676 insertions, 1068 deletions
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk index 4c19ba0b4c..cf703a03da 100644 --- a/build/Android.gtest.mk +++ b/build/Android.gtest.mk @@ -115,6 +115,7 @@ RUNTIME_GTEST_COMMON_SRC_FILES := \ runtime/gc/space/rosalloc_space_static_test.cc \ runtime/gc/space/rosalloc_space_random_test.cc \ runtime/gc/space/large_object_space_test.cc \ + runtime/gc/task_processor_test.cc \ runtime/gtest_test.cc \ runtime/handle_scope_test.cc \ runtime/indenter_test.cc \ diff --git a/compiler/dex/bb_optimizations.cc b/compiler/dex/bb_optimizations.cc index 6a610ab201..e5358139d8 100644 --- a/compiler/dex/bb_optimizations.cc +++ b/compiler/dex/bb_optimizations.cc @@ -51,20 +51,4 @@ bool BBCombine::Worker(PassDataHolder* data) const { return false; } -/* - * BasicBlock Optimization pass implementation start. - */ -void BBOptimizations::Start(PassDataHolder* data) const { - DCHECK(data != nullptr); - CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; - DCHECK(c_unit != nullptr); - /* - * This pass has a different ordering depEnding on the suppress exception, - * so do the pass here for now: - * - Later, the Start should just change the ordering and we can move the extended - * creation into the pass driver's main job with a new iterator - */ - c_unit->mir_graph->BasicBlockOptimization(); -} - } // namespace art diff --git a/compiler/dex/bb_optimizations.h b/compiler/dex/bb_optimizations.h index 0407e323cb..b07a415d4a 100644 --- a/compiler/dex/bb_optimizations.h +++ b/compiler/dex/bb_optimizations.h @@ -284,7 +284,8 @@ class BBCombine : public PassME { */ class BBOptimizations : public PassME { public: - BBOptimizations() : PassME("BBOptimizations", kNoNodes, "5_post_bbo_cfg") { + BBOptimizations() + : PassME("BBOptimizations", kNoNodes, kOptimizationBasicBlockChange, "5_post_bbo_cfg") { } bool Gate(const PassDataHolder* data) const { @@ -294,7 +295,28 @@ class BBOptimizations : public PassME { return ((c_unit->disable_opt & (1 << kBBOpt)) == 0); } - void Start(PassDataHolder* data) const; + void Start(PassDataHolder* data) const { + DCHECK(data != nullptr); + CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; + DCHECK(c_unit != nullptr); + c_unit->mir_graph->BasicBlockOptimizationStart(); + + /* + * This pass has a different ordering depending on the suppress exception, + * so do the pass here for now: + * - Later, the Start should just change the ordering and we can move the extended + * creation into the pass driver's main job with a new iterator + */ + c_unit->mir_graph->BasicBlockOptimization(); + } + + void End(PassDataHolder* data) const { + DCHECK(data != nullptr); + CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit; + DCHECK(c_unit != nullptr); + c_unit->mir_graph->BasicBlockOptimizationEnd(); + down_cast<PassMEDataHolder*>(data)->dirty = !c_unit->mir_graph->DfsOrdersUpToDate(); + } }; /** diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h index 7ff06a04cb..7edb490176 100644 --- a/compiler/dex/compiler_enums.h +++ b/compiler/dex/compiler_enums.h @@ -555,7 +555,7 @@ std::ostream& operator<<(std::ostream& os, const DividePattern& pattern); * The current recipe is as follows: * -# Use AnyStore ~= (LoadStore | StoreStore) ~= release barrier before volatile store. * -# Use AnyAny barrier after volatile store. (StoreLoad is as expensive.) - * -# Use LoadAny barrier ~= (LoadLoad | LoadStore) ~= acquire barrierafter each volatile load. 
+ * -# Use LoadAny barrier ~= (LoadLoad | LoadStore) ~= acquire barrier after each volatile load. * -# Use StoreStore barrier after all stores but before return from any constructor whose * class has final fields. * -# Use NTStoreStore to order non-temporal stores with respect to all later diff --git a/compiler/dex/global_value_numbering_test.cc b/compiler/dex/global_value_numbering_test.cc index 7e3b4d8adf..18e346968a 100644 --- a/compiler/dex/global_value_numbering_test.cc +++ b/compiler/dex/global_value_numbering_test.cc @@ -215,7 +215,6 @@ class GlobalValueNumberingTest : public testing::Test { bb->data_flow_info->live_in_v = live_in_v_; } } - cu_.mir_graph->num_blocks_ = count; ASSERT_EQ(count, cu_.mir_graph->block_list_.size()); cu_.mir_graph->entry_block_ = cu_.mir_graph->block_list_[1]; ASSERT_EQ(kEntryBlock, cu_.mir_graph->entry_block_->block_type); diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc index 7b53b14909..0f0846c74c 100644 --- a/compiler/dex/mir_analysis.cc +++ b/compiler/dex/mir_analysis.cc @@ -1151,7 +1151,7 @@ bool MIRGraph::SkipCompilation(std::string* skip_message) { skip_compilation = true; *skip_message = "Huge method: " + std::to_string(GetNumDalvikInsns()); // If we're got a huge number of basic blocks, don't bother with further analysis. - if (static_cast<size_t>(num_blocks_) > (compiler_options.GetHugeMethodThreshold() / 2)) { + if (static_cast<size_t>(GetNumBlocks()) > (compiler_options.GetHugeMethodThreshold() / 2)) { return true; } } else if (compiler_options.IsLargeMethod(GetNumDalvikInsns()) && diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc index 71ad635ac4..312a6ebcd6 100644 --- a/compiler/dex/mir_graph.cc +++ b/compiler/dex/mir_graph.cc @@ -91,6 +91,9 @@ MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena) num_reachable_blocks_(0), max_num_reachable_blocks_(0), dfs_orders_up_to_date_(false), + domination_up_to_date_(false), + mir_ssa_rep_up_to_date_(false), + topological_order_up_to_date_(false), dfs_order_(arena->Adapter(kArenaAllocDfsPreOrder)), dfs_post_order_(arena->Adapter(kArenaAllocDfsPostOrder)), dom_post_order_traversal_(arena->Adapter(kArenaAllocDomPostOrder)), @@ -105,7 +108,6 @@ MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena) try_block_addr_(NULL), entry_block_(NULL), exit_block_(NULL), - num_blocks_(0), current_code_item_(NULL), dex_pc_to_block_map_(arena->Adapter()), m_units_(arena->Adapter()), @@ -691,7 +693,7 @@ void MIRGraph::InlineMethod(const DexFile::CodeItem* code_item, uint32_t access_ if (current_method_ == 0) { DCHECK(entry_block_ == NULL); DCHECK(exit_block_ == NULL); - DCHECK_EQ(num_blocks_, 0U); + DCHECK_EQ(GetNumBlocks(), 0U); // Use id 0 to represent a null block. BasicBlock* null_block = CreateNewBB(kNullBlock); DCHECK_EQ(null_block->id, NullBasicBlockId); @@ -1740,6 +1742,9 @@ void MIRGraph::SSATransformationEnd() { // Update the maximum number of reachable blocks. max_num_reachable_blocks_ = num_reachable_blocks_; + + // Mark MIR SSA representations as up to date. 
+ mir_ssa_rep_up_to_date_ = true; } size_t MIRGraph::GetNumDalvikInsns() const { @@ -2005,6 +2010,7 @@ void MIRGraph::ComputeTopologicalSortOrder() { topological_order_loop_head_stack_.clear(); topological_order_loop_head_stack_.reserve(max_nested_loops); max_nested_loops_ = max_nested_loops; + topological_order_up_to_date_ = true; } bool BasicBlock::IsExceptionBlock() const { @@ -2246,12 +2252,6 @@ void BasicBlock::Kill(MIRGraph* mir_graph) { } predecessors.clear(); - KillUnreachable(mir_graph); -} - -void BasicBlock::KillUnreachable(MIRGraph* mir_graph) { - DCHECK(predecessors.empty()); // Unreachable. - // Mark as dead and hidden. block_type = kDead; hidden = true; @@ -2270,9 +2270,6 @@ void BasicBlock::KillUnreachable(MIRGraph* mir_graph) { ChildBlockIterator iter(this, mir_graph); for (BasicBlock* succ_bb = iter.Next(); succ_bb != nullptr; succ_bb = iter.Next()) { succ_bb->ErasePredecessor(id); - if (succ_bb->predecessors.empty()) { - succ_bb->KillUnreachable(mir_graph); - } } // Remove links to children. @@ -2393,7 +2390,8 @@ void BasicBlock::UpdatePredecessor(BasicBlockId old_pred, BasicBlockId new_pred) // Create a new basic block with block_id as num_blocks_ that is // post-incremented. BasicBlock* MIRGraph::CreateNewBB(BBType block_type) { - BasicBlock* res = NewMemBB(block_type, num_blocks_++); + BasicBlockId id = static_cast<BasicBlockId>(block_list_.size()); + BasicBlock* res = NewMemBB(block_type, id); block_list_.push_back(res); return res; } @@ -2403,10 +2401,6 @@ void MIRGraph::CalculateBasicBlockInformation() { driver.Launch(); } -void MIRGraph::InitializeBasicBlockData() { - num_blocks_ = block_list_.size(); -} - int MIR::DecodedInstruction::FlagsOf() const { // Calculate new index. int idx = static_cast<int>(opcode) - kNumPackedOpcodes; diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h index 851ca150b5..af97f51975 100644 --- a/compiler/dex/mir_graph.h +++ b/compiler/dex/mir_graph.h @@ -410,18 +410,12 @@ class BasicBlock : public DeletableArenaObject<kArenaAllocBB> { /** * @brief Kill the BasicBlock. - * @details Unlink predecessors to make this block unreachable, then KillUnreachable(). + * @details Unlink predecessors and successors, remove all MIRs, set the block type to kDead + * and set hidden to true. */ void Kill(MIRGraph* mir_graph); /** - * @brief Kill the unreachable block and all blocks that become unreachable by killing this one. - * @details Set the block type to kDead and set hidden to true, remove all MIRs, - * unlink all successors and recursively kill successors that become unreachable. - */ - void KillUnreachable(MIRGraph* mir_graph); - - /** * @brief Is ssa_reg the last SSA definition of that VR in the block? 
*/ bool IsSSALiveOut(const CompilationUnit* c_unit, int ssa_reg); @@ -574,7 +568,7 @@ class MIRGraph { } unsigned int GetNumBlocks() const { - return num_blocks_; + return block_list_.size(); } /** @@ -704,7 +698,9 @@ class MIRGraph { void DumpRegLocTable(RegLocation* table, int count); + void BasicBlockOptimizationStart(); void BasicBlockOptimization(); + void BasicBlockOptimizationEnd(); const ArenaVector<BasicBlockId>& GetTopologicalSortOrder() { DCHECK(!topological_order_.empty()); @@ -1198,7 +1194,6 @@ class MIRGraph { void AllocateSSAUseData(MIR *mir, int num_uses); void AllocateSSADefData(MIR *mir, int num_defs); void CalculateBasicBlockInformation(); - void InitializeBasicBlockData(); void ComputeDFSOrders(); void ComputeDefBlockMatrix(); void ComputeDominators(); @@ -1211,6 +1206,18 @@ class MIRGraph { return dfs_orders_up_to_date_; } + bool DominationUpToDate() const { + return domination_up_to_date_; + } + + bool MirSsaRepUpToDate() const { + return mir_ssa_rep_up_to_date_; + } + + bool TopologicalOrderUpToDate() const { + return topological_order_up_to_date_; + } + /* * IsDebugBuild sanity check: keep track of the Dex PCs for catch entries so that later on * we can verify that all catch entries have native PC entries. @@ -1321,6 +1328,9 @@ class MIRGraph { unsigned int num_reachable_blocks_; unsigned int max_num_reachable_blocks_; bool dfs_orders_up_to_date_; + bool domination_up_to_date_; + bool mir_ssa_rep_up_to_date_; + bool topological_order_up_to_date_; ArenaVector<BasicBlockId> dfs_order_; ArenaVector<BasicBlockId> dfs_post_order_; ArenaVector<BasicBlockId> dom_post_order_traversal_; @@ -1379,7 +1389,6 @@ class MIRGraph { ArenaBitVector* try_block_addr_; BasicBlock* entry_block_; BasicBlock* exit_block_; - unsigned int num_blocks_; const DexFile::CodeItem* current_code_item_; ArenaVector<uint16_t> dex_pc_to_block_map_; // FindBlock lookup cache. ArenaVector<DexCompilationUnit*> m_units_; // List of methods included in this graph diff --git a/compiler/dex/mir_graph_test.cc b/compiler/dex/mir_graph_test.cc index a96cd84297..8a7e71f4af 100644 --- a/compiler/dex/mir_graph_test.cc +++ b/compiler/dex/mir_graph_test.cc @@ -89,7 +89,6 @@ class TopologicalSortOrderTest : public testing::Test { cu_.arena.Alloc(sizeof(BasicBlockDataFlow), kArenaAllocDFInfo)); } } - cu_.mir_graph->num_blocks_ = count; ASSERT_EQ(count, cu_.mir_graph->block_list_.size()); cu_.mir_graph->entry_block_ = cu_.mir_graph->block_list_[1]; ASSERT_EQ(kEntryBlock, cu_.mir_graph->entry_block_->block_type); diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc index 6e9844cb7f..15b83413b7 100644 --- a/compiler/dex/mir_optimization.cc +++ b/compiler/dex/mir_optimization.cc @@ -485,9 +485,11 @@ bool MIRGraph::BasicBlockOpt(BasicBlock* bb) { mir->ssa_rep->num_uses = 0; BasicBlock* successor_to_unlink = GetBasicBlock(edge_to_kill); successor_to_unlink->ErasePredecessor(bb->id); - if (successor_to_unlink->predecessors.empty()) { - successor_to_unlink->KillUnreachable(this); - } + // We have changed the graph structure. + dfs_orders_up_to_date_ = false; + domination_up_to_date_ = false; + topological_order_up_to_date_ = false; + // Keep MIR SSA rep, the worst that can happen is a Phi with just 1 input. } break; case Instruction::CMPL_FLOAT: @@ -649,36 +651,36 @@ bool MIRGraph::BasicBlockOpt(BasicBlock* bb) { * Phi node only contains our two cases as input, we will use the result * SSA name of the Phi node as our select result and delete the Phi. 
If * the Phi node has more than two operands, we will arbitrarily use the SSA - * name of the "true" path, delete the SSA name of the "false" path from the + * name of the "false" path, delete the SSA name of the "true" path from the * Phi node (and fix up the incoming arc list). */ if (phi->ssa_rep->num_uses == 2) { mir->ssa_rep->defs[0] = phi->ssa_rep->defs[0]; - phi->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop); + // Rather than changing the Phi to kMirOpNop, remove it completely. + // This avoids leaving other Phis after kMirOpNop (i.e. a non-Phi) insn. + tk_tk->RemoveMIR(phi); + int dead_false_def = if_false->ssa_rep->defs[0]; + raw_use_counts_[dead_false_def] = use_counts_[dead_false_def] = 0; } else { - int dead_def = if_false->ssa_rep->defs[0]; - int live_def = if_true->ssa_rep->defs[0]; + int live_def = if_false->ssa_rep->defs[0]; mir->ssa_rep->defs[0] = live_def; - BasicBlockId* incoming = phi->meta.phi_incoming; - for (int i = 0; i < phi->ssa_rep->num_uses; i++) { - if (phi->ssa_rep->uses[i] == live_def) { - incoming[i] = bb->id; - } - } - for (int i = 0; i < phi->ssa_rep->num_uses; i++) { - if (phi->ssa_rep->uses[i] == dead_def) { - int last_slot = phi->ssa_rep->num_uses - 1; - phi->ssa_rep->uses[i] = phi->ssa_rep->uses[last_slot]; - incoming[i] = incoming[last_slot]; - } - } - } - phi->ssa_rep->num_uses--; - bb->taken = NullBasicBlockId; - tk->block_type = kDead; - for (MIR* tmir = ft->first_mir_insn; tmir != NULL; tmir = tmir->next) { - tmir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop); } + int dead_true_def = if_true->ssa_rep->defs[0]; + raw_use_counts_[dead_true_def] = use_counts_[dead_true_def] = 0; + // We want to remove ft and tk and link bb directly to ft_ft. First, we need + // to update all Phi inputs correctly with UpdatePredecessor(ft->id, bb->id) + // since the live_def above comes from ft->first_mir_insn (if_false). + DCHECK(if_false == ft->first_mir_insn); + ft_ft->UpdatePredecessor(ft->id, bb->id); + // Correct the rest of the links between bb, ft and ft_ft. + ft->ErasePredecessor(bb->id); + ft->fall_through = NullBasicBlockId; + bb->fall_through = ft_ft->id; + // Now we can kill tk and ft. + tk->Kill(this); + ft->Kill(this); + // NOTE: DFS order, domination info and topological order are still usable + // despite the newly dead blocks. } } } @@ -788,43 +790,9 @@ void MIRGraph::CombineBlocks(class BasicBlock* bb) { MIR* mir = bb->last_mir_insn; DCHECK(bb->first_mir_insn != nullptr); - // Grab the attributes from the paired opcode. + // Get the paired insn and check if it can still throw. MIR* throw_insn = mir->meta.throw_insn; - uint64_t df_attributes = GetDataFlowAttributes(throw_insn); - - // Don't combine if the throw_insn can still throw NPE. - if ((df_attributes & DF_HAS_NULL_CHKS) != 0 && - (throw_insn->optimization_flags & MIR_IGNORE_NULL_CHECK) == 0) { - break; - } - // Now whitelist specific instructions. - bool ok = false; - if ((df_attributes & DF_IFIELD) != 0) { - // Combine only if fast, otherwise weird things can happen. - const MirIFieldLoweringInfo& field_info = GetIFieldLoweringInfo(throw_insn); - ok = (df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut(); - } else if ((df_attributes & DF_SFIELD) != 0) { - // Combine only if fast, otherwise weird things can happen. - const MirSFieldLoweringInfo& field_info = GetSFieldLoweringInfo(throw_insn); - bool fast = ((df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut()); - // Don't combine if the SGET/SPUT can call <clinit>(). 
- bool clinit = !field_info.IsClassInitialized() && - (throw_insn->optimization_flags & MIR_CLASS_IS_INITIALIZED) == 0; - ok = fast && !clinit; - } else if ((df_attributes & DF_HAS_RANGE_CHKS) != 0) { - // Only AGET/APUT have range checks. We have processed the AGET/APUT null check above. - DCHECK_NE(throw_insn->optimization_flags & MIR_IGNORE_NULL_CHECK, 0); - ok = ((throw_insn->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0); - } else if ((throw_insn->dalvikInsn.FlagsOf() & Instruction::kThrow) == 0) { - // We can encounter a non-throwing insn here thanks to inlining or other optimizations. - ok = true; - } else if (throw_insn->dalvikInsn.opcode == Instruction::ARRAY_LENGTH || - throw_insn->dalvikInsn.opcode == Instruction::FILL_ARRAY_DATA || - static_cast<int>(throw_insn->dalvikInsn.opcode) == kMirOpNullCheck) { - // No more checks for these (null check was processed above). - ok = true; - } - if (!ok) { + if (CanThrow(throw_insn)) { break; } @@ -863,9 +831,6 @@ void MIRGraph::CombineBlocks(class BasicBlock* bb) { BasicBlock* succ_bb = GetBasicBlock(succ_info->block); DCHECK(succ_bb->catch_entry); succ_bb->ErasePredecessor(bb->id); - if (succ_bb->predecessors.empty()) { - succ_bb->KillUnreachable(this); - } } } } @@ -908,8 +873,10 @@ void MIRGraph::CombineBlocks(class BasicBlock* bb) { child->UpdatePredecessor(bb_next->id, bb->id); } - // DFS orders are not up to date anymore. + // DFS orders, domination and topological order are not up to date anymore. dfs_orders_up_to_date_ = false; + domination_up_to_date_ = false; + topological_order_up_to_date_ = false; // Now, loop back and see if we can keep going } @@ -1581,7 +1548,7 @@ bool MIRGraph::BuildExtendedBBList(class BasicBlock* bb) { return false; // Not iterative - return value will be ignored } -void MIRGraph::BasicBlockOptimization() { +void MIRGraph::BasicBlockOptimizationStart() { if ((cu_->disable_opt & (1 << kLocalValueNumbering)) == 0) { temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack)); temp_.gvn.ifield_ids_ = @@ -1589,7 +1556,9 @@ void MIRGraph::BasicBlockOptimization() { temp_.gvn.sfield_ids_ = GlobalValueNumbering::PrepareGvnFieldIds(temp_scoped_alloc_.get(), sfield_lowering_infos_); } +} +void MIRGraph::BasicBlockOptimization() { if ((cu_->disable_opt & (1 << kSuppressExceptionEdges)) != 0) { ClearAllVisitedFlags(); PreOrderDfsIterator iter2(this); @@ -1606,7 +1575,9 @@ void MIRGraph::BasicBlockOptimization() { BasicBlockOpt(bb); } } +} +void MIRGraph::BasicBlockOptimizationEnd() { // Clean up after LVN. temp_.gvn.ifield_ids_ = nullptr; temp_.gvn.sfield_ids_ = nullptr; @@ -1719,32 +1690,37 @@ bool MIRGraph::CanThrow(MIR* mir) { const int opt_flags = mir->optimization_flags; uint64_t df_attributes = GetDataFlowAttributes(mir); + // First, check if the insn can still throw NPE. if (((df_attributes & DF_HAS_NULL_CHKS) != 0) && ((opt_flags & MIR_IGNORE_NULL_CHECK) == 0)) { return true; } + + // Now process specific instructions. if ((df_attributes & DF_IFIELD) != 0) { - // The IGET/IPUT family. + // The IGET/IPUT family. We have processed the IGET/IPUT null check above. + DCHECK_NE(opt_flags & MIR_IGNORE_NULL_CHECK, 0); + // If not fast, weird things can happen and the insn can throw. const MirIFieldLoweringInfo& field_info = GetIFieldLoweringInfo(mir); - bool fast = (df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut(); - // Already processed null check above. - if (fast) { - return false; - } - } else if ((df_attributes & DF_HAS_RANGE_CHKS) != 0) { - // The AGET/APUT family. 
- // Already processed null check above. - if ((opt_flags & MIR_IGNORE_RANGE_CHECK) != 0) { - return false; - } + bool fast = (df_attributes & DF_DA) != 0 ? field_info.FastGet() : field_info.FastPut(); + return !fast; } else if ((df_attributes & DF_SFIELD) != 0) { - // The SGET/SPUT family. + // The SGET/SPUT family. Check for potentially throwing class initialization. + // Also, if not fast, weird things can happen and the insn can throw. const MirSFieldLoweringInfo& field_info = GetSFieldLoweringInfo(mir); - bool fast = (df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut(); + bool fast = (df_attributes & DF_DA) != 0 ? field_info.FastGet() : field_info.FastPut(); bool is_class_initialized = field_info.IsClassInitialized() || ((mir->optimization_flags & MIR_CLASS_IS_INITIALIZED) != 0); - if (fast && is_class_initialized) { - return false; - } + return !(fast && is_class_initialized); + } else if ((df_attributes & DF_HAS_RANGE_CHKS) != 0) { + // Only AGET/APUT have range checks. We have processed the AGET/APUT null check above. + DCHECK_NE(opt_flags & MIR_IGNORE_NULL_CHECK, 0); + // Non-throwing only if range check has been eliminated. + return ((opt_flags & MIR_IGNORE_RANGE_CHECK) == 0); + } else if (mir->dalvikInsn.opcode == Instruction::ARRAY_LENGTH || + mir->dalvikInsn.opcode == Instruction::FILL_ARRAY_DATA || + static_cast<int>(mir->dalvikInsn.opcode) == kMirOpNullCheck) { + // No more checks for these (null check was processed above). + return false; } return true; } diff --git a/compiler/dex/mir_optimization_test.cc b/compiler/dex/mir_optimization_test.cc index 6c2e9c0b27..362c7fdc05 100644 --- a/compiler/dex/mir_optimization_test.cc +++ b/compiler/dex/mir_optimization_test.cc @@ -129,7 +129,6 @@ class MirOptimizationTest : public testing::Test { cu_.arena.Alloc(sizeof(BasicBlockDataFlow), kArenaAllocDFInfo)); } } - cu_.mir_graph->num_blocks_ = count; ASSERT_EQ(count, cu_.mir_graph->block_list_.size()); cu_.mir_graph->entry_block_ = cu_.mir_graph->block_list_[1]; ASSERT_EQ(kEntryBlock, cu_.mir_graph->entry_block_->block_type); diff --git a/compiler/dex/pass_driver_me_post_opt.cc b/compiler/dex/pass_driver_me_post_opt.cc index e6238e9f25..9b56c0da87 100644 --- a/compiler/dex/pass_driver_me_post_opt.cc +++ b/compiler/dex/pass_driver_me_post_opt.cc @@ -31,20 +31,19 @@ namespace art { // The initial list of passes to be used by the PassDriveMEPostOpt. 
template<> const Pass* const PassDriver<PassDriverMEPostOpt>::g_passes[] = { - GetPassInstance<InitializeData>(), - GetPassInstance<ClearPhiInstructions>(), - GetPassInstance<DFSOrders>(), - GetPassInstance<BuildDomination>(), - GetPassInstance<TopologicalSortOrders>(), - GetPassInstance<DefBlockMatrix>(), - GetPassInstance<CreatePhiNodes>(), - GetPassInstance<ClearVisitedFlag>(), - GetPassInstance<SSAConversion>(), - GetPassInstance<PhiNodeOperands>(), - GetPassInstance<ConstantPropagation>(), - GetPassInstance<PerformInitRegLocations>(), - GetPassInstance<MethodUseCount>(), - GetPassInstance<FreeData>(), + GetPassInstance<DFSOrders>(), + GetPassInstance<BuildDomination>(), + GetPassInstance<TopologicalSortOrders>(), + GetPassInstance<InitializeSSATransformation>(), + GetPassInstance<ClearPhiInstructions>(), + GetPassInstance<DefBlockMatrix>(), + GetPassInstance<CreatePhiNodes>(), + GetPassInstance<SSAConversion>(), + GetPassInstance<PhiNodeOperands>(), + GetPassInstance<ConstantPropagation>(), + GetPassInstance<PerformInitRegLocations>(), + GetPassInstance<MethodUseCount>(), + GetPassInstance<FinishSSATransformation>(), }; // The number of the passes in the initial list of Passes (g_passes). diff --git a/compiler/dex/post_opt_passes.h b/compiler/dex/post_opt_passes.h index 7b84ba88c5..964355bb5d 100644 --- a/compiler/dex/post_opt_passes.h +++ b/compiler/dex/post_opt_passes.h @@ -24,13 +24,31 @@ namespace art { /** - * @class InitializeData + * @class PassMEMirSsaRep + * @brief Convenience class for passes that check MIRGraph::MirSsaRepUpToDate(). + */ +class PassMEMirSsaRep : public PassME { + public: + PassMEMirSsaRep(const char* name, DataFlowAnalysisMode type = kAllNodes) + : PassME(name, type) { + } + + bool Gate(const PassDataHolder* data) const OVERRIDE { + DCHECK(data != nullptr); + CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit; + DCHECK(c_unit != nullptr); + return !c_unit->mir_graph->MirSsaRepUpToDate(); + } +}; + +/** + * @class InitializeSSATransformation * @brief There is some data that needs to be initialized before performing * the post optimization passes. */ -class InitializeData : public PassME { +class InitializeSSATransformation : public PassMEMirSsaRep { public: - InitializeData() : PassME("InitializeData", kNoNodes) { + InitializeSSATransformation() : PassMEMirSsaRep("InitializeSSATransformation", kNoNodes) { } void Start(PassDataHolder* data) const { @@ -39,8 +57,8 @@ class InitializeData : public PassME { DCHECK(data != nullptr); CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; DCHECK(c_unit != nullptr); - c_unit->mir_graph.get()->InitializeBasicBlockData(); - c_unit->mir_graph.get()->SSATransformationStart(); + c_unit->mir_graph->SSATransformationStart(); + c_unit->mir_graph->CompilerInitializeSSAConversion(); } }; @@ -62,9 +80,9 @@ class MethodUseCount : public PassME { * @class ClearPhiInformation * @brief Clear the PHI nodes from the CFG. 
*/ -class ClearPhiInstructions : public PassME { +class ClearPhiInstructions : public PassMEMirSsaRep { public: - ClearPhiInstructions() : PassME("ClearPhiInstructions") { + ClearPhiInstructions() : PassMEMirSsaRep("ClearPhiInstructions") { } bool Worker(PassDataHolder* data) const; @@ -115,12 +133,18 @@ class BuildDomination : public PassME { BuildDomination() : PassME("BuildDomination", kNoNodes) { } + bool Gate(const PassDataHolder* data) const { + DCHECK(data != nullptr); + CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit; + DCHECK(c_unit != nullptr); + return !c_unit->mir_graph->DominationUpToDate(); + } + void Start(PassDataHolder* data) const { DCHECK(data != nullptr); CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; DCHECK(c_unit != nullptr); - c_unit->mir_graph.get()->ComputeDominators(); - c_unit->mir_graph.get()->CompilerInitializeSSAConversion(); + c_unit->mir_graph->ComputeDominators(); } void End(PassDataHolder* data) const { @@ -143,6 +167,13 @@ class TopologicalSortOrders : public PassME { TopologicalSortOrders() : PassME("TopologicalSortOrders", kNoNodes) { } + bool Gate(const PassDataHolder* data) const { + DCHECK(data != nullptr); + CompilationUnit* c_unit = down_cast<const PassMEDataHolder*>(data)->c_unit; + DCHECK(c_unit != nullptr); + return !c_unit->mir_graph->TopologicalOrderUpToDate(); + } + void Start(PassDataHolder* data) const { DCHECK(data != nullptr); CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; @@ -155,9 +186,9 @@ class TopologicalSortOrders : public PassME { * @class DefBlockMatrix * @brief Calculate the matrix of definition per basic block */ -class DefBlockMatrix : public PassME { +class DefBlockMatrix : public PassMEMirSsaRep { public: - DefBlockMatrix() : PassME("DefBlockMatrix", kNoNodes) { + DefBlockMatrix() : PassMEMirSsaRep("DefBlockMatrix", kNoNodes) { } void Start(PassDataHolder* data) const { @@ -172,9 +203,9 @@ class DefBlockMatrix : public PassME { * @class CreatePhiNodes * @brief Pass to create the phi nodes after SSA calculation */ -class CreatePhiNodes : public PassME { +class CreatePhiNodes : public PassMEMirSsaRep { public: - CreatePhiNodes() : PassME("CreatePhiNodes", kNoNodes) { + CreatePhiNodes() : PassMEMirSsaRep("CreatePhiNodes", kNoNodes) { } void Start(PassDataHolder* data) const { @@ -186,30 +217,12 @@ class CreatePhiNodes : public PassME { }; /** - * @class ClearVisitedFlag - * @brief Pass to clear the visited flag for all basic blocks. 
- */ - -class ClearVisitedFlag : public PassME { - public: - ClearVisitedFlag() : PassME("ClearVisitedFlag", kNoNodes) { - } - - void Start(PassDataHolder* data) const { - DCHECK(data != nullptr); - CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; - DCHECK(c_unit != nullptr); - c_unit->mir_graph.get()->ClearAllVisitedFlags(); - } -}; - -/** * @class SSAConversion * @brief Pass for SSA conversion of MIRs */ -class SSAConversion : public PassME { +class SSAConversion : public PassMEMirSsaRep { public: - SSAConversion() : PassME("SSAConversion", kNoNodes) { + SSAConversion() : PassMEMirSsaRep("SSAConversion", kNoNodes) { } void Start(PassDataHolder* data) const { @@ -217,6 +230,7 @@ class SSAConversion : public PassME { CompilationUnit* c_unit = down_cast<PassMEDataHolder*>(data)->c_unit; DCHECK(c_unit != nullptr); MIRGraph *mir_graph = c_unit->mir_graph.get(); + mir_graph->ClearAllVisitedFlags(); mir_graph->DoDFSPreOrderSSARename(mir_graph->GetEntryBlock()); } }; @@ -225,9 +239,9 @@ class SSAConversion : public PassME { * @class PhiNodeOperands * @brief Pass to insert the Phi node operands to basic blocks */ -class PhiNodeOperands : public PassME { +class PhiNodeOperands : public PassMEMirSsaRep { public: - PhiNodeOperands() : PassME("PhiNodeOperands", kPreOrderDFSTraversal) { + PhiNodeOperands() : PassMEMirSsaRep("PhiNodeOperands", kPreOrderDFSTraversal) { } bool Worker(PassDataHolder* data) const { @@ -246,9 +260,9 @@ class PhiNodeOperands : public PassME { * @class InitRegLocations * @brief Initialize Register Locations. */ -class PerformInitRegLocations : public PassME { +class PerformInitRegLocations : public PassMEMirSsaRep { public: - PerformInitRegLocations() : PassME("PerformInitRegLocation", kNoNodes) { + PerformInitRegLocations() : PassMEMirSsaRep("PerformInitRegLocation", kNoNodes) { } void Start(PassDataHolder* data) const { @@ -263,9 +277,9 @@ class PerformInitRegLocations : public PassME { * @class ConstantPropagation * @brief Perform a constant propagation pass. */ -class ConstantPropagation : public PassME { +class ConstantPropagation : public PassMEMirSsaRep { public: - ConstantPropagation() : PassME("ConstantPropagation") { + ConstantPropagation() : PassMEMirSsaRep("ConstantPropagation") { } bool Worker(PassDataHolder* data) const { @@ -288,12 +302,12 @@ class ConstantPropagation : public PassME { }; /** - * @class FreeData + * @class FinishSSATransformation * @brief There is some data that needs to be freed after performing the post optimization passes. 
*/ -class FreeData : public PassME { +class FinishSSATransformation : public PassMEMirSsaRep { public: - FreeData() : PassME("FreeData", kNoNodes) { + FinishSSATransformation() : PassMEMirSsaRep("FinishSSATransformation", kNoNodes) { } void End(PassDataHolder* data) const { diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc index d3743531fb..b05939156f 100644 --- a/compiler/dex/quick/arm/target_arm.cc +++ b/compiler/dex/quick/arm/target_arm.cc @@ -749,6 +749,7 @@ void ArmMir2Lir::FreeCallTemps() { FreeTemp(rs_r1); FreeTemp(rs_r2); FreeTemp(rs_r3); + FreeTemp(TargetReg(kHiddenArg)); if (!kArm32QuickCodeUseSoftFloat) { FreeTemp(rs_fr0); FreeTemp(rs_fr1); diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc index 030c5ed2f4..ee7e818f85 100644 --- a/compiler/dex/quick/arm64/target_arm64.cc +++ b/compiler/dex/quick/arm64/target_arm64.cc @@ -759,6 +759,7 @@ void Arm64Mir2Lir::FreeCallTemps() { FreeTemp(rs_f5); FreeTemp(rs_f6); FreeTemp(rs_f7); + FreeTemp(TargetReg(kHiddenArg)); } RegStorage Arm64Mir2Lir::LoadHelper(QuickEntrypointEnum trampoline) { diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc index 9462d3d08f..eb206a68c6 100755 --- a/compiler/dex/quick/gen_invoke.cc +++ b/compiler/dex/quick/gen_invoke.cc @@ -1557,7 +1557,7 @@ void Mir2Lir::GenInvokeNoInline(CallInfo* info) { LIR* call_insn = GenCallInsn(method_info); MarkSafepointPC(call_insn); - ClobberCallerSave(); + FreeCallTemps(); if (info->result.location != kLocInvalid) { // We have a following MOVE_RESULT - do it now. if (info->result.wide) { diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc index efa130c65d..c22ba04e08 100644 --- a/compiler/dex/quick/mips/target_mips.cc +++ b/compiler/dex/quick/mips/target_mips.cc @@ -445,6 +445,7 @@ void MipsMir2Lir::FreeCallTemps() { FreeTemp(rs_rMIPS_ARG1); FreeTemp(rs_rMIPS_ARG2); FreeTemp(rs_rMIPS_ARG3); + FreeTemp(TargetReg(kHiddenArg)); } bool MipsMir2Lir::GenMemBarrier(MemBarrierKind barrier_kind ATTRIBUTE_UNUSED) { diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 5f6cdda0d3..97732e2c12 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -562,6 +562,7 @@ void X86Mir2Lir::FreeCallTemps() { FreeTemp(TargetReg32(kArg1)); FreeTemp(TargetReg32(kArg2)); FreeTemp(TargetReg32(kArg3)); + FreeTemp(TargetReg32(kHiddenArg)); if (cu_->target64) { FreeTemp(TargetReg32(kArg4)); FreeTemp(TargetReg32(kArg5)); @@ -2209,18 +2210,36 @@ void X86Mir2Lir::GenReduceVector(MIR* mir) { // Handle float case. // TODO Add support for fast math (not value safe) and do horizontal add in that case. + int extract_index = mir->dalvikInsn.arg[0]; + rl_result = EvalLoc(rl_dest, kFPReg, true); NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg()); - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); - // Since FP must keep order of operation for value safety, we shift to low - // 32-bits and add to result. - for (int i = 0; i < 3; i++) { - NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + if (LIKELY(extract_index != 0)) { + // We know the index of element which we want to extract. We want to extract it and + // keep values in vector register correct for future use. So the way we act is: + // 1. Generate shuffle mask that allows to swap zeroth and required elements; + // 2. Shuffle vector register with this mask; + // 3. 
Extract zeroth element where required value lies; + // 4. Shuffle with same mask again to restore original values in vector register. + // The mask is generated from equivalence mask 0b11100100 swapping 0th and extracted + // element indices. + int shuffle[4] = {0b00, 0b01, 0b10, 0b11}; + shuffle[0] = extract_index; + shuffle[extract_index] = 0; + int mask = 0; + for (int i = 0; i < 4; i++) { + mask |= (shuffle[i] << (2 * i)); + } + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask); + } else { + // We need to extract zeroth element and don't need any complex stuff to do it. NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); } - StoreValue(rl_dest, rl_result); + StoreFinalValue(rl_dest, rl_result); } else if (opsize == kDouble) { // TODO Handle double case. LOG(FATAL) << "Unsupported add reduce for double."; diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc index 7cd431e26c..3905649ac6 100644 --- a/compiler/dex/ssa_transformation.cc +++ b/compiler/dex/ssa_transformation.cc @@ -103,7 +103,7 @@ void MIRGraph::ComputeDFSOrders() { num_reachable_blocks_ = dfs_order_.size(); - if (num_reachable_blocks_ != num_blocks_) { + if (num_reachable_blocks_ != GetNumBlocks()) { // Kill all unreachable blocks. AllNodesIterator iter(this); for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) { @@ -173,9 +173,9 @@ void MIRGraph::ComputeDomPostOrderTraversal(BasicBlock* bb) { dom_post_order_traversal_.reserve(num_reachable_blocks_); ClearAllVisitedFlags(); - DCHECK(temp_scoped_alloc_.get() != nullptr); + ScopedArenaAllocator allocator(&cu_->arena_stack); ScopedArenaVector<std::pair<BasicBlock*, ArenaBitVector::IndexIterator>> work_stack( - temp_scoped_alloc_->Adapter()); + allocator.Adapter()); bb->visited = true; work_stack.push_back(std::make_pair(bb, bb->i_dominated->Indexes().begin())); while (!work_stack.empty()) { @@ -402,6 +402,8 @@ void MIRGraph::ComputeDominators() { for (BasicBlock* bb = iter5.Next(); bb != NULL; bb = iter5.Next()) { ComputeDominanceFrontier(bb); } + + domination_up_to_date_ = true; } /* diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc index f9054e0133..dde0dfe394 100644 --- a/compiler/optimizing/builder.cc +++ b/compiler/optimizing/builder.cc @@ -670,10 +670,13 @@ bool HGraphBuilder::BuildInstanceFieldAccess(const Instruction& instruction, MaybeRecordStat(MethodCompilationStat::kNotCompiledUnresolvedField); return false; } + +#if defined(__aarch64__) if (resolved_field->IsVolatile()) { MaybeRecordStat(MethodCompilationStat::kNotCompiledVolatile); return false; } +#endif Primitive::Type field_type = resolved_field->GetTypeAsPrimitiveType(); @@ -689,12 +692,14 @@ bool HGraphBuilder::BuildInstanceFieldAccess(const Instruction& instruction, null_check, value, field_type, - resolved_field->GetOffset())); + resolved_field->GetOffset(), + resolved_field->IsVolatile())); } else { current_block_->AddInstruction(new (arena_) HInstanceFieldGet( current_block_->GetLastInstruction(), field_type, - resolved_field->GetOffset())); + resolved_field->GetOffset(), + resolved_field->IsVolatile())); UpdateLocal(source_or_dest_reg, current_block_->GetLastInstruction()); } @@ -723,10 +728,12 @@ bool HGraphBuilder::BuildStaticFieldAccess(const Instruction& instruction, return false; } +#if defined(__aarch64__) if (resolved_field->IsVolatile()) { 
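Aside on the GenReduceVector change above: the SHUFPS immediate built there encodes one source lane per 2-bit field, so swapping entries 0 and extract_index of the identity selection {0, 1, 2, 3} yields a mask that exchanges those two lanes, and applying the same mask twice restores the register (the swap is an involution). A minimal standalone sketch of that mask computation, assuming the helper name BuildSwapMask and the sample indices are purely illustrative and not part of the patch:

#include <cassert>
#include <cstdint>

// Each 2-bit field i of the SHUFPS immediate selects the source lane copied into lane i.
uint8_t BuildSwapMask(int extract_index) {
  int lanes[4] = {0, 1, 2, 3};   // Identity selection, 0b11100100.
  lanes[0] = extract_index;      // Lane 0 receives the element to extract.
  lanes[extract_index] = 0;      // That element's slot receives lane 0.
  uint8_t mask = 0;
  for (int i = 0; i < 4; ++i) {
    mask |= lanes[i] << (2 * i);
  }
  return mask;
}

int main() {
  assert(BuildSwapMask(0) == 0xE4);  // Identity: nothing to swap.
  assert(BuildSwapMask(2) == 0xC6);  // Lanes 0 and 2 exchanged.
  return 0;
}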
MaybeRecordStat(MethodCompilationStat::kNotCompiledVolatile); return false; } +#endif Handle<mirror::Class> referrer_class(hs.NewHandle(compiler_driver_->ResolveCompilingMethodsClass( soa, dex_cache, class_loader, outer_compilation_unit_))); @@ -763,10 +770,12 @@ bool HGraphBuilder::BuildStaticFieldAccess(const Instruction& instruction, HInstruction* value = LoadLocal(source_or_dest_reg, field_type); DCHECK_EQ(value->GetType(), field_type); current_block_->AddInstruction( - new (arena_) HStaticFieldSet(cls, value, field_type, resolved_field->GetOffset())); + new (arena_) HStaticFieldSet(cls, value, field_type, resolved_field->GetOffset(), + resolved_field->IsVolatile())); } else { current_block_->AddInstruction( - new (arena_) HStaticFieldGet(cls, field_type, resolved_field->GetOffset())); + new (arena_) HStaticFieldGet(cls, field_type, resolved_field->GetOffset(), + resolved_field->IsVolatile())); UpdateLocal(source_or_dest_reg, current_block_->GetLastInstruction()); } return true; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 002d9d4449..063dc7cafb 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -2556,68 +2556,170 @@ void InstructionCodeGeneratorARM::VisitPhi(HPhi* instruction) { LOG(FATAL) << "Unreachable"; } -void LocationsBuilderARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorARM::GenerateMemoryBarrier(MemBarrierKind kind) { + // TODO (ported from quick): revisit Arm barrier kinds + DmbOptions flavour = DmbOptions::ISH; // quiet c++ warnings + switch (kind) { + case MemBarrierKind::kAnyStore: + case MemBarrierKind::kLoadAny: + case MemBarrierKind::kAnyAny: { + flavour = DmbOptions::ISH; + break; + } + case MemBarrierKind::kStoreStore: { + flavour = DmbOptions::ISHST; + break; + } + default: + LOG(FATAL) << "Unexpected memory barrier " << kind; + } + __ dmb(flavour); +} + +void InstructionCodeGeneratorARM::GenerateWideAtomicLoad(Register addr, + uint32_t offset, + Register out_lo, + Register out_hi) { + if (offset != 0) { + __ LoadImmediate(out_lo, offset); + __ add(addr, addr, ShifterOperand(out_lo)); + } + __ ldrexd(out_lo, out_hi, addr); +} + +void InstructionCodeGeneratorARM::GenerateWideAtomicStore(Register addr, + uint32_t offset, + Register value_lo, + Register value_hi, + Register temp1, + Register temp2) { + Label fail; + if (offset != 0) { + __ LoadImmediate(temp1, offset); + __ add(addr, addr, ShifterOperand(temp1)); + } + __ Bind(&fail); + // We need a load followed by store. (The address used in a STREX instruction must + // be the same as the address in the most recently executed LDREX instruction.) 
+ __ ldrexd(temp1, temp2, addr); + __ strexd(temp1, value_lo, value_hi, addr); + __ cmp(temp1, ShifterOperand(0)); + __ b(&fail, NE); +} + +void LocationsBuilderARM::HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(instruction->GetFieldType(), instruction->GetValue()); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); + + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool is_wide = field_type == Primitive::kPrimLong || field_type == Primitive::kPrimDouble; + // Temporary registers for the write barrier. - if (needs_write_barrier) { + // TODO: consider renaming StoreNeedsWriteBarrier to StoreNeedsGCMark. + if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + } else if (is_volatile && is_wide) { + // Arm encoding have some additional constraints for ldrexd/strexd: + // - registers need to be consecutive + // - the first register should be even but not R14. + // We don't test for Arm yet, and the assertion makes sure that we revisit this if we ever + // enable Arm encoding. + DCHECK_EQ(InstructionSet::kThumb2, codegen_->GetInstructionSet()); + locations->AddTemp(Location::RequiresRegister()); locations->AddTemp(Location::RequiresRegister()); + if (field_type == Primitive::kPrimDouble) { + // For doubles we need two more registers to copy the value. 
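For reference, the LDREXD/STREXD retry loop emitted by GenerateWideAtomicStore above has the same semantics as a lock-free 64-bit std::atomic store on ARMv7 without LPAE: compilers typically lower such a store to exactly this kind of exclusive-monitor loop, retrying whenever the store-exclusive reports failure. A rough C++-level sketch of the equivalent operations (function names are illustrative, not part of the runtime):

#include <atomic>
#include <cstdint>

// On ARMv7 (no LPAE), a 64-bit atomic store is implemented with an LDREXD to claim the
// exclusive monitor followed by a STREXD; the STREXD fails if the monitor was lost in
// between, and the sequence is retried until it succeeds.
void StoreWideAtomic(std::atomic<uint64_t>* addr, uint64_t value) {
  addr->store(value, std::memory_order_relaxed);  // Typically lowers to an ldrexd/strexd loop.
}

// A 64-bit atomic load needs only the LDREXD half, since that load is already a
// single-copy atomic access.
uint64_t LoadWideAtomic(const std::atomic<uint64_t>* addr) {
  return addr->load(std::memory_order_relaxed);
}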
+ locations->AddTemp(Location::RegisterLocation(R2)); + locations->AddTemp(Location::RegisterLocation(R3)); + } } } -void InstructionCodeGeneratorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorARM::HandleFieldSet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - Primitive::Type field_type = instruction->GetFieldType(); + Register base = locations->InAt(0).AsRegister<Register>(); + Location value = locations->InAt(1); + + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + } switch (field_type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreByte, value, obj, offset); + __ StoreToOffset(kStoreByte, value.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimShort: case Primitive::kPrimChar: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreHalfword, value, obj, offset); + __ StoreToOffset(kStoreHalfword, value.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreWord, value, obj, offset); - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) { + Register value_reg = value.AsRegister<Register>(); + __ StoreToOffset(kStoreWord, value_reg, base, offset); + if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { Register temp = locations->GetTemp(0).AsRegister<Register>(); Register card = locations->GetTemp(1).AsRegister<Register>(); - codegen_->MarkGCCard(temp, card, obj, value); + codegen_->MarkGCCard(temp, card, base, value_reg); } break; } case Primitive::kPrimLong: { - Location value = locations->InAt(1); - __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), obj, offset); + if (is_volatile) { + // TODO: We could use ldrd and strd that are atomic with Large Physical Address Extension + // support. This info is stored in the compiler driver (HasAtomicLdrdAndStrd) and we should + // pass it around to be able to optimize. 
+ GenerateWideAtomicStore(base, offset, + value.AsRegisterPairLow<Register>(), + value.AsRegisterPairHigh<Register>(), + locations->GetTemp(0).AsRegister<Register>(), + locations->GetTemp(1).AsRegister<Register>()); + } else { + __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), base, offset); + } break; } case Primitive::kPrimFloat: { - SRegister value = locations->InAt(1).AsFpuRegister<SRegister>(); - __ StoreSToOffset(value, obj, offset); + __ StoreSToOffset(value.AsFpuRegister<SRegister>(), base, offset); break; } case Primitive::kPrimDouble: { - DRegister value = FromLowSToD(locations->InAt(1).AsFpuRegisterPairLow<SRegister>()); - __ StoreDToOffset(value, obj, offset); + DRegister value_reg = FromLowSToD(value.AsFpuRegisterPairLow<SRegister>()); + if (is_volatile) { + Register value_reg_lo = locations->GetTemp(0).AsRegister<Register>(); + Register value_reg_hi = locations->GetTemp(1).AsRegister<Register>(); + + __ vmovrrd(value_reg_lo, value_reg_hi, value_reg); + + GenerateWideAtomicStore(base, offset, + value_reg_lo, + value_reg_hi, + locations->GetTemp(2).AsRegister<Register>(), + locations->GetTemp(3).AsRegister<Register>()); + } else { + __ StoreDToOffset(value_reg, base, offset); + } break; } @@ -2625,75 +2727,138 @@ void InstructionCodeGeneratorARM::VisitInstanceFieldSet(HInstanceFieldSet* instr LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } } -void LocationsBuilderARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void LocationsBuilderARM::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + + if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimDouble)) { + // Arm encoding have some additional constraints for ldrexd/strexd: + // - registers need to be consecutive + // - the first register should be even but not R14. + // We don't test for Arm yet, and the assertion makes sure that we revisit this if we ever + // enable Arm encoding. 
+ DCHECK_EQ(InstructionSet::kThumb2, codegen_->GetInstructionSet()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + } } -void InstructionCodeGeneratorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); + Register base = locations->InAt(0).AsRegister<Register>(); + Location out = locations->Out(); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - switch (instruction->GetType()) { + switch (field_type) { case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset); + __ LoadFromOffset(kLoadUnsignedByte, out.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadSignedByte, out, obj, offset); + __ LoadFromOffset(kLoadSignedByte, out.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset); + __ LoadFromOffset(kLoadSignedHalfword, out.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset); + __ LoadFromOffset(kLoadUnsignedHalfword, out.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadWord, out, obj, offset); + __ LoadFromOffset(kLoadWord, out.AsRegister<Register>(), base, offset); break; } case Primitive::kPrimLong: { - // TODO: support volatile. 
- Location out = locations->Out(); - __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), obj, offset); + if (is_volatile) { + GenerateWideAtomicLoad(base, offset, + out.AsRegisterPairLow<Register>(), + out.AsRegisterPairHigh<Register>()); + } else { + __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), base, offset); + } break; } case Primitive::kPrimFloat: { - SRegister out = locations->Out().AsFpuRegister<SRegister>(); - __ LoadSFromOffset(out, obj, offset); + __ LoadSFromOffset(out.AsFpuRegister<SRegister>(), base, offset); break; } case Primitive::kPrimDouble: { - DRegister out = FromLowSToD(locations->Out().AsFpuRegisterPairLow<SRegister>()); - __ LoadDFromOffset(out, obj, offset); + DRegister out_reg = FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()); + if (is_volatile) { + Register lo = locations->GetTemp(0).AsRegister<Register>(); + Register hi = locations->GetTemp(1).AsRegister<Register>(); + GenerateWideAtomicLoad(base, offset, lo, hi); + __ vmovdrr(out_reg, lo, hi); + } else { + __ LoadDFromOffset(out_reg, base, offset); + } break; } case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); + LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } +} + +void LocationsBuilderARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); } void LocationsBuilderARM::VisitNullCheck(HNullCheck* instruction) { @@ -3206,146 +3371,6 @@ void InstructionCodeGeneratorARM::GenerateClassInitializationCheck( __ Bind(slow_path->GetExitLabel()); } -void LocationsBuilderARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -void InstructionCodeGeneratorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - Register cls = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - - switch (instruction->GetType()) { - case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadUnsignedByte, out, cls, offset); - break; - } - - case 
Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadSignedByte, out, cls, offset); - break; - } - - case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadSignedHalfword, out, cls, offset); - break; - } - - case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadUnsignedHalfword, out, cls, offset); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - Register out = locations->Out().AsRegister<Register>(); - __ LoadFromOffset(kLoadWord, out, cls, offset); - break; - } - - case Primitive::kPrimLong: { - // TODO: support volatile. - Location out = locations->Out(); - __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), cls, offset); - break; - } - - case Primitive::kPrimFloat: { - SRegister out = locations->Out().AsFpuRegister<SRegister>(); - __ LoadSFromOffset(out, cls, offset); - break; - } - - case Primitive::kPrimDouble: { - DRegister out = FromLowSToD(locations->Out().AsFpuRegisterPairLow<SRegister>()); - __ LoadDFromOffset(out, cls, offset); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); - UNREACHABLE(); - } -} - -void LocationsBuilderARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(instruction->GetFieldType(), instruction->GetValue()); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - // Temporary registers for the write barrier. - if (needs_write_barrier) { - locations->AddTemp(Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); - } -} - -void InstructionCodeGeneratorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - Register cls = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - Primitive::Type field_type = instruction->GetFieldType(); - - switch (field_type) { - case Primitive::kPrimBoolean: - case Primitive::kPrimByte: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreByte, value, cls, offset); - break; - } - - case Primitive::kPrimShort: - case Primitive::kPrimChar: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreHalfword, value, cls, offset); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ StoreToOffset(kStoreWord, value, cls, offset); - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) { - Register temp = locations->GetTemp(0).AsRegister<Register>(); - Register card = locations->GetTemp(1).AsRegister<Register>(); - codegen_->MarkGCCard(temp, card, cls, value); - } - break; - } - - case Primitive::kPrimLong: { - Location value = locations->InAt(1); - __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), cls, offset); - break; - } - - case Primitive::kPrimFloat: { - SRegister value = locations->InAt(1).AsFpuRegister<SRegister>(); - __ StoreSToOffset(value, cls, offset); - break; - } - - case Primitive::kPrimDouble: { - DRegister value = 
FromLowSToD(locations->InAt(1).AsFpuRegisterPairLow<SRegister>()); - __ StoreDToOffset(value, cls, offset); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << field_type; - UNREACHABLE(); - } -} - void LocationsBuilderARM::VisitLoadString(HLoadString* load) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, LocationSummary::kCallOnSlowPath); diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 226e635d05..b86670d939 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -18,6 +18,7 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM_H_ #include "code_generator.h" +#include "dex/compiler_enums.h" #include "nodes.h" #include "parallel_move_resolver.h" #include "utils/arm/assembler_thumb2.h" @@ -110,6 +111,8 @@ class LocationsBuilderARM : public HGraphVisitor { void HandleInvoke(HInvoke* invoke); void HandleBitwiseOperation(HBinaryOperation* operation); void HandleShift(HBinaryOperation* operation); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); CodeGeneratorARM* const codegen_; InvokeDexCallingConventionVisitor parameter_visitor_; @@ -138,6 +141,15 @@ class InstructionCodeGeneratorARM : public HGraphVisitor { void GenerateClassInitializationCheck(SlowPathCodeARM* slow_path, Register class_reg); void HandleBitwiseOperation(HBinaryOperation* operation); void HandleShift(HBinaryOperation* operation); + void GenerateMemoryBarrier(MemBarrierKind kind); + void GenerateWideAtomicStore(Register addr, uint32_t offset, + Register value_lo, Register value_hi, + Register temp1, Register temp2); + void GenerateWideAtomicLoad(Register addr, uint32_t offset, + Register out_lo, Register out_hi); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + ArmAssembler* const assembler_; CodeGeneratorARM* const codegen_; diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index e7edd8a805..ddb0e82ec0 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -2656,82 +2656,115 @@ void InstructionCodeGeneratorX86::VisitPhi(HPhi* instruction) { LOG(FATAL) << "Unreachable"; } -void LocationsBuilderX86::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) { + /* + * According to the JSR-133 Cookbook, for x86 only StoreLoad/AnyAny barriers need memory fence. + * All other barriers (LoadAny, AnyStore, StoreStore) are nops due to the x86 memory model. + * For those cases, all we need to ensure is that there is a scheduling barrier in place. 
+ */ + switch (kind) { + case MemBarrierKind::kAnyAny: { + __ mfence(); + break; + } + case MemBarrierKind::kAnyStore: + case MemBarrierKind::kLoadAny: + case MemBarrierKind::kStoreStore: { + // nop + break; + } + default: + LOG(FATAL) << "Unexpected memory barrier " << kind; + } +} + + +void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) { + Label is_null; + __ testl(value, value); + __ j(kEqual, &is_null); + __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value())); + __ movl(temp, object); + __ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift)); + __ movb(Address(temp, card, TIMES_1, 0), + X86ManagedRegister::FromCpuRegister(card).AsByteRegister()); + __ Bind(&is_null); +} + +void LocationsBuilderX86::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); - Primitive::Type field_type = instruction->GetFieldType(); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); - bool is_byte_type = (field_type == Primitive::kPrimBoolean) - || (field_type == Primitive::kPrimByte); - // The register allocator does not support multiple - // inputs that die at entry with one in a specific register. - if (is_byte_type) { - // Ensure the value is in a byte register. - locations->SetInAt(1, Location::RegisterLocation(EAX)); - } else { - locations->SetInAt(1, Location::RequiresRegister()); - } - // Temporary registers for the write barrier. - if (needs_write_barrier) { - locations->AddTemp(Location::RequiresRegister()); - // Ensure the card is in a byte register. - locations->AddTemp(Location::RegisterLocation(ECX)); + if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) { + // Long values can be loaded atomically into an XMM using movsd. + // So we use an XMM register as a temp to achieve atomicity (first load the temp into the XMM + // and then copy the XMM into the output 32bits at a time). 
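As an illustration of the comment above (a sketch only, not the code the generator emits): an aligned 8-byte SSE access on x86 is a single memory operation, so routing a long through an XMM register avoids a torn read even on 32-bit x86. The intrinsic-level equivalent of movsd + movd + psrlq + movd looks roughly like the following; the helper name and the 8-byte alignment of src are assumptions of this illustration, not ART code.

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <cstdint>

    // Load a 64-bit value with one 8-byte SSE access, then split it into two
    // 32-bit halves for a 32-bit register pair.
    static inline void AtomicLoad64ViaXmm(const int64_t* src, uint32_t* lo, uint32_t* hi) {
      __m128i tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));  // single 64-bit load (movsd/movq)
      *lo = static_cast<uint32_t>(_mm_cvtsi128_si32(tmp));                   // movd: low half
      tmp = _mm_srli_epi64(tmp, 32);                                         // psrlq $32
      *hi = static_cast<uint32_t>(_mm_cvtsi128_si32(tmp));                   // movd: high half
    }

The generated code further down follows the same shape, with the XMM temp reserved here in the LocationSummary.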
+ locations->AddTemp(Location::RequiresFpuRegister()); } } -void InstructionCodeGeneratorX86::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorX86::HandleFieldGet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - Primitive::Type field_type = instruction->GetFieldType(); + Register base = locations->InAt(0).AsRegister<Register>(); + Location out = locations->Out(); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); switch (field_type) { - case Primitive::kPrimBoolean: + case Primitive::kPrimBoolean: { + __ movzxb(out.AsRegister<Register>(), Address(base, offset)); + break; + } + case Primitive::kPrimByte: { - ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>(); - __ movb(Address(obj, offset), value); + __ movsxb(out.AsRegister<Register>(), Address(base, offset)); + break; + } + + case Primitive::kPrimShort: { + __ movsxw(out.AsRegister<Register>(), Address(base, offset)); break; } - case Primitive::kPrimShort: case Primitive::kPrimChar: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ movw(Address(obj, offset), value); + __ movzxw(out.AsRegister<Register>(), Address(base, offset)); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ movl(Address(obj, offset), value); - - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { - Register temp = locations->GetTemp(0).AsRegister<Register>(); - Register card = locations->GetTemp(1).AsRegister<Register>(); - codegen_->MarkGCCard(temp, card, obj, value); - } + __ movl(out.AsRegister<Register>(), Address(base, offset)); break; } case Primitive::kPrimLong: { - Location value = locations->InAt(1); - __ movl(Address(obj, offset), value.AsRegisterPairLow<Register>()); - __ movl(Address(obj, kX86WordSize + offset), value.AsRegisterPairHigh<Register>()); + if (is_volatile) { + XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movsd(temp, Address(base, offset)); + __ movd(out.AsRegisterPairLow<Register>(), temp); + __ psrlq(temp, Immediate(32)); + __ movd(out.AsRegisterPairHigh<Register>(), temp); + } else { + __ movl(out.AsRegisterPairLow<Register>(), Address(base, offset)); + __ movl(out.AsRegisterPairHigh<Register>(), Address(base, kX86WordSize + offset)); + } break; } case Primitive::kPrimFloat: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movss(Address(obj, offset), value); + __ movss(out.AsFpuRegister<XmmRegister>(), Address(base, offset)); break; } case Primitive::kPrimDouble: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movsd(Address(obj, offset), value); + __ movsd(out.AsFpuRegister<XmmRegister>(), Address(base, offset)); break; } @@ -2739,87 +2772,152 @@ void InstructionCodeGeneratorX86::VisitInstanceFieldSet(HInstanceFieldSet* instr LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } -} -void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) { - Label is_null; - __ testl(value, value); - __ j(kEqual, &is_null); - __ fs()->movl(card, 
Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value())); - __ movl(temp, object); - __ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift)); - __ movb(Address(temp, card, TIMES_1, 0), - X86ManagedRegister::FromCpuRegister(card).AsByteRegister()); - __ Bind(&is_null); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } } -void LocationsBuilderX86::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void LocationsBuilderX86::HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool is_byte_type = (field_type == Primitive::kPrimBoolean) + || (field_type == Primitive::kPrimByte); + + // The register allocator does not support multiple + // inputs that die at entry with one in a specific register. + if (is_byte_type) { + // Ensure the value is in a byte register. + locations->SetInAt(1, Location::RegisterLocation(EAX)); + } else { + locations->SetInAt(1, Location::RequiresRegister()); + } + // Temporary registers for the write barrier. + if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { + locations->AddTemp(Location::RequiresRegister()); + // Ensure the card is in a byte register. + locations->AddTemp(Location::RegisterLocation(ECX)); + } else if (is_volatile && (field_type == Primitive::kPrimLong)) { + // 64bits value can be atomically written to an address with movsd and an XMM register. + // We need two XMM registers because there's no easier way to (bit) copy a register pair + // into a single XMM register (we copy each pair part into the XMMs and then interleave them). + // NB: We could make the register allocator understand fp_reg <-> core_reg moves but given the + // isolated cases when we need this it isn't worth adding the extra complexity. 
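The store side interleaves the two 32-bit halves back into one XMM register so that the memory write is a single 8-byte access. A sketch of the same trick with SSE2 intrinsics (again an illustration, not ART code; the helper name and the 8-byte alignment of dst are assumed):

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <cstdint>

    // Combine two 32-bit halves into one XMM register and write them back with
    // a single 8-byte store -- the intrinsic equivalent of movd + movd + punpckldq + movsd.
    static inline void AtomicStore64ViaXmm(int64_t* dst, uint32_t lo, uint32_t hi) {
      __m128i xlo = _mm_cvtsi32_si128(static_cast<int>(lo));   // movd: low half into an XMM
      __m128i xhi = _mm_cvtsi32_si128(static_cast<int>(hi));   // movd: high half into an XMM
      __m128i val = _mm_unpacklo_epi32(xlo, xhi);              // punpckldq: lo in bits 0-31, hi in bits 32-63
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), val);  // single 64-bit store
    }

punpckldq places the destination's low dword in bits 0-31 and the source's low dword in bits 32-63, which matches the lo/hi layout of the register pair being stored.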
+ locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } } -void InstructionCodeGeneratorX86::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void InstructionCodeGeneratorX86::HandleFieldSet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); + Register base = locations->InAt(0).AsRegister<Register>(); + Location value = locations->InAt(1); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - switch (instruction->GetType()) { - case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); - __ movzxb(out, Address(obj, offset)); - break; - } + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + } + switch (field_type) { + case Primitive::kPrimBoolean: case Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); - __ movsxb(out, Address(obj, offset)); - break; - } - - case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); - __ movsxw(out, Address(obj, offset)); + __ movb(Address(base, offset), value.AsRegister<ByteRegister>()); break; } + case Primitive::kPrimShort: case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); - __ movzxw(out, Address(obj, offset)); + __ movw(Address(base, offset), value.AsRegister<Register>()); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - Register out = locations->Out().AsRegister<Register>(); - __ movl(out, Address(obj, offset)); + __ movl(Address(base, offset), value.AsRegister<Register>()); + + if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { + Register temp = locations->GetTemp(0).AsRegister<Register>(); + Register card = locations->GetTemp(1).AsRegister<Register>(); + codegen_->MarkGCCard(temp, card, base, value.AsRegister<Register>()); + } break; } case Primitive::kPrimLong: { - // TODO: support volatile. 
- __ movl(locations->Out().AsRegisterPairLow<Register>(), Address(obj, offset)); - __ movl(locations->Out().AsRegisterPairHigh<Register>(), Address(obj, kX86WordSize + offset)); + if (is_volatile) { + XmmRegister temp1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + XmmRegister temp2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + __ movd(temp1, value.AsRegisterPairLow<Register>()); + __ movd(temp2, value.AsRegisterPairHigh<Register>()); + __ punpckldq(temp1, temp2); + __ movsd(Address(base, offset), temp1); + } else { + __ movl(Address(base, offset), value.AsRegisterPairLow<Register>()); + __ movl(Address(base, kX86WordSize + offset), value.AsRegisterPairHigh<Register>()); + } break; } case Primitive::kPrimFloat: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movss(out, Address(obj, offset)); + __ movss(Address(base, offset), value.AsFpuRegister<XmmRegister>()); break; } case Primitive::kPrimDouble: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movsd(out, Address(obj, offset)); + __ movsd(Address(base, offset), value.AsFpuRegister<XmmRegister>()); break; } case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); + LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } +} + +void LocationsBuilderX86::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderX86::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderX86::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderX86::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); } void LocationsBuilderX86::VisitNullCheck(HNullCheck* instruction) { @@ -3383,159 +3481,6 @@ void InstructionCodeGeneratorX86::GenerateClassInitializationCheck( // No need for memory fence, thanks to the X86 memory model. 
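Taken together, the volatile paths bracket the access with barriers: kAnyStore before a volatile store and kAnyAny after it, and kLoadAny after a volatile load; on x86 only the kAnyAny case costs an mfence. Roughly the same placement, expressed with C++11 fences purely as an analogy (this is not the code the compiler emits):

    #include <atomic>
    #include <cstdint>

    std::atomic<int32_t> g_field{0};  // stand-in for a volatile Java field

    void VolatileStyleStore(int32_t value) {
      std::atomic_thread_fence(std::memory_order_release);  // ~ kAnyStore before the store
      g_field.store(value, std::memory_order_relaxed);
      std::atomic_thread_fence(std::memory_order_seq_cst);  // ~ kAnyAny after the store
    }

    int32_t VolatileStyleLoad() {
      int32_t value = g_field.load(std::memory_order_relaxed);
      std::atomic_thread_fence(std::memory_order_acquire);  // ~ kLoadAny after the load
      return value;
    }

On x86 the release and acquire fences reduce to compiler-reordering barriers, while the seq_cst fence becomes mfence, mirroring the GenerateMemoryBarrier switch above.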
} -void LocationsBuilderX86::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -void InstructionCodeGeneratorX86::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - Register cls = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - - switch (instruction->GetType()) { - case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); - __ movzxb(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); - __ movsxb(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); - __ movsxw(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); - __ movzxw(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - Register out = locations->Out().AsRegister<Register>(); - __ movl(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimLong: { - // TODO: support volatile. - __ movl(locations->Out().AsRegisterPairLow<Register>(), Address(cls, offset)); - __ movl(locations->Out().AsRegisterPairHigh<Register>(), Address(cls, kX86WordSize + offset)); - break; - } - - case Primitive::kPrimFloat: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movss(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimDouble: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movsd(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); - UNREACHABLE(); - } -} - -void LocationsBuilderX86::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - locations->SetInAt(0, Location::RequiresRegister()); - Primitive::Type field_type = instruction->GetFieldType(); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); - bool is_byte_type = (field_type == Primitive::kPrimBoolean) - || (field_type == Primitive::kPrimByte); - // The register allocator does not support multiple - // inputs that die at entry with one in a specific register. - if (is_byte_type) { - // Ensure the value is in a byte register. - locations->SetInAt(1, Location::RegisterLocation(EAX)); - } else { - locations->SetInAt(1, Location::RequiresRegister()); - } - // Temporary registers for the write barrier. - if (needs_write_barrier) { - locations->AddTemp(Location::RequiresRegister()); - // Ensure the card is in a byte register. 
- locations->AddTemp(Location::RegisterLocation(ECX)); - } -} - -void InstructionCodeGeneratorX86::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - Register cls = locations->InAt(0).AsRegister<Register>(); - uint32_t offset = instruction->GetFieldOffset().Uint32Value(); - Primitive::Type field_type = instruction->GetFieldType(); - - switch (field_type) { - case Primitive::kPrimBoolean: - case Primitive::kPrimByte: { - ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>(); - __ movb(Address(cls, offset), value); - break; - } - - case Primitive::kPrimShort: - case Primitive::kPrimChar: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ movw(Address(cls, offset), value); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - Register value = locations->InAt(1).AsRegister<Register>(); - __ movl(Address(cls, offset), value); - - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { - Register temp = locations->GetTemp(0).AsRegister<Register>(); - Register card = locations->GetTemp(1).AsRegister<Register>(); - codegen_->MarkGCCard(temp, card, cls, value); - } - break; - } - - case Primitive::kPrimLong: { - Location value = locations->InAt(1); - __ movl(Address(cls, offset), value.AsRegisterPairLow<Register>()); - __ movl(Address(cls, kX86WordSize + offset), value.AsRegisterPairHigh<Register>()); - break; - } - - case Primitive::kPrimFloat: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movss(Address(cls, offset), value); - break; - } - - case Primitive::kPrimDouble: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movsd(Address(cls, offset), value); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << field_type; - UNREACHABLE(); - } -} - void LocationsBuilderX86::VisitLoadString(HLoadString* load) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, LocationSummary::kCallOnSlowPath); diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index aed06c04df..636f8845e5 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -18,6 +18,7 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_ #include "code_generator.h" +#include "dex/compiler_enums.h" #include "nodes.h" #include "parallel_move_resolver.h" #include "utils/x86/assembler_x86.h" @@ -105,6 +106,8 @@ class LocationsBuilderX86 : public HGraphVisitor { void HandleBitwiseOperation(HBinaryOperation* instruction); void HandleInvoke(HInvoke* invoke); void HandleShift(HBinaryOperation* instruction); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); CodeGeneratorX86* const codegen_; InvokeDexCallingConventionVisitor parameter_visitor_; @@ -137,6 +140,9 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor { void GenerateShlLong(const Location& loc, Register shifter); void GenerateShrLong(const Location& loc, Register shifter); void GenerateUShrLong(const Location& loc, Register shifter); + void GenerateMemoryBarrier(MemBarrierKind kind); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); X86Assembler* const assembler_; CodeGeneratorX86* const codegen_; diff --git 
a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index ff7fcdcbac..1bc3092d15 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -2389,69 +2389,87 @@ void InstructionCodeGeneratorX86_64::VisitPhi(HPhi* instruction) { LOG(FATAL) << "Unimplemented"; } -void LocationsBuilderX86_64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorX86_64::GenerateMemoryBarrier(MemBarrierKind kind) { + /* + * According to the JSR-133 Cookbook, for x86 only StoreLoad/AnyAny barriers need memory fence. + * All other barriers (LoadAny, AnyStore, StoreStore) are nops due to the x86 memory model. + * For those cases, all we need to ensure is that there is a scheduling barrier in place. + */ + switch (kind) { + case MemBarrierKind::kAnyAny: { + __ mfence(); + break; + } + case MemBarrierKind::kAnyStore: + case MemBarrierKind::kLoadAny: + case MemBarrierKind::kStoreStore: { + // nop + break; + } + default: + LOG(FATAL) << "Unexpected memory barrier " << kind; + } +} + +void LocationsBuilderX86_64::HandleFieldGet(HInstruction* instruction) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - Primitive::Type field_type = instruction->GetFieldType(); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue()); locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); - if (needs_write_barrier) { - // Temporary registers for the write barrier. - locations->AddTemp(Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); - } + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } -void InstructionCodeGeneratorX86_64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { +void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet()); + LocationSummary* locations = instruction->GetLocations(); - CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); - size_t offset = instruction->GetFieldOffset().SizeValue(); - Primitive::Type field_type = instruction->GetFieldType(); + CpuRegister base = locations->InAt(0).AsRegister<CpuRegister>(); + Location out = locations->Out(); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); switch (field_type) { - case Primitive::kPrimBoolean: + case Primitive::kPrimBoolean: { + __ movzxb(out.AsRegister<CpuRegister>(), Address(base, offset)); + break; + } + case Primitive::kPrimByte: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movb(Address(obj, offset), value); + __ movsxb(out.AsRegister<CpuRegister>(), Address(base, offset)); + break; + } + + case Primitive::kPrimShort: { + __ movsxw(out.AsRegister<CpuRegister>(), Address(base, offset)); break; } - case Primitive::kPrimShort: case Primitive::kPrimChar: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movw(Address(obj, offset), value); + __ movzxw(out.AsRegister<CpuRegister>(), Address(base, offset)); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - CpuRegister value =

locations->InAt(1).AsRegister<CpuRegister>(); - __ movl(Address(obj, offset), value); - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) { - CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); - CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>(); - codegen_->MarkGCCard(temp, card, obj, value); - } + __ movl(out.AsRegister<CpuRegister>(), Address(base, offset)); break; } case Primitive::kPrimLong: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movq(Address(obj, offset), value); + __ movq(out.AsRegister<CpuRegister>(), Address(base, offset)); break; } case Primitive::kPrimFloat: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movss(Address(obj, offset), value); + __ movss(out.AsFpuRegister<XmmRegister>(), Address(base, offset)); break; } case Primitive::kPrimDouble: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movsd(Address(obj, offset), value); + __ movsd(out.AsFpuRegister<XmmRegister>(), Address(base, offset)); break; } @@ -2459,74 +2477,124 @@ void InstructionCodeGeneratorX86_64::VisitInstanceFieldSet(HInstanceFieldSet* in LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } } -void LocationsBuilderX86_64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void LocationsBuilderX86_64::HandleFieldSet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(field_info.GetFieldType(), instruction->InputAt(1)); + locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + locations->SetInAt(1, Location::RequiresRegister()); + if (needs_write_barrier) { + // Temporary registers for the write barrier. 
+ locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + } } -void InstructionCodeGeneratorX86_64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { +void InstructionCodeGeneratorX86_64::HandleFieldSet(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + LocationSummary* locations = instruction->GetLocations(); - CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>(); - size_t offset = instruction->GetFieldOffset().SizeValue(); + CpuRegister base = locations->InAt(0).AsRegister<CpuRegister>(); + Location value = locations->InAt(1); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - switch (instruction->GetType()) { - case Primitive::kPrimBoolean: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movzxb(out, Address(obj, offset)); - break; - } + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + } + switch (field_type) { + case Primitive::kPrimBoolean: case Primitive::kPrimByte: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movsxb(out, Address(obj, offset)); - break; - } - - case Primitive::kPrimShort: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movsxw(out, Address(obj, offset)); + __ movb(Address(base, offset), value.AsRegister<CpuRegister>()); break; } + case Primitive::kPrimShort: case Primitive::kPrimChar: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movzxw(out, Address(obj, offset)); + __ movw(Address(base, offset), value.AsRegister<CpuRegister>()); break; } case Primitive::kPrimInt: case Primitive::kPrimNot: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movl(out, Address(obj, offset)); + __ movl(Address(base, offset), value.AsRegister<CpuRegister>()); + if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) { + CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>(); + codegen_->MarkGCCard(temp, card, base, value.AsRegister<CpuRegister>()); + } break; } case Primitive::kPrimLong: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movq(out, Address(obj, offset)); + __ movq(Address(base, offset), value.AsRegister<CpuRegister>()); break; } case Primitive::kPrimFloat: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movss(out, Address(obj, offset)); + __ movss(Address(base, offset), value.AsFpuRegister<XmmRegister>()); break; } case Primitive::kPrimDouble: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movsd(out, Address(obj, offset)); + __ movsd(Address(base, offset), value.AsFpuRegister<XmmRegister>()); break; } case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); + LOG(FATAL) << "Unreachable type " << field_type; UNREACHABLE(); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } +} + +void LocationsBuilderX86_64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86_64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void 
LocationsBuilderX86_64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction); +} + +void InstructionCodeGeneratorX86_64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction); +} + +void InstructionCodeGeneratorX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGet(instruction, instruction->GetFieldInfo()); +} + +void LocationsBuilderX86_64::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); +} + +void InstructionCodeGeneratorX86_64::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSet(instruction, instruction->GetFieldInfo()); } void LocationsBuilderX86_64::VisitNullCheck(HNullCheck* instruction) { @@ -3222,146 +3290,6 @@ void InstructionCodeGeneratorX86_64::VisitClinitCheck(HClinitCheck* check) { check->GetLocations()->InAt(0).AsRegister<CpuRegister>()); } -void LocationsBuilderX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); -} - -void InstructionCodeGeneratorX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - CpuRegister cls = locations->InAt(0).AsRegister<CpuRegister>(); - size_t offset = instruction->GetFieldOffset().SizeValue(); - - switch (instruction->GetType()) { - case Primitive::kPrimBoolean: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movzxb(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimByte: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movsxb(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimShort: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movsxw(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimChar: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movzxw(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movl(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimLong: { - CpuRegister out = locations->Out().AsRegister<CpuRegister>(); - __ movq(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimFloat: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movss(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimDouble: { - XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>(); - __ movsd(out, Address(cls, offset)); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << instruction->GetType(); - UNREACHABLE(); - } -} - -void LocationsBuilderX86_64::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); - Primitive::Type field_type = instruction->GetFieldType(); - bool needs_write_barrier = - CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue()); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, 
Location::RequiresRegister()); - if (needs_write_barrier) { - // Temporary registers for the write barrier. - locations->AddTemp(Location::RequiresRegister()); - locations->AddTemp(Location::RequiresRegister()); - } -} - -void InstructionCodeGeneratorX86_64::VisitStaticFieldSet(HStaticFieldSet* instruction) { - LocationSummary* locations = instruction->GetLocations(); - CpuRegister cls = locations->InAt(0).AsRegister<CpuRegister>(); - size_t offset = instruction->GetFieldOffset().SizeValue(); - Primitive::Type field_type = instruction->GetFieldType(); - - switch (field_type) { - case Primitive::kPrimBoolean: - case Primitive::kPrimByte: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movb(Address(cls, offset), value); - break; - } - - case Primitive::kPrimShort: - case Primitive::kPrimChar: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movw(Address(cls, offset), value); - break; - } - - case Primitive::kPrimInt: - case Primitive::kPrimNot: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movl(Address(cls, offset), value); - if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) { - CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>(); - CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>(); - codegen_->MarkGCCard(temp, card, cls, value); - } - break; - } - - case Primitive::kPrimLong: { - CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>(); - __ movq(Address(cls, offset), value); - break; - } - - case Primitive::kPrimFloat: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movss(Address(cls, offset), value); - break; - } - - case Primitive::kPrimDouble: { - XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>(); - __ movsd(Address(cls, offset), value); - break; - } - - case Primitive::kPrimVoid: - LOG(FATAL) << "Unreachable type " << field_type; - UNREACHABLE(); - } -} - void LocationsBuilderX86_64::VisitLoadString(HLoadString* load) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, LocationSummary::kCallOnSlowPath); diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 794b81ffbc..070886460b 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -18,6 +18,7 @@ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_ #include "code_generator.h" +#include "dex/compiler_enums.h" #include "nodes.h" #include "parallel_move_resolver.h" #include "utils/x86_64/assembler_x86_64.h" @@ -109,6 +110,8 @@ class LocationsBuilderX86_64 : public HGraphVisitor { void HandleInvoke(HInvoke* invoke); void HandleBitwiseOperation(HBinaryOperation* operation); void HandleShift(HBinaryOperation* operation); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction); CodeGeneratorX86_64* const codegen_; InvokeDexCallingConventionVisitor parameter_visitor_; @@ -138,6 +141,9 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor { void HandleBitwiseOperation(HBinaryOperation* operation); void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleShift(HBinaryOperation* operation); + void GenerateMemoryBarrier(MemBarrierKind kind); + void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); X86_64Assembler* 
const assembler_; CodeGeneratorX86_64* const codegen_; diff --git a/compiler/optimizing/gvn_test.cc b/compiler/optimizing/gvn_test.cc index 94ff192264..48f1ea9e15 100644 --- a/compiler/optimizing/gvn_test.cc +++ b/compiler/optimizing/gvn_test.cc @@ -40,18 +40,22 @@ TEST(GVNTest, LocalFieldElimination) { entry->AddSuccessor(block); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, + MemberOffset(42), false)); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, + MemberOffset(42), false)); HInstruction* to_remove = block->GetLastInstruction(); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, MemberOffset(43))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, + MemberOffset(43), false)); HInstruction* different_offset = block->GetLastInstruction(); // Kill the value. block->AddInstruction(new (&allocator) HInstanceFieldSet( - parameter, parameter, Primitive::kPrimNot, MemberOffset(42))); + parameter, parameter, Primitive::kPrimNot, MemberOffset(42), false)); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimNot, + MemberOffset(42), false)); HInstruction* use_after_kill = block->GetLastInstruction(); block->AddInstruction(new (&allocator) HExit()); @@ -82,7 +86,8 @@ TEST(GVNTest, GlobalFieldElimination) { graph->AddBlock(block); entry->AddSuccessor(block); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); block->AddInstruction(new (&allocator) HIf(block->GetLastInstruction())); HBasicBlock* then = new (&allocator) HBasicBlock(graph); @@ -98,13 +103,16 @@ TEST(GVNTest, GlobalFieldElimination) { else_->AddSuccessor(join); then->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); then->AddInstruction(new (&allocator) HGoto()); else_->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); else_->AddInstruction(new (&allocator) HGoto()); join->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); join->AddInstruction(new (&allocator) HExit()); graph->TryBuildingSsa(); @@ -132,7 +140,8 @@ TEST(GVNTest, LoopFieldElimination) { graph->AddBlock(block); entry->AddSuccessor(block); block->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); block->AddInstruction(new (&allocator) HGoto()); HBasicBlock* loop_header = new (&allocator) HBasicBlock(graph); @@ -148,22 +157,25 @@ TEST(GVNTest, LoopFieldElimination) { loop_body->AddSuccessor(loop_header); loop_header->AddInstruction( - 
new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); HInstruction* field_get_in_loop_header = loop_header->GetLastInstruction(); loop_header->AddInstruction(new (&allocator) HIf(block->GetLastInstruction())); // Kill inside the loop body to prevent field gets inside the loop header // and the body to be GVN'ed. loop_body->AddInstruction(new (&allocator) HInstanceFieldSet( - parameter, parameter, Primitive::kPrimNot, MemberOffset(42))); + parameter, parameter, Primitive::kPrimNot, MemberOffset(42), false)); HInstruction* field_set = loop_body->GetLastInstruction(); loop_body->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); HInstruction* field_get_in_loop_body = loop_body->GetLastInstruction(); loop_body->AddInstruction(new (&allocator) HGoto()); exit->AddInstruction( - new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, MemberOffset(42))); + new (&allocator) HInstanceFieldGet(parameter, Primitive::kPrimBoolean, + MemberOffset(42), false)); HInstruction* field_get_in_exit = exit->GetLastInstruction(); exit->AddInstruction(new (&allocator) HExit()); @@ -242,7 +254,7 @@ TEST(GVNTest, LoopSideEffects) { { // Make one block with a side effect. entry->AddInstruction(new (&allocator) HInstanceFieldSet( - parameter, parameter, Primitive::kPrimNot, MemberOffset(42))); + parameter, parameter, Primitive::kPrimNot, MemberOffset(42), false)); GlobalValueNumberer gvn(&allocator, graph); gvn.Run(); @@ -256,7 +268,7 @@ TEST(GVNTest, LoopSideEffects) { { outer_loop_body->InsertInstructionBefore( new (&allocator) HInstanceFieldSet( - parameter, parameter, Primitive::kPrimNot, MemberOffset(42)), + parameter, parameter, Primitive::kPrimNot, MemberOffset(42), false), outer_loop_body->GetLastInstruction()); GlobalValueNumberer gvn(&allocator, graph); @@ -273,7 +285,7 @@ TEST(GVNTest, LoopSideEffects) { outer_loop_body->RemoveInstruction(outer_loop_body->GetFirstInstruction()); inner_loop_body->InsertInstructionBefore( new (&allocator) HInstanceFieldSet( - parameter, parameter, Primitive::kPrimNot, MemberOffset(42)), + parameter, parameter, Primitive::kPrimNot, MemberOffset(42), false), inner_loop_body->GetLastInstruction()); GlobalValueNumberer gvn(&allocator, graph); diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index c963b70492..0fc1fd8663 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -2128,39 +2128,45 @@ class HNullCheck : public HExpression<1> { class FieldInfo : public ValueObject { public: - FieldInfo(MemberOffset field_offset, Primitive::Type field_type) - : field_offset_(field_offset), field_type_(field_type) {} + FieldInfo(MemberOffset field_offset, Primitive::Type field_type, bool is_volatile) + : field_offset_(field_offset), field_type_(field_type), is_volatile_(is_volatile) {} MemberOffset GetFieldOffset() const { return field_offset_; } Primitive::Type GetFieldType() const { return field_type_; } + bool IsVolatile() const { return is_volatile_; } private: const MemberOffset field_offset_; const Primitive::Type field_type_; + const bool is_volatile_; }; class HInstanceFieldGet : public HExpression<1> { public: HInstanceFieldGet(HInstruction* value, Primitive::Type field_type, - MemberOffset field_offset) + MemberOffset 
field_offset, + bool is_volatile) : HExpression(field_type, SideEffects::DependsOnSomething()), - field_info_(field_offset, field_type) { + field_info_(field_offset, field_type, is_volatile) { SetRawInputAt(0, value); } - virtual bool CanBeMoved() const { return true; } - virtual bool InstructionDataEquals(HInstruction* other) const { - size_t other_offset = other->AsInstanceFieldGet()->GetFieldOffset().SizeValue(); - return other_offset == GetFieldOffset().SizeValue(); + bool CanBeMoved() const OVERRIDE { return !IsVolatile(); } + + bool InstructionDataEquals(HInstruction* other) const OVERRIDE { + HInstanceFieldGet* other_get = other->AsInstanceFieldGet(); + return GetFieldOffset().SizeValue() == other_get->GetFieldOffset().SizeValue(); } virtual size_t ComputeHashCode() const { return (HInstruction::ComputeHashCode() << 7) | GetFieldOffset().SizeValue(); } + const FieldInfo& GetFieldInfo() const { return field_info_; } MemberOffset GetFieldOffset() const { return field_info_.GetFieldOffset(); } Primitive::Type GetFieldType() const { return field_info_.GetFieldType(); } + bool IsVolatile() const { return field_info_.IsVolatile(); } DECLARE_INSTRUCTION(InstanceFieldGet); @@ -2175,15 +2181,18 @@ class HInstanceFieldSet : public HTemplateInstruction<2> { HInstanceFieldSet(HInstruction* object, HInstruction* value, Primitive::Type field_type, - MemberOffset field_offset) + MemberOffset field_offset, + bool is_volatile) : HTemplateInstruction(SideEffects::ChangesSomething()), - field_info_(field_offset, field_type) { + field_info_(field_offset, field_type, is_volatile) { SetRawInputAt(0, object); SetRawInputAt(1, value); } + const FieldInfo& GetFieldInfo() const { return field_info_; } MemberOffset GetFieldOffset() const { return field_info_.GetFieldOffset(); } Primitive::Type GetFieldType() const { return field_info_.GetFieldType(); } + bool IsVolatile() const { return field_info_.IsVolatile(); } HInstruction* GetValue() const { return InputAt(1); } @@ -2496,24 +2505,29 @@ class HStaticFieldGet : public HExpression<1> { public: HStaticFieldGet(HInstruction* cls, Primitive::Type field_type, - MemberOffset field_offset) + MemberOffset field_offset, + bool is_volatile) : HExpression(field_type, SideEffects::DependsOnSomething()), - field_info_(field_offset, field_type) { + field_info_(field_offset, field_type, is_volatile) { SetRawInputAt(0, cls); } - bool CanBeMoved() const OVERRIDE { return true; } + + bool CanBeMoved() const OVERRIDE { return !IsVolatile(); } + bool InstructionDataEquals(HInstruction* other) const OVERRIDE { - size_t other_offset = other->AsStaticFieldGet()->GetFieldOffset().SizeValue(); - return other_offset == GetFieldOffset().SizeValue(); + HStaticFieldGet* other_get = other->AsStaticFieldGet(); + return GetFieldOffset().SizeValue() == other_get->GetFieldOffset().SizeValue(); } size_t ComputeHashCode() const OVERRIDE { return (HInstruction::ComputeHashCode() << 7) | GetFieldOffset().SizeValue(); } + const FieldInfo& GetFieldInfo() const { return field_info_; } MemberOffset GetFieldOffset() const { return field_info_.GetFieldOffset(); } Primitive::Type GetFieldType() const { return field_info_.GetFieldType(); } + bool IsVolatile() const { return field_info_.IsVolatile(); } DECLARE_INSTRUCTION(StaticFieldGet); @@ -2528,15 +2542,18 @@ class HStaticFieldSet : public HTemplateInstruction<2> { HStaticFieldSet(HInstruction* cls, HInstruction* value, Primitive::Type field_type, - MemberOffset field_offset) + MemberOffset field_offset, + bool is_volatile) : 
HTemplateInstruction(SideEffects::ChangesSomething()), - field_info_(field_offset, field_type) { + field_info_(field_offset, field_type, is_volatile) { SetRawInputAt(0, cls); SetRawInputAt(1, value); } + const FieldInfo& GetFieldInfo() const { return field_info_; } MemberOffset GetFieldOffset() const { return field_info_.GetFieldOffset(); } Primitive::Type GetFieldType() const { return field_info_.GetFieldType(); } + bool IsVolatile() const { return field_info_.IsVolatile(); } HInstruction* GetValue() const { return InputAt(1); } @@ -2677,7 +2694,7 @@ class HMonitorOperation : public HTemplateInstruction<1> { DECLARE_INSTRUCTION(MonitorOperation); - protected: + private: const OperationKind kind_; const uint32_t dex_pc_; @@ -2685,7 +2702,6 @@ class HMonitorOperation : public HTemplateInstruction<1> { DISALLOW_COPY_AND_ASSIGN(HMonitorOperation); }; - class MoveOperands : public ArenaObject<kArenaAllocMisc> { public: MoveOperands(Location source, Location destination, HInstruction* instruction) diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc index f677e840ef..c2ea80ec33 100644 --- a/compiler/optimizing/register_allocator_test.cc +++ b/compiler/optimizing/register_allocator_test.cc @@ -462,7 +462,7 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, entry->AddSuccessor(block); HInstruction* test = new (allocator) HInstanceFieldGet( - parameter, Primitive::kPrimBoolean, MemberOffset(22)); + parameter, Primitive::kPrimBoolean, MemberOffset(22), false); block->AddInstruction(test); block->AddInstruction(new (allocator) HIf(test)); HBasicBlock* then = new (allocator) HBasicBlock(graph); @@ -481,8 +481,10 @@ static HGraph* BuildIfElseWithPhi(ArenaAllocator* allocator, *phi = new (allocator) HPhi(allocator, 0, 0, Primitive::kPrimInt); join->AddPhi(*phi); - *input1 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42)); - *input2 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42)); + *input1 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, + MemberOffset(42), false); + *input2 = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, + MemberOffset(42), false); then->AddInstruction(*input1); else_->AddInstruction(*input2); join->AddInstruction(new (allocator) HExit()); @@ -581,7 +583,8 @@ static HGraph* BuildFieldReturn(ArenaAllocator* allocator, graph->AddBlock(block); entry->AddSuccessor(block); - *field = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, MemberOffset(42)); + *field = new (allocator) HInstanceFieldGet(parameter, Primitive::kPrimInt, + MemberOffset(42), false); block->AddInstruction(*field); *ret = new (allocator) HReturn(*field); block->AddInstruction(*ret); diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h index c86ec4b3d6..87b38133fb 100644 --- a/compiler/utils/arm/assembler_arm.h +++ b/compiler/utils/arm/assembler_arm.h @@ -429,6 +429,8 @@ class ArmAssembler : public Assembler { virtual void ldrex(Register rd, Register rn, Condition cond = AL) = 0; virtual void strex(Register rd, Register rt, Register rn, Condition cond = AL) = 0; + virtual void ldrexd(Register rt, Register rt2, Register rn, Condition cond = AL) = 0; + virtual void strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond = AL) = 0; // Miscellaneous instructions. 
virtual void clrex(Condition cond = AL) = 0; diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc index 8f6d45ab53..8d1fb60725 100644 --- a/compiler/utils/arm/assembler_arm32.cc +++ b/compiler/utils/arm/assembler_arm32.cc @@ -778,6 +778,7 @@ void Arm32Assembler::EmitMulOp(Condition cond, int32_t opcode, Emit(encoding); } + void Arm32Assembler::ldrex(Register rt, Register rn, Condition cond) { CHECK_NE(rn, kNoRegister); CHECK_NE(rt, kNoRegister); @@ -793,6 +794,25 @@ void Arm32Assembler::ldrex(Register rt, Register rn, Condition cond) { } +void Arm32Assembler::ldrexd(Register rt, Register rt2, Register rn, Condition cond) { + CHECK_NE(rn, kNoRegister); + CHECK_NE(rt, kNoRegister); + CHECK_NE(rt2, kNoRegister); + CHECK_NE(rt, R14); + CHECK_EQ(0u, static_cast<uint32_t>(rt) % 2); + CHECK_EQ(static_cast<uint32_t>(rt) + 1, static_cast<uint32_t>(rt2)); + CHECK_NE(cond, kNoCondition); + + int32_t encoding = + (static_cast<uint32_t>(cond) << kConditionShift) | + B24 | B23 | B21 | B20 | + static_cast<uint32_t>(rn) << 16 | + static_cast<uint32_t>(rt) << 12 | + B11 | B10 | B9 | B8 | B7 | B4 | B3 | B2 | B1 | B0; + Emit(encoding); +} + + void Arm32Assembler::strex(Register rd, Register rt, Register rn, @@ -811,6 +831,28 @@ void Arm32Assembler::strex(Register rd, Emit(encoding); } +void Arm32Assembler::strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond) { + CHECK_NE(rd, kNoRegister); + CHECK_NE(rn, kNoRegister); + CHECK_NE(rt, kNoRegister); + CHECK_NE(rt2, kNoRegister); + CHECK_NE(rt, R14); + CHECK_NE(rd, rt); + CHECK_NE(rd, rt2); + CHECK_EQ(0u, static_cast<uint32_t>(rt) % 2); + CHECK_EQ(static_cast<uint32_t>(rt) + 1, static_cast<uint32_t>(rt2)); + CHECK_NE(cond, kNoCondition); + + int32_t encoding = + (static_cast<uint32_t>(cond) << kConditionShift) | + B24 | B23 | B21 | + static_cast<uint32_t>(rn) << 16 | + static_cast<uint32_t>(rd) << 12 | + B11 | B10 | B9 | B8 | B7 | B4 | + static_cast<uint32_t>(rt); + Emit(encoding); +} + void Arm32Assembler::clrex(Condition cond) { CHECK_EQ(cond, AL); // This cannot be conditional on ARM. diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h index 6c8d41587b..b922d66513 100644 --- a/compiler/utils/arm/assembler_arm32.h +++ b/compiler/utils/arm/assembler_arm32.h @@ -123,6 +123,8 @@ class Arm32Assembler FINAL : public ArmAssembler { void ldrex(Register rd, Register rn, Condition cond = AL) OVERRIDE; void strex(Register rd, Register rt, Register rn, Condition cond = AL) OVERRIDE; + void ldrexd(Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE; + void strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE; // Miscellaneous instructions. 
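On ARMv7 there is no plain 64-bit store guaranteed to be atomic, so a wide atomic store is built from an ldrexd/strexd retry loop; the checks in the A32 encodings above reflect the architectural restrictions (an even/consecutive data register pair, and a status register distinct from the data and address registers). A user-level sketch of that loop with GCC/Clang inline assembly, assuming an ARM target (the helper name is an assumption of this illustration; barriers for volatile semantics are emitted separately, as the ARM code generator does via GenerateMemoryBarrier):

    #include <cstdint>

    // The exclusive-access retry loop a 64-bit atomic store reduces to on ARMv7.
    // %H selects the high register of a 64-bit register pair; the compiler's
    // 64-bit register allocation supplies the even/odd pair that A32 ldrexd/strexd need.
    static inline void AtomicStore64Ldrexd(volatile int64_t* addr, int64_t value) {
      int64_t old;      // previous value returned by ldrexd (ignored here)
      uint32_t failed;  // strexd status: 0 on success, 1 if exclusivity was lost
      do {
        __asm__ __volatile__(
            "ldrexd %0, %H0, [%3]\n"      // claim exclusive access to the doubleword
            "strexd %1, %2, %H2, [%3]\n"  // attempt to store the new value
            : "=&r"(old), "=&r"(failed)
            : "r"(value), "r"(addr)
            : "memory");
      } while (failed != 0);              // exclusivity lost to another agent: retry
      (void)old;
    }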
void clrex(Condition cond = AL) OVERRIDE; diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc index 951792d45b..4a0ae0ba99 100644 --- a/compiler/utils/arm/assembler_arm32_test.cc +++ b/compiler/utils/arm/assembler_arm32_test.cc @@ -697,4 +697,28 @@ TEST_F(AssemblerArm32Test, Vmstat) { DriverStr(expected, "vmrs"); } +TEST_F(AssemblerArm32Test, ldrexd) { + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R0); + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R1); + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R2); + + const char* expected = + "ldrexd r0, r1, [r0]\n" + "ldrexd r0, r1, [r1]\n" + "ldrexd r0, r1, [r2]\n"; + DriverStr(expected, "ldrexd"); +} + +TEST_F(AssemblerArm32Test, strexd) { + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R0); + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R1); + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R2); + + const char* expected = + "strexd r9, r0, r1, [r0]\n" + "strexd r9, r0, r1, [r1]\n" + "strexd r9, r0, r1, [r2]\n"; + DriverStr(expected, "strexd"); +} + } // namespace art diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index 3eaae56376..3eccd3f9df 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -1662,9 +1662,6 @@ void Thumb2Assembler::ldrex(Register rt, Register rn, uint16_t imm, Condition co CHECK_NE(rn, kNoRegister); CHECK_NE(rt, kNoRegister); CheckCondition(cond); - CHECK_NE(rn, kNoRegister); - CHECK_NE(rt, kNoRegister); - CheckCondition(cond); CHECK_LT(imm, (1u << 10)); int32_t encoding = B31 | B30 | B29 | B27 | B22 | B20 | @@ -1701,6 +1698,22 @@ void Thumb2Assembler::strex(Register rd, } +void Thumb2Assembler::ldrexd(Register rt, Register rt2, Register rn, Condition cond) { + CHECK_NE(rn, kNoRegister); + CHECK_NE(rt, kNoRegister); + CHECK_NE(rt2, kNoRegister); + CHECK_NE(rt, rt2); + CheckCondition(cond); + + int32_t encoding = B31 | B30 | B29 | B27 | B23 | B22 | B20 | + static_cast<uint32_t>(rn) << 16 | + static_cast<uint32_t>(rt) << 12 | + static_cast<uint32_t>(rt2) << 8 | + B6 | B5 | B4 | B3 | B2 | B1 | B0; + Emit32(encoding); +} + + void Thumb2Assembler::strex(Register rd, Register rt, Register rn, @@ -1709,6 +1722,26 @@ void Thumb2Assembler::strex(Register rd, } +void Thumb2Assembler::strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond) { + CHECK_NE(rd, kNoRegister); + CHECK_NE(rn, kNoRegister); + CHECK_NE(rt, kNoRegister); + CHECK_NE(rt2, kNoRegister); + CHECK_NE(rt, rt2); + CHECK_NE(rd, rt); + CHECK_NE(rd, rt2); + CheckCondition(cond); + + int32_t encoding = B31 | B30 | B29 | B27 | B23 | B22 | + static_cast<uint32_t>(rn) << 16 | + static_cast<uint32_t>(rt) << 12 | + static_cast<uint32_t>(rt2) << 8 | + B6 | B5 | B4 | + static_cast<uint32_t>(rd); + Emit32(encoding); +} + + void Thumb2Assembler::clrex(Condition cond) { CheckCondition(cond); int32_t encoding = B31 | B30 | B29 | B27 | B28 | B25 | B24 | B23 | diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h index 48a3a7eeb2..81dd13894f 100644 --- a/compiler/utils/arm/assembler_thumb2.h +++ b/compiler/utils/arm/assembler_thumb2.h @@ -149,6 +149,8 @@ class Thumb2Assembler FINAL : public ArmAssembler { void ldrex(Register rd, Register rn, uint16_t imm, Condition cond = AL); void strex(Register rd, Register rt, Register rn, uint16_t imm, Condition cond = AL); + void ldrexd(Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE; + void 
strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE; // Miscellaneous instructions. void clrex(Condition cond = AL) OVERRIDE; diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc index 6ae95a40e6..425ccd7ea3 100644 --- a/compiler/utils/arm/assembler_thumb2_test.cc +++ b/compiler/utils/arm/assembler_thumb2_test.cc @@ -164,4 +164,32 @@ TEST_F(AssemblerThumb2Test, Vmstat) { DriverStr(expected, "vmrs"); } +TEST_F(AssemblerThumb2Test, ldrexd) { + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R0); + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R1); + GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R2); + GetAssembler()->ldrexd(arm::R5, arm::R3, arm::R7); + + const char* expected = + "ldrexd r0, r1, [r0]\n" + "ldrexd r0, r1, [r1]\n" + "ldrexd r0, r1, [r2]\n" + "ldrexd r5, r3, [r7]\n"; + DriverStr(expected, "ldrexd"); +} + +TEST_F(AssemblerThumb2Test, strexd) { + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R0); + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R1); + GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R2); + GetAssembler()->strexd(arm::R9, arm::R5, arm::R3, arm::R7); + + const char* expected = + "strexd r9, r0, r1, [r0]\n" + "strexd r9, r0, r1, [r1]\n" + "strexd r9, r0, r1, [r2]\n" + "strexd r9, r5, r3, [r7]\n"; + DriverStr(expected, "strexd"); +} + } // namespace art diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index f0353f6cd2..f8c0043242 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -443,6 +443,27 @@ void X86Assembler::movsd(XmmRegister dst, XmmRegister src) { } +void X86Assembler::psrlq(XmmRegister reg, const Immediate& shift_count) { + DCHECK(shift_count.is_uint8()); + + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x73); + EmitXmmRegisterOperand(2, reg); + EmitUint8(shift_count.value()); +} + + +void X86Assembler::punpckldq(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x62); + EmitXmmRegisterOperand(dst, src); +} + + void X86Assembler::addsd(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0xF2); diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index 9fecf1edf0..6c3d13122c 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -274,6 +274,9 @@ class X86Assembler FINAL : public Assembler { void movsd(const Address& dst, XmmRegister src); void movsd(XmmRegister dst, XmmRegister src); + void psrlq(XmmRegister reg, const Immediate& shift_count); + void punpckldq(XmmRegister dst, XmmRegister src); + void addsd(XmmRegister dst, XmmRegister src); void addsd(XmmRegister dst, const Address& src); void subsd(XmmRegister dst, XmmRegister src); diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc index d901673691..fccb510afb 100644 --- a/compiler/utils/x86/assembler_x86_test.cc +++ b/compiler/utils/x86/assembler_x86_test.cc @@ -105,6 +105,18 @@ TEST_F(AssemblerX86Test, Movl) { DriverStr(expected, "movl"); } +TEST_F(AssemblerX86Test, psrlq) { + GetAssembler()->psrlq(x86::XMM0, CreateImmediate(32)); + const char* expected = "psrlq $0x20, %xmm0\n"; + DriverStr(expected, "psrlq"); +} + +TEST_F(AssemblerX86Test, punpckldq) { + GetAssembler()->punpckldq(x86::XMM0, x86::XMM1); + const char* 
expected = "punpckldq %xmm1, %xmm0\n"; + DriverStr(expected, "punpckldq"); +} + TEST_F(AssemblerX86Test, LoadLongConstant) { GetAssembler()->LoadLongConstant(x86::XMM0, 51); const char* expected = diff --git a/runtime/Android.mk b/runtime/Android.mk index ca29eba4ee..13a216c48b 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -67,6 +67,7 @@ LIBART_COMMON_SRC_FILES := \ gc/space/rosalloc_space.cc \ gc/space/space.cc \ gc/space/zygote_space.cc \ + gc/task_processor.cc \ hprof/hprof.cc \ image.cc \ indirect_reference_table.cc \ diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 8f09e074f7..26d6117122 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -52,6 +52,7 @@ #include "gc/space/rosalloc_space-inl.h" #include "gc/space/space-inl.h" #include "gc/space/zygote_space.h" +#include "gc/task_processor.h" #include "entrypoints/quick/quick_alloc_entrypoints.h" #include "heap-inl.h" #include "image.h" @@ -129,10 +130,7 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max foreground_collector_type_(foreground_collector_type), background_collector_type_(background_collector_type), desired_collector_type_(foreground_collector_type_), - heap_trim_request_lock_(nullptr), - last_trim_time_(0), - heap_transition_or_trim_target_time_(0), - heap_trim_request_pending_(false), + pending_task_lock_(nullptr), parallel_gc_threads_(parallel_gc_threads), conc_gc_threads_(conc_gc_threads), low_memory_mode_(low_memory_mode), @@ -142,8 +140,6 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max zygote_creation_lock_("zygote creation lock", kZygoteCreationLock), zygote_space_(nullptr), large_object_threshold_(large_object_threshold), - gc_request_pending_(false), - conc_gc_running_(false), collector_type_running_(kCollectorTypeNone), last_gc_type_(collector::kGcTypeNone), next_gc_type_(collector::kGcTypePartial), @@ -194,6 +190,8 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max min_interval_homogeneous_space_compaction_by_oom_( min_interval_homogeneous_space_compaction_by_oom), last_time_homogeneous_space_compaction_by_oom_(NanoTime()), + pending_collector_transition_(nullptr), + pending_heap_trim_(nullptr), use_homogeneous_space_compaction_for_oom_(use_homogeneous_space_compaction_for_oom) { if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) { LOG(INFO) << "Heap() entering"; @@ -409,9 +407,8 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max gc_complete_lock_ = new Mutex("GC complete lock"); gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable", *gc_complete_lock_)); - gc_request_lock_ = new Mutex("GC request lock"); - gc_request_cond_.reset(new ConditionVariable("GC request condition variable", *gc_request_lock_)); - heap_trim_request_lock_ = new Mutex("Heap trim request lock"); + task_processor_.reset(new TaskProcessor()); + pending_task_lock_ = new Mutex("Pending task lock"); if (ignore_max_footprint_) { SetIdealFootprint(std::numeric_limits<size_t>::max()); concurrent_start_bytes_ = std::numeric_limits<size_t>::max(); @@ -719,8 +716,8 @@ void Heap::VisitObjects(ObjectCallback callback, void* arg) { mirror::Object* obj = *it; if (obj != nullptr && obj->GetClass() != nullptr) { // Avoid the race condition caused by the object not yet being written into the allocation - // stack or the class not yet being written in the object. Or, if kUseThreadLocalAllocationStack, - // there can be nulls on the allocation stack. 
+ // stack or the class not yet being written in the object. Or, if + // kUseThreadLocalAllocationStack, there can be nulls on the allocation stack. callback(obj, arg); } } @@ -872,8 +869,7 @@ Heap::~Heap() { STLDeleteElements(&continuous_spaces_); STLDeleteElements(&discontinuous_spaces_); delete gc_complete_lock_; - delete gc_request_lock_; - delete heap_trim_request_lock_; + delete pending_task_lock_; VLOG(heap) << "Finished ~Heap()"; } @@ -944,37 +940,23 @@ void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType self->ThrowOutOfMemoryError(oss.str().c_str()); } -void Heap::DoPendingTransitionOrTrim() { - Thread* self = Thread::Current(); - CollectorType desired_collector_type; - // Wait until we reach the desired transition time. - while (true) { - uint64_t wait_time; - { - MutexLock mu(self, *heap_trim_request_lock_); - desired_collector_type = desired_collector_type_; - uint64_t current_time = NanoTime(); - if (current_time >= heap_transition_or_trim_target_time_) { - break; - } - wait_time = heap_transition_or_trim_target_time_ - current_time; - } - ScopedThreadStateChange tsc(self, kSleeping); - usleep(wait_time / 1000); // Usleep takes microseconds. - } +void Heap::DoPendingCollectorTransition() { + CollectorType desired_collector_type = desired_collector_type_; // Launch homogeneous space compaction if it is desired. if (desired_collector_type == kCollectorTypeHomogeneousSpaceCompact) { if (!CareAboutPauseTimes()) { PerformHomogeneousSpaceCompact(); + } else { + VLOG(gc) << "Homogeneous compaction ignored due to jank perceptible process state"; } - // No need to Trim(). Homogeneous space compaction may free more virtual and physical memory. - desired_collector_type = collector_type_; - return; + } else { + TransitionCollector(desired_collector_type); } - // Transition the collector if the desired collector type is not the same as the current - // collector type. - TransitionCollector(desired_collector_type); +} + +void Heap::Trim(Thread* self) { if (!CareAboutPauseTimes()) { + ATRACE_BEGIN("Deflating monitors"); // Deflate the monitors, this can cause a pause but shouldn't matter since we don't care // about pauses. Runtime* runtime = Runtime::Current(); @@ -984,9 +966,10 @@ void Heap::DoPendingTransitionOrTrim() { VLOG(heap) << "Deflating " << count << " monitors took " << PrettyDuration(NanoTime() - start_time); runtime->GetThreadList()->ResumeAll(); + ATRACE_END(); } - // Do a heap trim if it is needed. - Trim(); + TrimIndirectReferenceTables(self); + TrimSpaces(self); } class TrimIndirectReferenceTableClosure : public Closure { @@ -1004,17 +987,22 @@ class TrimIndirectReferenceTableClosure : public Closure { Barrier* const barrier_; }; - -void Heap::Trim() { - Thread* self = Thread::Current(); - { - MutexLock mu(self, *heap_trim_request_lock_); - if (!heap_trim_request_pending_ || last_trim_time_ + kHeapTrimWait >= NanoTime()) { - return; - } - last_trim_time_ = NanoTime(); - heap_trim_request_pending_ = false; - } +void Heap::TrimIndirectReferenceTables(Thread* self) { + ScopedObjectAccess soa(self); + ATRACE_BEGIN(__FUNCTION__); + JavaVMExt* vm = soa.Vm(); + // Trim globals indirect reference table. + vm->TrimGlobals(); + // Trim locals indirect reference tables. 
+ Barrier barrier(0); + TrimIndirectReferenceTableClosure closure(&barrier); + ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun); + size_t barrier_count = Runtime::Current()->GetThreadList()->RunCheckpoint(&closure); + barrier.Increment(self, barrier_count); + ATRACE_END(); +} + +void Heap::TrimSpaces(Thread* self) { { // Need to do this before acquiring the locks since we don't want to get suspended while // holding any locks. @@ -1026,20 +1014,8 @@ void Heap::Trim() { WaitForGcToCompleteLocked(kGcCauseTrim, self); collector_type_running_ = kCollectorTypeHeapTrim; } - // Trim reference tables. - { - ScopedObjectAccess soa(self); - JavaVMExt* vm = soa.Vm(); - // Trim globals indirect reference table. - vm->TrimGlobals(); - // Trim locals indirect reference tables. - Barrier barrier(0); - TrimIndirectReferenceTableClosure closure(&barrier); - ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun); - size_t barrier_count = Runtime::Current()->GetThreadList()->RunCheckpoint(&closure); - barrier.Increment(self, barrier_count); - } - uint64_t start_ns = NanoTime(); + ATRACE_BEGIN(__FUNCTION__); + const uint64_t start_ns = NanoTime(); // Trim the managed spaces. uint64_t total_alloc_space_allocated = 0; uint64_t total_alloc_space_size = 0; @@ -1089,6 +1065,7 @@ void Heap::Trim() { << PrettyDuration(end_ns - gc_heap_end_ns) << ", advised=" << PrettySize(native_reclaimed) << ") heaps. Managed heap utilization of " << static_cast<int>(100 * managed_utilization) << "%."; + ATRACE_END(); } bool Heap::IsValidObjectAddress(const mirror::Object* obj) const { @@ -1639,7 +1616,6 @@ HomogeneousSpaceCompactResult Heap::PerformHomogeneousSpaceCompact() { return HomogeneousSpaceCompactResult::kSuccess; } - void Heap::TransitionCollector(CollectorType collector_type) { if (collector_type == collector_type_) { return; @@ -2207,7 +2183,7 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCaus collector->Run(gc_cause, clear_soft_references || runtime->IsZygote()); total_objects_freed_ever_ += GetCurrentGcIteration()->GetFreedObjects(); total_bytes_freed_ever_ += GetCurrentGcIteration()->GetFreedBytes(); - RequestHeapTrim(); + RequestTrim(self); // Enqueue cleared references. reference_processor_.EnqueueClearedReferences(self); // Grow the heap so that we know when to perform the next GC. @@ -3032,52 +3008,109 @@ void Heap::RequestConcurrentGCAndSaveObject(Thread* self, mirror::Object** obj) RequestConcurrentGC(self); } -void Heap::RequestConcurrentGC(Thread* self) { - // Make sure that we can do a concurrent GC. 
+class Heap::ConcurrentGCTask : public HeapTask { + public: + explicit ConcurrentGCTask(uint64_t target_time) : HeapTask(target_time) { } + virtual void Run(Thread* self) OVERRIDE { + gc::Heap* heap = Runtime::Current()->GetHeap(); + heap->ConcurrentGC(self); + heap->ClearConcurrentGCRequest(); + } +}; + +static bool CanAddHeapTask(Thread* self) LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_) { Runtime* runtime = Runtime::Current(); - if (runtime == nullptr || !runtime->IsFinishedStarting() || runtime->IsShuttingDown(self) || - self->IsHandlingStackOverflow()) { - return; + return runtime != nullptr && runtime->IsFinishedStarting() && !runtime->IsShuttingDown(self) && + !self->IsHandlingStackOverflow(); +} + +void Heap::ClearConcurrentGCRequest() { + concurrent_gc_pending_.StoreRelaxed(false); +} + +void Heap::RequestConcurrentGC(Thread* self) { + if (CanAddHeapTask(self) && + concurrent_gc_pending_.CompareExchangeStrongSequentiallyConsistent(false, true)) { + task_processor_->AddTask(self, new ConcurrentGCTask(NanoTime())); // Start straight away. } - NotifyConcurrentGCRequest(self); } void Heap::ConcurrentGC(Thread* self) { - if (Runtime::Current()->IsShuttingDown(self)) { - return; - } - // Wait for any GCs currently running to finish. - if (WaitForGcToComplete(kGcCauseBackground, self) == collector::kGcTypeNone) { - // If the we can't run the GC type we wanted to run, find the next appropriate one and try that - // instead. E.g. can't do partial, so do full instead. - if (CollectGarbageInternal(next_gc_type_, kGcCauseBackground, false) == - collector::kGcTypeNone) { - for (collector::GcType gc_type : gc_plan_) { - // Attempt to run the collector, if we succeed, we are done. - if (gc_type > next_gc_type_ && - CollectGarbageInternal(gc_type, kGcCauseBackground, false) != collector::kGcTypeNone) { - break; + if (!Runtime::Current()->IsShuttingDown(self)) { + // Wait for any GCs currently running to finish. + if (WaitForGcToComplete(kGcCauseBackground, self) == collector::kGcTypeNone) { + // If we can't run the GC type we wanted to run, find the next appropriate one and try that + // instead. E.g. can't do partial, so do full instead. + if (CollectGarbageInternal(next_gc_type_, kGcCauseBackground, false) == - collector::kGcTypeNone) { + for (collector::GcType gc_type : gc_plan_) { + // Attempt to run the collector, if we succeed, we are done.
+ if (gc_type > next_gc_type_ && + CollectGarbageInternal(gc_type, kGcCauseBackground, false) != + collector::kGcTypeNone) { + break; + } } } } } } +class Heap::CollectorTransitionTask : public HeapTask { + public: + explicit CollectorTransitionTask(uint64_t target_time) : HeapTask(target_time) { } + virtual void Run(Thread* self) OVERRIDE { + gc::Heap* heap = Runtime::Current()->GetHeap(); + heap->DoPendingCollectorTransition(); + heap->ClearPendingCollectorTransition(self); + } +}; + +void Heap::ClearPendingCollectorTransition(Thread* self) { + MutexLock mu(self, *pending_task_lock_); + pending_collector_transition_ = nullptr; +} + void Heap::RequestCollectorTransition(CollectorType desired_collector_type, uint64_t delta_time) { Thread* self = Thread::Current(); + desired_collector_type_ = desired_collector_type; + if (desired_collector_type_ == collector_type_ || !CanAddHeapTask(self)) { + return; + } + CollectorTransitionTask* added_task = nullptr; + const uint64_t target_time = NanoTime() + delta_time; { - MutexLock mu(self, *heap_trim_request_lock_); - if (desired_collector_type_ == desired_collector_type) { + MutexLock mu(self, *pending_task_lock_); + // If we have an existing collector transition, update the target time to be the new target. + if (pending_collector_transition_ != nullptr) { + task_processor_->UpdateTargetRunTime(self, pending_collector_transition_, target_time); return; } - heap_transition_or_trim_target_time_ = - std::max(heap_transition_or_trim_target_time_, NanoTime() + delta_time); - desired_collector_type_ = desired_collector_type; + added_task = new CollectorTransitionTask(target_time); + pending_collector_transition_ = added_task; + } + task_processor_->AddTask(self, added_task); +} + +class Heap::HeapTrimTask : public HeapTask { + public: + explicit HeapTrimTask(uint64_t delta_time) : HeapTask(NanoTime() + delta_time) { } + virtual void Run(Thread* self) OVERRIDE { + gc::Heap* heap = Runtime::Current()->GetHeap(); + heap->Trim(self); + heap->ClearPendingTrim(self); } - SignalHeapTrimDaemon(self); +}; + +void Heap::ClearPendingTrim(Thread* self) { + MutexLock mu(self, *pending_task_lock_); + pending_heap_trim_ = nullptr; } -void Heap::RequestHeapTrim() { +void Heap::RequestTrim(Thread* self) { + if (!CanAddHeapTask(self)) { + return; + } // GC completed and now we must decide whether to request a heap trim (advising pages back to the // kernel) or not. Issuing a request will also cause trimming of the libc heap. As a trim scans // a space it will hold its lock and can become a cause of jank. @@ -3090,42 +3123,17 @@ void Heap::RequestHeapTrim() { // to utilization (which is probably inversely proportional to how much benefit we can expect). // We could try mincore(2) but that's only a measure of how many pages we haven't given away, // not how much use we're making of those pages. - - Thread* self = Thread::Current(); - Runtime* runtime = Runtime::Current(); - if (runtime == nullptr || !runtime->IsFinishedStarting() || runtime->IsShuttingDown(self) || - runtime->IsZygote()) { - // Ignore the request if we are the zygote to prevent app launching lag due to sleep in heap - // trimmer daemon. b/17310019 - // Heap trimming isn't supported without a Java runtime or Daemons (such as at dex2oat time) - // Also: we do not wish to start a heap trim if the runtime is shutting down (a racy check - // as we don't hold the lock while requesting the trim).
- return; - } + HeapTrimTask* added_task = nullptr; { - MutexLock mu(self, *heap_trim_request_lock_); - if (last_trim_time_ + kHeapTrimWait >= NanoTime()) { - // We have done a heap trim in the last kHeapTrimWait nanosecs, don't request another one - // just yet. + MutexLock mu(self, *pending_task_lock_); + if (pending_heap_trim_ != nullptr) { + // Already have a heap trim request in task processor, ignore this request. return; } - heap_trim_request_pending_ = true; - uint64_t current_time = NanoTime(); - if (heap_transition_or_trim_target_time_ < current_time) { - heap_transition_or_trim_target_time_ = current_time + kHeapTrimWait; - } + added_task = new HeapTrimTask(kHeapTrimWait); + pending_heap_trim_ = added_task; } - // Notify the daemon thread which will actually do the heap trim. - SignalHeapTrimDaemon(self); -} - -void Heap::SignalHeapTrimDaemon(Thread* self) { - JNIEnv* env = self->GetJniEnv(); - DCHECK(WellKnownClasses::java_lang_Daemons != nullptr); - DCHECK(WellKnownClasses::java_lang_Daemons_requestHeapTrim != nullptr); - env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, - WellKnownClasses::java_lang_Daemons_requestHeapTrim); - CHECK(!env->ExceptionCheck()); + task_processor_->AddTask(self, added_task); } void Heap::RevokeThreadLocalBuffers(Thread* thread) { @@ -3153,7 +3161,7 @@ void Heap::RevokeAllThreadLocalBuffers() { } bool Heap::IsGCRequestPending() const { - return concurrent_start_bytes_ != std::numeric_limits<size_t>::max(); + return concurrent_gc_pending_.LoadRelaxed(); } void Heap::RunFinalization(JNIEnv* env) { @@ -3235,7 +3243,7 @@ void Heap::AddModUnionTable(accounting::ModUnionTable* mod_union_table) { } void Heap::CheckPreconditionsForAllocObject(mirror::Class* c, size_t byte_count) { - CHECK(c == NULL || (c->IsClassClass() && byte_count >= sizeof(mirror::Class)) || + CHECK(c == nullptr || (c->IsClassClass() && byte_count >= sizeof(mirror::Class)) || (c->IsVariableSize() || c->GetObjectSize() == byte_count)); CHECK_GE(byte_count, sizeof(mirror::Object)); } @@ -3272,25 +3280,5 @@ void Heap::ClearMarkedObjects() { } } -void Heap::WaitForConcurrentGCRequest(Thread* self) { - ScopedThreadStateChange tsc(self, kBlocked); - MutexLock mu(self, *gc_request_lock_); - conc_gc_running_ = false; - while (!gc_request_pending_) { - gc_request_cond_->Wait(self); - } - gc_request_pending_ = false; - conc_gc_running_ = true; -} - -void Heap::NotifyConcurrentGCRequest(Thread* self) { - ScopedThreadStateChange tsc(self, kBlocked); - MutexLock mu(self, *gc_request_lock_); - if (!conc_gc_running_) { - gc_request_pending_ = true; - gc_request_cond_->Signal(self); - } -} - } // namespace gc } // namespace art diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index cf94eb6a9d..1738124c0c 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -57,6 +57,7 @@ namespace mirror { namespace gc { class ReferenceProcessor; +class TaskProcessor; namespace accounting { class HeapBitmap; @@ -470,11 +471,11 @@ class Heap { void DumpForSigQuit(std::ostream& os); - // Do a pending heap transition or trim. - void DoPendingTransitionOrTrim() LOCKS_EXCLUDED(heap_trim_request_lock_); + // Do a pending collector transition. + void DoPendingCollectorTransition(); - // Trim the managed and native heaps by releasing unused memory back to the OS. - void Trim() LOCKS_EXCLUDED(heap_trim_request_lock_); + // Deflate monitors, ... and trim the spaces. 
+ void Trim(Thread* self) LOCKS_EXCLUDED(gc_complete_lock_); void RevokeThreadLocalBuffers(Thread* thread); void RevokeRosAllocThreadLocalBuffers(Thread* thread); @@ -606,15 +607,25 @@ class Heap { ReferenceProcessor* GetReferenceProcessor() { return &reference_processor_; } + TaskProcessor* GetTaskProcessor() { + return task_processor_.get(); + } bool HasZygoteSpace() const { return zygote_space_ != nullptr; } - void WaitForConcurrentGCRequest(Thread* self) LOCKS_EXCLUDED(gc_request_lock_); - void NotifyConcurrentGCRequest(Thread* self) LOCKS_EXCLUDED(gc_request_lock_); + // Request an asynchronous trim. + void RequestTrim(Thread* self) LOCKS_EXCLUDED(pending_task_lock_); + + // Request asynchronous GC. + void RequestConcurrentGC(Thread* self) LOCKS_EXCLUDED(pending_task_lock_); private: + class ConcurrentGCTask; + class CollectorTransitionTask; + class HeapTrimTask; + // Compact source space to target space. void Compact(space::ContinuousMemMapAllocSpace* target_space, space::ContinuousMemMapAllocSpace* source_space, @@ -705,12 +716,10 @@ class Heap { EXCLUSIVE_LOCKS_REQUIRED(gc_complete_lock_); void RequestCollectorTransition(CollectorType desired_collector_type, uint64_t delta_time) - LOCKS_EXCLUDED(heap_trim_request_lock_); - void RequestHeapTrim() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_); + LOCKS_EXCLUDED(pending_task_lock_); + void RequestConcurrentGCAndSaveObject(Thread* self, mirror::Object** obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - void RequestConcurrentGC(Thread* self) - LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_); bool IsGCRequestPending() const; // Sometimes CollectGarbageInternal decides to run a different Gc than you requested. Returns @@ -771,10 +780,6 @@ class Heap { // Clear cards and update the mod union table. void ProcessCards(TimingLogger* timings, bool use_rem_sets); - // Signal the heap trim daemon that there is something to do, either a heap transition or heap - // trim. - void SignalHeapTrimDaemon(Thread* self); - // Push an object onto the allocation stack. void PushOnAllocationStack(Thread* self, mirror::Object** obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -783,12 +788,22 @@ class Heap { void PushOnThreadLocalAllocationStackWithInternalGC(Thread* thread, mirror::Object** obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + void ClearConcurrentGCRequest(); + void ClearPendingTrim(Thread* self) LOCKS_EXCLUDED(pending_task_lock_); + void ClearPendingCollectorTransition(Thread* self) LOCKS_EXCLUDED(pending_task_lock_); + // What kind of concurrency behavior is the runtime after? Currently true for concurrent mark // sweep GC, false for other GC types. bool IsGcConcurrent() const ALWAYS_INLINE { return collector_type_ == kCollectorTypeCMS || collector_type_ == kCollectorTypeCC; } + // Trim the managed and native spaces by releasing unused memory back to the OS. + void TrimSpaces(Thread* self) LOCKS_EXCLUDED(gc_complete_lock_); + + // Trim 0 pages at the end of reference tables. + void TrimIndirectReferenceTables(Thread* self); + // All-known continuous spaces, where objects lie within fixed bounds. std::vector<space::ContinuousSpace*> continuous_spaces_; @@ -835,14 +850,8 @@ class Heap { // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_. CollectorType desired_collector_type_; - // Lock which guards heap trim requests. - Mutex* heap_trim_request_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; - // When we want to perform the next heap trim (nano seconds). 
- uint64_t last_trim_time_ GUARDED_BY(heap_trim_request_lock_); - // When we want to perform the next heap transition (nano seconds) or heap trim. - uint64_t heap_transition_or_trim_target_time_ GUARDED_BY(heap_trim_request_lock_); - // If we have a heap trim request pending. - bool heap_trim_request_pending_ GUARDED_BY(heap_trim_request_lock_); + // Lock which guards pending tasks. + Mutex* pending_task_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; // How many GC threads we may use for paused parts of garbage collection. const size_t parallel_gc_threads_; @@ -879,15 +888,12 @@ class Heap { Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; std::unique_ptr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_); - // Guards concurrent GC requests. - Mutex* gc_request_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; - std::unique_ptr<ConditionVariable> gc_request_cond_ GUARDED_BY(gc_request_lock_); - bool gc_request_pending_ GUARDED_BY(gc_request_lock_); - bool conc_gc_running_ GUARDED_BY(gc_request_lock_); - // Reference processor; ReferenceProcessor reference_processor_; + // Task processor, proxies heap trim requests to the daemon threads. + std::unique_ptr<TaskProcessor> task_processor_; + // True while the garbage collector is running. volatile CollectorType collector_type_running_ GUARDED_BY(gc_complete_lock_); @@ -1060,9 +1066,17 @@ class Heap { // Count for performed homogeneous space compaction. Atomic<size_t> count_performed_homogeneous_space_compaction_; + // Whether or not a concurrent GC is pending. + Atomic<bool> concurrent_gc_pending_; + + // Active tasks which we can modify (change target time, desired collector type, etc..). + CollectorTransitionTask* pending_collector_transition_ GUARDED_BY(pending_task_lock_); + HeapTrimTask* pending_heap_trim_ GUARDED_BY(pending_task_lock_); + // Whether or not we use homogeneous space compaction to avoid OOM errors. bool use_homogeneous_space_compaction_for_oom_; + friend class CollectorTransitionTask; friend class collector::GarbageCollector; friend class collector::MarkCompact; friend class collector::MarkSweep; diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc index 99bd63fa8a..01e8795669 100644 --- a/runtime/gc/reference_processor.cc +++ b/runtime/gc/reference_processor.cc @@ -23,11 +23,14 @@ #include "reflection.h" #include "ScopedLocalRef.h" #include "scoped_thread_state_change.h" +#include "task_processor.h" #include "well_known_classes.h" namespace art { namespace gc { +static constexpr bool kAsyncReferenceQueueAdd = false; + ReferenceProcessor::ReferenceProcessor() : process_references_args_(nullptr, nullptr, nullptr), preserving_references_(false), @@ -213,17 +216,43 @@ void ReferenceProcessor::UpdateRoots(IsMarkedCallback* callback, void* arg) { cleared_references_.UpdateRoots(callback, arg); } +class ClearedReferenceTask : public HeapTask { + public: + explicit ClearedReferenceTask(jobject cleared_references) + : HeapTask(NanoTime()), cleared_references_(cleared_references) { + } + virtual void Run(Thread* thread) { + ScopedObjectAccess soa(thread); + jvalue args[1]; + args[0].l = cleared_references_; + InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_ReferenceQueue_add, args); + soa.Env()->DeleteGlobalRef(cleared_references_); + } + + private: + const jobject cleared_references_; +}; + void ReferenceProcessor::EnqueueClearedReferences(Thread* self) { Locks::mutator_lock_->AssertNotHeld(self); + // When a runtime isn't started there are no reference queues to care about so ignore. 
if (!cleared_references_.IsEmpty()) { - // When a runtime isn't started there are no reference queues to care about so ignore. if (LIKELY(Runtime::Current()->IsStarted())) { - ScopedObjectAccess soa(self); - ScopedLocalRef<jobject> arg(self->GetJniEnv(), - soa.AddLocalReference<jobject>(cleared_references_.GetList())); - jvalue args[1]; - args[0].l = arg.get(); - InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_ReferenceQueue_add, args); + jobject cleared_references; + { + ReaderMutexLock mu(self, *Locks::mutator_lock_); + cleared_references = self->GetJniEnv()->vm->AddGlobalRef( + self, cleared_references_.GetList()); + } + if (kAsyncReferenceQueueAdd) { + // TODO: This can cause RunFinalization to terminate before newly freed objects are + // finalized since they may not be enqueued by the time RunFinalization starts. + Runtime::Current()->GetHeap()->GetTaskProcessor()->AddTask( + self, new ClearedReferenceTask(cleared_references)); + } else { + ClearedReferenceTask task(cleared_references); + task.Run(self); + } } cleared_references_.Clear(); } diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc index 74d1a2b7db..ced25a40bb 100644 --- a/runtime/gc/space/rosalloc_space.cc +++ b/runtime/gc/space/rosalloc_space.cc @@ -365,8 +365,9 @@ void RosAllocSpace::Clear() { mark_bitmap_->Clear(); SetEnd(begin_ + starting_size_); delete rosalloc_; - rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_, Capacity(), - low_memory_mode_, Runtime::Current()->RunningOnValgrind()); + rosalloc_ = CreateRosAlloc(mem_map_->Begin(), starting_size_, initial_size_, + NonGrowthLimitCapacity(), low_memory_mode_, + Runtime::Current()->RunningOnValgrind()); SetFootprintLimit(footprint_limit); } diff --git a/runtime/gc/task_processor.cc b/runtime/gc/task_processor.cc new file mode 100644 index 0000000000..1a3c6f5399 --- /dev/null +++ b/runtime/gc/task_processor.cc @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "task_processor.h" + +#include "scoped_thread_state_change.h" + +namespace art { +namespace gc { + +TaskProcessor::TaskProcessor() + : lock_(new Mutex("Task processor lock", kReferenceProcessorLock)), is_running_(false) { + // Piggyback off the reference processor lock level. + cond_.reset(new ConditionVariable("Task processor condition", *lock_)); +} + +TaskProcessor::~TaskProcessor() { + delete lock_; +} + +void TaskProcessor::AddTask(Thread* self, HeapTask* task) { + ScopedThreadStateChange tsc(self, kBlocked); + MutexLock mu(self, *lock_); + tasks_.insert(task); + cond_->Signal(self); +} + +HeapTask* TaskProcessor::GetTask(Thread* self) { + ScopedThreadStateChange tsc(self, kBlocked); + MutexLock mu(self, *lock_); + while (true) { + if (tasks_.empty()) { + if (!is_running_) { + return nullptr; + } + cond_->Wait(self); // Empty queue, wait until we are signalled. 
+ } else { + // Non empty queue, look at the top element and see if we are ready to run it. + const uint64_t current_time = NanoTime(); + HeapTask* task = *tasks_.begin(); + // If we are shutting down, return the task right away without waiting. Otherwise return the + // task if it is late enough. + uint64_t target_time = task->GetTargetRunTime(); + if (!is_running_ || target_time <= current_time) { + tasks_.erase(tasks_.begin()); + return task; + } + DCHECK_GT(target_time, current_time); + // Wait until we hit the target run time. + const uint64_t delta_time = target_time - current_time; + const uint64_t ms_delta = NsToMs(delta_time); + const uint64_t ns_delta = delta_time - MsToNs(ms_delta); + cond_->TimedWait(self, static_cast<int64_t>(ms_delta), static_cast<int32_t>(ns_delta)); + } + } + UNREACHABLE(); + return nullptr; +} + +void TaskProcessor::UpdateTargetRunTime(Thread* self, HeapTask* task, uint64_t new_target_time) { + MutexLock mu(self, *lock_); + // Find the task. + auto range = tasks_.equal_range(task); + for (auto it = range.first; it != range.second; ++it) { + if (*it == task) { + // Check if the target time was updated, if so re-insert then wait. + if (new_target_time != task->GetTargetRunTime()) { + tasks_.erase(it); + task->SetTargetRunTime(new_target_time); + tasks_.insert(task); + // If we became the first task then we may need to signal since we changed the task that we + // are sleeping on. + if (*tasks_.begin() == task) { + cond_->Signal(self); + } + return; + } + } + } +} + +bool TaskProcessor::IsRunning() const { + MutexLock mu(Thread::Current(), *lock_); + return is_running_; +} + +void TaskProcessor::Stop(Thread* self) { + MutexLock mu(self, *lock_); + is_running_ = false; + cond_->Broadcast(self); +} + +void TaskProcessor::Start(Thread* self) { + MutexLock mu(self, *lock_); + is_running_ = true; +} + +void TaskProcessor::RunAllTasks(Thread* self) { + while (true) { + // Wait and get a task, may be interrupted. + HeapTask* task = GetTask(self); + if (task != nullptr) { + task->Run(self); + task->Finalize(); + } else if (!IsRunning()) { + break; + } + } +} + +} // namespace gc +} // namespace art diff --git a/runtime/gc/task_processor.h b/runtime/gc/task_processor.h new file mode 100644 index 0000000000..765f03557e --- /dev/null +++ b/runtime/gc/task_processor.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef ART_RUNTIME_GC_TASK_PROCESSOR_H_ +#define ART_RUNTIME_GC_TASK_PROCESSOR_H_ + +#include <memory> +#include <set> + +#include "base/mutex.h" +#include "globals.h" +#include "thread_pool.h" + +namespace art { +namespace gc { + +class HeapTask : public SelfDeletingTask { + public: + explicit HeapTask(uint64_t target_run_time) : target_run_time_(target_run_time) { + } + uint64_t GetTargetRunTime() const { + return target_run_time_; + } + + private: + // Update the updated_target_run_time_, the task processor will re-insert the task when it is + // popped and update the target_run_time_. + void SetTargetRunTime(uint64_t new_target_run_time) { + target_run_time_ = new_target_run_time; + } + + // Time in ns at which we want the task to run. + uint64_t target_run_time_; + + friend class TaskProcessor; +}; + +// Used to process GC tasks (heap trim, heap transitions, concurrent GC). +class TaskProcessor { + public: + TaskProcessor(); + virtual ~TaskProcessor(); + void AddTask(Thread* self, HeapTask* task) LOCKS_EXCLUDED(lock_); + HeapTask* GetTask(Thread* self) LOCKS_EXCLUDED(lock_); + void Start(Thread* self) LOCKS_EXCLUDED(lock_); + // Stop tells the RunAllTasks to finish up the remaining tasks as soon as + // possible then return. + void Stop(Thread* self) LOCKS_EXCLUDED(lock_); + void RunAllTasks(Thread* self) LOCKS_EXCLUDED(lock_); + bool IsRunning() const LOCKS_EXCLUDED(lock_); + void UpdateTargetRunTime(Thread* self, HeapTask* target_time, uint64_t new_target_time) + LOCKS_EXCLUDED(lock_); + + private: + class CompareByTargetRunTime { + public: + bool operator()(const HeapTask* a, const HeapTask* b) const { + return a->GetTargetRunTime() < b->GetTargetRunTime(); + } + }; + + mutable Mutex* lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + bool is_running_ GUARDED_BY(lock_); + std::unique_ptr<ConditionVariable> cond_ GUARDED_BY(lock_); + std::multiset<HeapTask*, CompareByTargetRunTime> tasks_ GUARDED_BY(lock_); +}; + +} // namespace gc +} // namespace art + +#endif // ART_RUNTIME_GC_TASK_PROCESSOR_H_ diff --git a/runtime/gc/task_processor_test.cc b/runtime/gc/task_processor_test.cc new file mode 100644 index 0000000000..5dd6d8fb7b --- /dev/null +++ b/runtime/gc/task_processor_test.cc @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common_runtime_test.h" +#include "task_processor.h" +#include "thread_pool.h" +#include "thread-inl.h" +#include "utils.h" + +namespace art { +namespace gc { + +class TaskProcessorTest : public CommonRuntimeTest { + public: +}; + +class RecursiveTask : public HeapTask { + public: + RecursiveTask(TaskProcessor* task_processor, Atomic<size_t>* counter, size_t max_recursion) + : HeapTask(NanoTime() + MsToNs(10)), task_processor_(task_processor), counter_(counter), + max_recursion_(max_recursion) { + } + virtual void Run(Thread* self) OVERRIDE { + if (max_recursion_ > 0) { + task_processor_->AddTask(self, + new RecursiveTask(task_processor_, counter_, max_recursion_ - 1)); + counter_->FetchAndAddSequentiallyConsistent(1U); + } + } + + private: + TaskProcessor* const task_processor_; + Atomic<size_t>* const counter_; + const size_t max_recursion_; +}; + +class WorkUntilDoneTask : public SelfDeletingTask { + public: + WorkUntilDoneTask(TaskProcessor* task_processor, Atomic<bool>* done_running) + : task_processor_(task_processor), done_running_(done_running) { + } + virtual void Run(Thread* self) OVERRIDE { + task_processor_->RunAllTasks(self); + done_running_->StoreSequentiallyConsistent(true); + } + + private: + TaskProcessor* const task_processor_; + Atomic<bool>* done_running_; +}; + +TEST_F(TaskProcessorTest, Interrupt) { + ThreadPool thread_pool("task processor test", 1U); + Thread* const self = Thread::Current(); + TaskProcessor task_processor; + static constexpr size_t kRecursion = 10; + Atomic<bool> done_running(false); + Atomic<size_t> counter(0); + task_processor.AddTask(self, new RecursiveTask(&task_processor, &counter, kRecursion)); + task_processor.Start(self); + // Add a task which will wait until interrupted to the thread pool. + thread_pool.AddTask(self, new WorkUntilDoneTask(&task_processor, &done_running)); + thread_pool.StartWorkers(self); + ASSERT_FALSE(done_running); + // Wait until all the tasks are done, but since we didn't interrupt, done_running should be 0. + while (counter.LoadSequentiallyConsistent() != kRecursion) { + usleep(10); + } + ASSERT_FALSE(done_running); + task_processor.Stop(self); + thread_pool.Wait(self, true, false); + // After the interrupt and wait, the WorkUntilDoneTask task should have terminated and + // set done_running_ to true. + ASSERT_TRUE(done_running.LoadSequentiallyConsistent()); + + // Test that we finish remaining tasks before returning from RunAllTasks. + counter.StoreSequentiallyConsistent(0); + done_running.StoreSequentiallyConsistent(false); + // Self interrupt before any of the other tasks run, but since we added them we should keep on + // working until all the tasks are completed. + task_processor.Stop(self); + task_processor.AddTask(self, new RecursiveTask(&task_processor, &counter, kRecursion)); + thread_pool.AddTask(self, new WorkUntilDoneTask(&task_processor, &done_running)); + thread_pool.StartWorkers(self); + thread_pool.Wait(self, true, false); + ASSERT_TRUE(done_running.LoadSequentiallyConsistent()); + ASSERT_EQ(counter.LoadSequentiallyConsistent(), kRecursion); +} + +class TestOrderTask : public HeapTask { + public: + explicit TestOrderTask(uint64_t expected_time, size_t expected_counter, size_t* counter) + : HeapTask(expected_time), expected_counter_(expected_counter), counter_(counter) { + } + virtual void Run(Thread* thread) OVERRIDE { + UNUSED(thread); // Fix cpplint bug.
+ ASSERT_EQ(*counter_, expected_counter_); + ++*counter_; + } + + private: + const size_t expected_counter_; + size_t* const counter_; +}; + +TEST_F(TaskProcessorTest, Ordering) { + static const size_t kNumTasks = 25; + const uint64_t current_time = NanoTime(); + Thread* const self = Thread::Current(); + TaskProcessor task_processor; + task_processor.Stop(self); + size_t counter = 0; + std::vector<std::pair<uint64_t, size_t>> orderings; + for (size_t i = 0; i < kNumTasks; ++i) { + orderings.push_back(std::make_pair(current_time + MsToNs(10U * i), i)); + } + for (size_t i = 0; i < kNumTasks; ++i) { + std::swap(orderings[i], orderings[(i * 87654231 + 12345) % orderings.size()]); + } + for (const auto& pair : orderings) { + auto* task = new TestOrderTask(pair.first, pair.second, &counter); + task_processor.AddTask(self, task); + } + ThreadPool thread_pool("task processor test", 1U); + Atomic<bool> done_running(false); + // Add a task which will wait until interrupted to the thread pool. + thread_pool.AddTask(self, new WorkUntilDoneTask(&task_processor, &done_running)); + ASSERT_FALSE(done_running.LoadSequentiallyConsistent()); + thread_pool.StartWorkers(self); + thread_pool.Wait(self, true, false); + ASSERT_TRUE(done_running.LoadSequentiallyConsistent()); + ASSERT_EQ(counter, kNumTasks); +} + +} // namespace gc +} // namespace art diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc index a348432340..f503b354f7 100644 --- a/runtime/native/dalvik_system_VMRuntime.cc +++ b/runtime/native/dalvik_system_VMRuntime.cc @@ -34,6 +34,7 @@ #include "gc/heap.h" #include "gc/space/dlmalloc_space.h" #include "gc/space/image_space.h" +#include "gc/task_processor.h" #include "intern_table.h" #include "jni_internal.h" #include "mirror/art_method-inl.h" @@ -213,19 +214,32 @@ static void VMRuntime_updateProcessState(JNIEnv*, jobject, jint process_state) { runtime->UpdateProfilerState(process_state); } -static void VMRuntime_trimHeap(JNIEnv*, jobject) { - Runtime::Current()->GetHeap()->DoPendingTransitionOrTrim(); +static void VMRuntime_trimHeap(JNIEnv* env, jobject) { + Runtime::Current()->GetHeap()->Trim(ThreadForEnv(env)); } static void VMRuntime_concurrentGC(JNIEnv* env, jobject) { Runtime::Current()->GetHeap()->ConcurrentGC(ThreadForEnv(env)); } +static void VMRuntime_requestHeapTrim(JNIEnv* env, jobject) { + Runtime::Current()->GetHeap()->RequestTrim(ThreadForEnv(env)); +} + static void VMRuntime_requestConcurrentGC(JNIEnv* env, jobject) { - Runtime::Current()->GetHeap()->NotifyConcurrentGCRequest(ThreadForEnv(env)); + Runtime::Current()->GetHeap()->RequestConcurrentGC(ThreadForEnv(env)); } -static void VMRuntime_waitForConcurrentGCRequest(JNIEnv* env, jobject) { - Runtime::Current()->GetHeap()->WaitForConcurrentGCRequest(ThreadForEnv(env)); + +static void VMRuntime_startHeapTaskProcessor(JNIEnv* env, jobject) { + Runtime::Current()->GetHeap()->GetTaskProcessor()->Start(ThreadForEnv(env)); +} + +static void VMRuntime_stopHeapTaskProcessor(JNIEnv* env, jobject) { + Runtime::Current()->GetHeap()->GetTaskProcessor()->Stop(ThreadForEnv(env)); +} + +static void VMRuntime_runHeapTasks(JNIEnv* env, jobject) { + Runtime::Current()->GetHeap()->GetTaskProcessor()->RunAllTasks(ThreadForEnv(env)); } typedef std::map<std::string, mirror::String*> StringTable; @@ -566,8 +580,6 @@ static JNINativeMethod gMethods[] = { NATIVE_METHOD(VMRuntime, classPath, "()Ljava/lang/String;"), NATIVE_METHOD(VMRuntime, clearGrowthLimit, "()V"), NATIVE_METHOD(VMRuntime, concurrentGC, "()V"), 
- NATIVE_METHOD(VMRuntime, requestConcurrentGC, "()V"), - NATIVE_METHOD(VMRuntime, waitForConcurrentGCRequest, "()V"), NATIVE_METHOD(VMRuntime, disableJitCompilation, "()V"), NATIVE_METHOD(VMRuntime, getTargetHeapUtilization, "()F"), NATIVE_METHOD(VMRuntime, isDebuggerActive, "!()Z"), @@ -578,8 +590,13 @@ static JNINativeMethod gMethods[] = { NATIVE_METHOD(VMRuntime, setTargetSdkVersionNative, "(I)V"), NATIVE_METHOD(VMRuntime, registerNativeAllocation, "(I)V"), NATIVE_METHOD(VMRuntime, registerNativeFree, "(I)V"), + NATIVE_METHOD(VMRuntime, requestConcurrentGC, "()V"), + NATIVE_METHOD(VMRuntime, requestHeapTrim, "()V"), + NATIVE_METHOD(VMRuntime, runHeapTasks, "()V"), NATIVE_METHOD(VMRuntime, updateProcessState, "(I)V"), + NATIVE_METHOD(VMRuntime, startHeapTaskProcessor, "()V"), NATIVE_METHOD(VMRuntime, startJitCompilation, "()V"), + NATIVE_METHOD(VMRuntime, stopHeapTaskProcessor, "()V"), NATIVE_METHOD(VMRuntime, trimHeap, "()V"), NATIVE_METHOD(VMRuntime, vmVersion, "()Ljava/lang/String;"), NATIVE_METHOD(VMRuntime, vmLibrary, "()Ljava/lang/String;"), diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc index f1a04cb35d..5f68d6000a 100644 --- a/runtime/native/dalvik_system_ZygoteHooks.cc +++ b/runtime/native/dalvik_system_ZygoteHooks.cc @@ -86,9 +86,15 @@ static void EnableDebugFeatures(uint32_t debug_flags) { } debug_flags &= ~DEBUG_ENABLE_DEBUGGER; - // These two are for backwards compatibility with Dalvik. + if ((debug_flags & DEBUG_ENABLE_SAFEMODE) != 0) { + // Ensure that any (secondary) oat files will be interpreted. + Runtime* runtime = Runtime::Current(); + runtime->AddCompilerOption("--compiler-filter=interpret-only"); + debug_flags &= ~DEBUG_ENABLE_SAFEMODE; + } + + // This is for backwards compatibility with Dalvik. debug_flags &= ~DEBUG_ENABLE_ASSERT; - debug_flags &= ~DEBUG_ENABLE_SAFEMODE; if (debug_flags != 0) { LOG(ERROR) << StringPrintf("Unknown bits set in debug_flags: %#x", debug_flags); diff --git a/runtime/runtime.h b/runtime/runtime.h index 39fd910893..e31996338d 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -133,6 +133,10 @@ class Runtime { return compiler_options_; } + void AddCompilerOption(std::string option) { + compiler_options_.push_back(option); + } + const std::vector<std::string>& GetImageCompilerOptions() const { return image_compiler_options_; } diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h index 8c080673f9..79b57afedd 100644 --- a/runtime/thread_pool.h +++ b/runtime/thread_pool.h @@ -36,10 +36,18 @@ class Closure { class Task : public Closure { public: - // Called when references reaches 0. + // Called after Closure::Run has been called. virtual void Finalize() { } }; +class SelfDeletingTask : public Task { + public: + virtual ~SelfDeletingTask() { } + virtual void Finalize() { + delete this; + } +}; + class ThreadPoolWorker { public: static const size_t kDefaultStackSize = 1 * MB; diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt index 5f86f1e047..a55a13743c 100644 --- a/test/800-smali/expected.txt +++ b/test/800-smali/expected.txt @@ -9,4 +9,5 @@ invoke-super abstract BadCaseInOpRegRegReg CmpLong FloatIntConstPassing +b/18718277 Done! 
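To show how the heap task machinery introduced above is meant to be driven end to end, here is a minimal C++ sketch, not part of the patch itself: RunHeapTaskDaemon is a hypothetical helper standing in for the Java-side daemon that calls the new VMRuntime natives, while GetTaskProcessor(), Start(), RunAllTasks() and Stop() are the entry points this change adds.

#include "gc/heap.h"
#include "gc/task_processor.h"
#include "thread.h"

namespace art {

// Hypothetical helper: the loop a heap-task daemon thread would run. The Java daemon reaches
// the same code through the VMRuntime.startHeapTaskProcessor() and VMRuntime.runHeapTasks()
// natives registered above.
void RunHeapTaskDaemon(Thread* self, gc::Heap* heap) {
  gc::TaskProcessor* processor = heap->GetTaskProcessor();
  // Mark the processor as running so GetTask() blocks until a task's target run time arrives
  // instead of returning null on an empty queue.
  processor->Start(self);
  // Runs queued HeapTasks (concurrent GC, collector transitions, heap trims) in target-time
  // order; returns once Stop() has been called and the remaining tasks have been drained.
  processor->RunAllTasks(self);
}

}  // namespace art

At runtime shutdown, the matching VMRuntime.stopHeapTaskProcessor() native calls TaskProcessor::Stop(), which broadcasts on the condition variable so RunAllTasks() finishes the queued work and returns.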
diff --git a/test/800-smali/smali/b_18718277.smali b/test/800-smali/smali/b_18718277.smali new file mode 100644 index 0000000000..b14ad2081e --- /dev/null +++ b/test/800-smali/smali/b_18718277.smali @@ -0,0 +1,29 @@ +.class public LB18718277; + +.super Ljava/lang/Object; + +.method public static helper(I)I + .locals 1 + add-int/lit8 v0, p0, 2 + neg-int v0, v0 + return v0 +.end method + +.method public static getInt()I + .registers 2 + const/4 v1, 3 + invoke-static {v1}, LB18718277;->helper(I)I + move-result v0 + :outer_loop + if-eqz v1, :exit_outer_loop + const/4 v0, 0 + if-eqz v0, :skip_dead_loop + :dead_loop + add-int/2addr v0, v0 + if-gez v0, :dead_loop + :skip_dead_loop + add-int/lit8 v1, v1, -1 + goto :outer_loop + :exit_outer_loop + return v0 +.end method diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java index a2db05135d..70641b2069 100644 --- a/test/800-smali/src/Main.java +++ b/test/800-smali/src/Main.java @@ -65,6 +65,7 @@ public class Main { testCases.add(new TestCase("BadCaseInOpRegRegReg", "BadCaseInOpRegRegReg", "getInt", null, null, 2)); testCases.add(new TestCase("CmpLong", "CmpLong", "run", null, null, 0)); testCases.add(new TestCase("FloatIntConstPassing", "FloatIntConstPassing", "run", null, null, 2)); + testCases.add(new TestCase("b/18718277", "B18718277", "getInt", null, null, 0)); } public void runTests() {