Use atomic load/store for volatile IGET/IPUT/SGET/SPUT.

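Add per-backend LoadBaseDispVolatile()/StoreBaseDispVolatile() together
with SupportsVolatileLoadStore() and RegClassForFieldLoadStore() hooks,
and use them in the IGET/IPUT/SGET/SPUT fast paths and the inlined
getters/setters so that volatile field accesses are emitted as atomic
loads and stores rather than plain LoadBaseDisp()/StoreBaseDisp()
bracketed only by barriers.

On arm, 64-bit volatile fields go into a core register pair; if the CPU
lacks LPAE, the load uses LDREXD and the store an LDREXD/STREXD retry
loop, otherwise an aligned LDRD/STRD is already atomic. Roughly, the
pre-LPAE volatile 64-bit store expands to (register names illustrative):

    add     r_ptr, r_base, #displacement
  .retry:
    ldrexd  r_tmp_lo, r_tmp_hi, [r_ptr]  @ value ignored, claims the monitor
    strexd  r_status, r_src_lo, r_src_hi, [r_ptr]
    cmp     r_status, #0
    bne     .retry

On x86, 64-bit volatile accesses are forced into an fp (SSE) register;
on arm64 aligned core-register accesses suffice; mips has no 64-bit
atomic access, so such fields fall back to the runtime helpers. dex2oat
gains "lpae"/"nolpae" instruction set features to control the arm case.
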
Bug: 14112919
Change-Id: I79316f438dd3adea9b2653ffc968af83671ad282
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index aab6b46..56f4830 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -86,6 +90,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index f59720b..d0c81d5 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -522,6 +522,21 @@
   return ((lir->opcode == kThumbBUncond) || (lir->opcode == kThumb2BUncond));
 }
 
+bool ArmMir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass ArmMir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On arm, atomic 64-bit load/store requires a core register pair.
+    // Smaller aligned load/store is atomic for both core and fp registers.
+    if (size == k64 || size == kDouble) {
+      return kCoreReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 ArmMir2Lir::ArmMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   // Sanity check - make sure encoding map lines up.
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 1745c18..1afd890 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -952,7 +952,26 @@
   return load;
 }
 
-LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size) {
+LIR* ArmMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) {
+  // Only 64-bit load needs special handling.
+  if (UNLIKELY(size == k64 || size == kDouble)) {
+    DCHECK(!r_dest.IsFloat());  // See RegClassForFieldLoadStore().
+    // If the cpu supports LPAE, aligned LDRD is atomic - fall through to LoadBaseDisp().
+    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
+      // Use LDREXD for the atomic load. (Expect displacement > 0, don't optimize for == 0.)
+      RegStorage r_ptr = AllocTemp();
+      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      LIR* lir = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg());
+      FreeTemp(r_ptr);
+      return lir;
+    }
+  }
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
+LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) {
   // TODO: base this on target.
   if (size == kWord) {
     size = k32;
@@ -1072,6 +1091,42 @@
   return store;
 }
 
+LIR* ArmMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                       OpSize size) {
+  // Only 64-bit store needs special handling.
+  if (UNLIKELY(size == k64 || size == kDouble)) {
+    DCHECK(!r_src.IsFloat());  // See RegClassForFieldLoadStore().
+    // If the cpu supports LPAE, aligned STRD is atomic - fall through to StoreBaseDisp().
+    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
+      // Use STREXD for the atomic store. (Expect displacement > 0, don't optimize for == 0.)
+      RegStorage r_ptr = AllocTemp();
+      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      LIR* fail_target = NewLIR0(kPseudoTargetLabel);
+      // We have only 5 temporary registers available and if r_base, r_src and r_ptr already
+      // take 4, we can't directly allocate 2 more for LDREXD temps. In that case clobber r_ptr
+      // in LDREXD and recalculate it from r_base.
+      RegStorage r_temp = AllocTemp();
+      RegStorage r_temp_high = AllocFreeTemp();  // We may not have another temp.
+      if (r_temp_high.Valid()) {
+        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg());
+        FreeTemp(r_temp_high);
+        FreeTemp(r_temp);
+      } else {
+        // If we don't have another temp, clobber r_ptr in LDREXD and reload it.
+        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg());
+        FreeTemp(r_temp);  // The kOpAdd below may need this temp for a large displacement.
+        OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+      }
+      LIR* lir = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(),
+                         r_ptr.GetReg());
+      OpCmpImmBranch(kCondNe, r_temp, 0, fail_target);
+      FreeTemp(r_ptr);
+      return lir;
+    }
+  }
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* ArmMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                OpSize size) {
   // TODO: base this on target.
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 903be10..26084a2 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(A64ThreadOffset offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -86,6 +90,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 7e07e15..e4764eb 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -535,6 +535,20 @@
   return (lir->opcode == kA64B1t);
 }
 
+bool Arm64Mir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass Arm64Mir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On arm64, fp register load/store is atomic only for single bytes.
+    if (size != kSignedByte && size != kUnsignedByte) {
+      return kCoreReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 Arm64Mir2Lir::Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   // Sanity check - make sure encoding map lines up.
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index e46e201..ae17711 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -930,6 +930,13 @@
   return load;
 }
 
+LIR* Arm64Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                        OpSize size) {
+  // LoadBaseDisp() will emit correct insn for atomic load on arm64
+  // assuming r_dest is correctly prepared using RegClassForFieldLoadStore().
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* Arm64Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                                 OpSize size) {
   return LoadBaseDispBody(r_base, displacement, r_dest, size);
@@ -1032,8 +1039,15 @@
   return store;
 }
 
+LIR* Arm64Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                         OpSize size) {
+  // StoreBaseDisp() will emit correct insn for atomic store on arm64
+  // assuming r_src is correctly prepared using RegClassForFieldLoadStore().
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* Arm64Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
-                               OpSize size) {
+                                 OpSize size) {
   return StoreBaseDispBody(r_base, displacement, r_src, size);
 }
 
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 83d5045..732e776 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -500,7 +500,9 @@
                       bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedStaticField(field_info.FastPut(), field_info.IsReferrersClass());
-  if (field_info.FastPut() && !SLOW_FIELD_PATH) {
+  OpSize store_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastPut() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(store_size))) {
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
@@ -550,25 +552,20 @@
       FreeTemp(r_method);
     }
     // rBase now holds static storage base
+    RegisterClass reg_class = RegClassForFieldLoadStore(store_size, field_info.IsVolatile());
     if (is_long_or_double) {
-      RegisterClass register_kind = kAnyReg;
-      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-        // Force long/double volatile stores into SSE registers to avoid tearing.
-        register_kind = kFPReg;
-      }
-      rl_src = LoadValueWide(rl_src, register_kind);
+      rl_src = LoadValueWide(rl_src, reg_class);
     } else {
-      rl_src = LoadValue(rl_src, kAnyReg);
+      rl_src = LoadValue(rl_src, reg_class);
     }
     if (field_info.IsVolatile()) {
       // There might have been a store before this volatile one so insert StoreStore barrier.
       GenMemBarrier(kStoreStore);
-    }
-    OpSize size = LoadStoreOpSize(is_long_or_double, rl_src.ref);
-    StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, size);
-    if (field_info.IsVolatile()) {
+      StoreBaseDispVolatile(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size);
       // A load might follow the volatile store so insert a StoreLoad barrier.
       GenMemBarrier(kStoreLoad);
+    } else {
+      StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size);
     }
     if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
       MarkGCCard(rl_src.reg, r_base);
@@ -588,7 +585,9 @@
                       bool is_long_or_double, bool is_object) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedStaticField(field_info.FastGet(), field_info.IsReferrersClass());
-  if (field_info.FastGet() && !SLOW_FIELD_PATH) {
+  OpSize load_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastGet() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(load_size))) {
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     RegStorage r_base;
     if (field_info.IsReferrersClass()) {
@@ -634,23 +633,20 @@
       FreeTemp(r_method);
     }
     // r_base now holds static storage base
-    RegisterClass result_reg_kind = kAnyReg;
-    if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-      // Force long/double volatile loads into SSE registers to avoid tearing.
-      result_reg_kind = kFPReg;
-    }
-    RegLocation rl_result = EvalLoc(rl_dest, result_reg_kind, true);
+    RegisterClass reg_class = RegClassForFieldLoadStore(load_size, field_info.IsVolatile());
+    RegLocation rl_result = EvalLoc(rl_dest, reg_class, true);
 
-    OpSize size = LoadStoreOpSize(is_long_or_double, rl_result.ref);
-    LoadBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_result.reg, size);
-    FreeTemp(r_base);
-
+    int field_offset = field_info.FieldOffset().Int32Value();
     if (field_info.IsVolatile()) {
+      LoadBaseDispVolatile(r_base, field_offset, rl_result.reg, load_size);
       // Without context sensitive analysis, we must issue the most conservative barriers.
       // In this case, either a load or store may follow so we issue both barriers.
       GenMemBarrier(kLoadLoad);
       GenMemBarrier(kLoadStore);
+    } else {
+      LoadBaseDisp(r_base, field_offset, rl_result.reg, load_size);
     }
+    FreeTemp(r_base);
 
     if (is_long_or_double) {
       StoreValueWide(rl_dest, rl_result);
@@ -689,55 +685,29 @@
                       bool is_object) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastGet());
-  if (field_info.FastGet() && !SLOW_FIELD_PATH) {
-    RegLocation rl_result;
-    RegisterClass reg_class = RegClassBySize(size);
+  OpSize load_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastGet() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(load_size))) {
+    RegisterClass reg_class = RegClassForFieldLoadStore(load_size, field_info.IsVolatile());
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     rl_obj = LoadValue(rl_obj, kCoreReg);
+    GenNullCheck(rl_obj.reg, opt_flags);
+    RegLocation rl_result = EvalLoc(rl_dest, reg_class, true);
+    int field_offset = field_info.FieldOffset().Int32Value();
+    if (field_info.IsVolatile()) {
+      LoadBaseDispVolatile(rl_obj.reg, field_offset, rl_result.reg, load_size);
+      MarkPossibleNullPointerException(opt_flags);
+      // Without context sensitive analysis, we must issue the most conservative barriers.
+      // In this case, either a load or store may follow so we issue both barriers.
+      GenMemBarrier(kLoadLoad);
+      GenMemBarrier(kLoadStore);
+    } else {
+      LoadBaseDisp(rl_obj.reg, field_offset, rl_result.reg, load_size);
+      MarkPossibleNullPointerException(opt_flags);
+    }
     if (is_long_or_double) {
-      DCHECK(rl_dest.wide);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      if (cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64) {
-        RegisterClass result_reg_kind = kAnyReg;
-        if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-          // Force long/double volatile loads into SSE registers to avoid tearing.
-          result_reg_kind = kFPReg;
-        }
-        rl_result = EvalLoc(rl_dest, result_reg_kind, true);
-        LoadBaseDisp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_result.reg, size);
-        MarkPossibleNullPointerException(opt_flags);
-        if (field_info.IsVolatile()) {
-          // Without context sensitive analysis, we must issue the most conservative barriers.
-          // In this case, either a load or store may follow so we issue both barriers.
-          GenMemBarrier(kLoadLoad);
-          GenMemBarrier(kLoadStore);
-        }
-      } else {
-        RegStorage reg_ptr = AllocTemp();
-        OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, field_info.FieldOffset().Int32Value());
-        rl_result = EvalLoc(rl_dest, reg_class, true);
-        LoadBaseDisp(reg_ptr, 0, rl_result.reg, size);
-        MarkPossibleNullPointerException(opt_flags);
-        if (field_info.IsVolatile()) {
-          // Without context sensitive analysis, we must issue the most conservative barriers.
-          // In this case, either a load or store may follow so we issue both barriers.
-          GenMemBarrier(kLoadLoad);
-          GenMemBarrier(kLoadStore);
-        }
-        FreeTemp(reg_ptr);
-      }
       StoreValueWide(rl_dest, rl_result);
     } else {
-      rl_result = EvalLoc(rl_dest, reg_class, true);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      LoadBaseDisp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_result.reg, k32);
-      MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // Without context sensitive analysis, we must issue the most conservative barriers.
-        // In this case, either a load or store may follow so we issue both barriers.
-        GenMemBarrier(kLoadLoad);
-        GenMemBarrier(kLoadStore);
-      }
       StoreValue(rl_dest, rl_result);
     }
   } else {
@@ -761,47 +731,32 @@
                       bool is_object) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastPut());
-  if (field_info.FastPut() && !SLOW_FIELD_PATH) {
-    RegisterClass reg_class = RegClassBySize(size);
+  OpSize store_size = LoadStoreOpSize(is_long_or_double, is_object);
+  if (!SLOW_FIELD_PATH && field_info.FastPut() &&
+      (!field_info.IsVolatile() || SupportsVolatileLoadStore(store_size))) {
+    RegisterClass reg_class = RegClassForFieldLoadStore(store_size, field_info.IsVolatile());
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
     rl_obj = LoadValue(rl_obj, kCoreReg);
     if (is_long_or_double) {
-      RegisterClass src_reg_kind = kAnyReg;
-      if (field_info.IsVolatile() && cu_->instruction_set == kX86) {
-        // Force long/double volatile stores into SSE registers to avoid tearing.
-        src_reg_kind = kFPReg;
-      }
-      rl_src = LoadValueWide(rl_src, src_reg_kind);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      RegStorage reg_ptr = AllocTemp();
-      OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg, field_info.FieldOffset().Int32Value());
-      if (field_info.IsVolatile()) {
-        // There might have been a store before this volatile one so insert StoreStore barrier.
-        GenMemBarrier(kStoreStore);
-      }
-      StoreBaseDisp(reg_ptr, 0, rl_src.reg, size);
-      MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // A load might follow the volatile store so insert a StoreLoad barrier.
-        GenMemBarrier(kStoreLoad);
-      }
-      FreeTemp(reg_ptr);
+      rl_src = LoadValueWide(rl_src, reg_class);
     } else {
       rl_src = LoadValue(rl_src, reg_class);
-      GenNullCheck(rl_obj.reg, opt_flags);
-      if (field_info.IsVolatile()) {
-        // There might have been a store before this volatile one so insert StoreStore barrier.
-        GenMemBarrier(kStoreStore);
-      }
-      Store32Disp(rl_obj.reg, field_info.FieldOffset().Int32Value(), rl_src.reg);
+    }
+    GenNullCheck(rl_obj.reg, opt_flags);
+    int field_offset = field_info.FieldOffset().Int32Value();
+    if (field_info.IsVolatile()) {
+      // There might have been a store before this volatile one so insert StoreStore barrier.
+      GenMemBarrier(kStoreStore);
+      StoreBaseDispVolatile(rl_obj.reg, field_offset, rl_src.reg, store_size);
       MarkPossibleNullPointerException(opt_flags);
-      if (field_info.IsVolatile()) {
-        // A load might follow the volatile store so insert a StoreLoad barrier.
-        GenMemBarrier(kStoreLoad);
-      }
-      if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
-        MarkGCCard(rl_src.reg, rl_obj.reg);
-      }
+      // A load might follow the volatile store so insert a StoreLoad barrier.
+      GenMemBarrier(kStoreLoad);
+    } else {
+      StoreBaseDisp(rl_obj.reg, field_offset, rl_src.reg, store_size);
+      MarkPossibleNullPointerException(opt_flags);
+    }
+    if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
+      MarkGCCard(rl_src.reg, rl_obj.reg);
     }
   } else {
     ThreadOffset<4> setter_offset =
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 20fd4b1..90d5a28 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -84,6 +88,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index 8d91aba..570c220 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -555,6 +555,18 @@
   return (lir->opcode == kMipsB);
 }
 
+bool MipsMir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  // No support for 64-bit atomic load/store on mips.
+  return size != k64 && size != kDouble;
+}
+
+RegisterClass MipsMir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  // No support for 64-bit atomic load/store on mips.
+  DCHECK(size != k64 && size != kDouble);
+  // TODO: Verify that both core and fp registers are suitable for smaller sizes.
+  return RegClassBySize(size);
+}
+
 MipsMir2Lir::MipsMir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena) {
   for (int i = 0; i < kMipsLast; i++) {
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index 8397411..58fbace 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -545,6 +545,12 @@
   return load;
 }
 
+LIR* MipsMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                       OpSize size) {
+  DCHECK(size != k64 && size != kDouble);
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* MipsMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                                OpSize size) {
   // TODO: base this on target.
@@ -640,6 +646,12 @@
   return res;
 }
 
+LIR* MipsMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                        OpSize size) {
+  DCHECK(size != k64 && size != kDouble);
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* MipsMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                 OpSize size) {
   // TODO: base this on target.
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index d10296f..0ffd189 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -128,6 +128,9 @@
   bool wide = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_WIDE));
   bool ref = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_OBJECT));
   OpSize size = LoadStoreOpSize(wide, ref);
+  if (data.is_volatile && !SupportsVolatileLoadStore(size)) {
+    return false;
+  }
 
   // The inliner doesn't distinguish kDouble or kFloat, use shorty.
   bool double_or_float = cu_->shorty[0] == 'F' || cu_->shorty[0] == 'D';
@@ -137,12 +140,14 @@
   LockArg(data.object_arg);
   RegLocation rl_dest = wide ? GetReturnWide(double_or_float) : GetReturn(double_or_float);
   RegStorage reg_obj = LoadArg(data.object_arg);
-  LoadBaseDisp(reg_obj, data.field_offset, rl_dest.reg, size);
   if (data.is_volatile) {
+    LoadBaseDispVolatile(reg_obj, data.field_offset, rl_dest.reg, size);
     // Without context sensitive analysis, we must issue the most conservative barriers.
     // In this case, either a load or store may follow so we issue both barriers.
     GenMemBarrier(kLoadLoad);
     GenMemBarrier(kLoadStore);
+  } else {
+    LoadBaseDisp(reg_obj, data.field_offset, rl_dest.reg, size);
   }
   return true;
 }
@@ -162,6 +167,9 @@
   bool wide = (data.op_variant == InlineMethodAnalyser::IPutVariant(Instruction::IPUT_WIDE));
   bool ref = (data.op_variant == InlineMethodAnalyser::IGetVariant(Instruction::IGET_OBJECT));
   OpSize size = LoadStoreOpSize(wide, ref);
+  if (data.is_volatile && !SupportsVolatileLoadStore(size)) {
+    return false;
+  }
 
   // Point of no return - no aborts after this
   GenPrintLabel(mir);
@@ -172,11 +180,11 @@
   if (data.is_volatile) {
     // There might have been a store before this volatile one so insert StoreStore barrier.
     GenMemBarrier(kStoreStore);
-  }
-  StoreBaseDisp(reg_obj, data.field_offset, reg_src, size);
-  if (data.is_volatile) {
+    StoreBaseDispVolatile(reg_obj, data.field_offset, reg_src, size);
     // A load might follow the volatile store so insert a StoreLoad barrier.
     GenMemBarrier(kStoreLoad);
+  } else {
+    StoreBaseDisp(reg_obj, data.field_offset, reg_src, size);
   }
   if (ref) {
     MarkGCCard(reg_src, reg_obj);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 4b1de4b..c5125a2 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -978,6 +978,8 @@
     virtual bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) = 0;
     virtual LIR* CheckSuspendUsingLoad() = 0;
     virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0;
+    virtual LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) = 0;
     virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) = 0;
     virtual LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest,
@@ -986,6 +988,8 @@
                                      int displacement, RegStorage r_dest, OpSize size) = 0;
     virtual LIR* LoadConstantNoClobber(RegStorage r_dest, int value) = 0;
     virtual LIR* LoadConstantWide(RegStorage r_dest, int64_t value) = 0;
+    virtual LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                                       OpSize size) = 0;
     virtual LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                                OpSize size) = 0;
     virtual LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
@@ -1027,6 +1031,11 @@
     virtual int GetInsnSize(LIR* lir) = 0;
     virtual bool IsUnconditionalBranch(LIR* lir) = 0;
 
+    // Check support for volatile load/store of a given size.
+    virtual bool SupportsVolatileLoadStore(OpSize size) = 0;
+    // Get the register class for load/store of a field.
+    virtual RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) = 0;
+
     // Required for target - Dalvik-level generators.
     virtual void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                                    RegLocation rl_src1, RegLocation rl_src2) = 0;
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index a03e5f2..f7fcf19 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -32,6 +32,8 @@
     bool EasyMultiply(RegLocation rl_src, RegLocation rl_dest, int lit) OVERRIDE;
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset);
+    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                       OpSize size) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
@@ -40,6 +42,8 @@
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
+                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                        OpSize size) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
@@ -84,6 +88,11 @@
     int GetInsnSize(LIR* lir);
     bool IsUnconditionalBranch(LIR* lir);
 
+    // Check support for volatile load/store of a given size.
+    bool SupportsVolatileLoadStore(OpSize size) OVERRIDE;
+    // Get the register class for load/store of a field.
+    RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
+
     // Required for target - Dalvik-level generators.
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                            RegLocation rl_src2);
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 05bef52..0c61439 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -545,6 +545,21 @@
   return (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jmp32);
 }
 
+bool X86Mir2Lir::SupportsVolatileLoadStore(OpSize size) {
+  return true;
+}
+
+RegisterClass X86Mir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volatile) {
+  if (UNLIKELY(is_volatile)) {
+    // On x86, atomic 64-bit load/store requires an fp register.
+    // Smaller aligned load/store is atomic for both core and fp registers.
+    if (size == k64 || size == kDouble) {
+      return kFPReg;
+    }
+  }
+  return RegClassBySize(size);
+}
+
 X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
     : Mir2Lir(cu, mir_graph, arena),
       base_of_code_(nullptr), store_method_addr_(false), store_method_addr_used_(false),
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 8423ec4..5326c2b 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -666,6 +666,13 @@
   return LoadBaseIndexedDisp(r_base, r_index, scale, 0, r_dest, size);
 }
 
+LIR* X86Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
+                                      OpSize size) {
+  // LoadBaseDisp() will emit correct insn for atomic load on x86
+  // assuming r_dest is correctly prepared using RegClassForFieldLoadStore().
+  return LoadBaseDisp(r_base, displacement, r_dest, size);
+}
+
 LIR* X86Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                               OpSize size) {
   // TODO: base this on target.
@@ -755,6 +762,13 @@
   return StoreBaseIndexedDisp(r_base, r_index, scale, 0, r_src, size);
 }
 
+LIR* X86Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement,
+                                       RegStorage r_src, OpSize size) {
+  // StoreBaseDisp() will emit correct insn for atomic store on x86
+  // assuming r_src is correctly prepared using RegClassForFieldLoadStore().
+  return StoreBaseDisp(r_base, displacement, r_src, size);
+}
+
 LIR* X86Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement,
                                RegStorage r_src, OpSize size) {
   // TODO: base this on target.
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 7c0befc..3529c27 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -687,6 +687,12 @@
     } else if (feature == "nodiv") {
       // Turn off support for divide instruction.
       result.SetHasDivideInstruction(false);
+    } else if (feature == "lpae") {
+      // Supports Large Physical Address Extension.
+      result.SetHasLpae(true);
+    } else if (feature == "nolpae") {
+      // Turn off support for Large Physical Address Extension.
+      result.SetHasLpae(false);
     } else {
       Usage("Unknown instruction set feature: '%s'", feature.c_str());
     }
diff --git a/runtime/instruction_set.h b/runtime/instruction_set.h
index bfbbbd6..1cea24b 100644
--- a/runtime/instruction_set.h
+++ b/runtime/instruction_set.h
@@ -59,7 +59,8 @@
 #endif
 
 enum InstructionFeatures {
-  kHwDiv = 1                  // Supports hardware divide.
+  kHwDiv  = 0x1,              // Supports hardware divide.
+  kHwLpae = 0x2,              // Supports Large Physical Address Extension.
 };
 
 // This is a bitmask of supported features per architecture.
@@ -78,6 +79,14 @@
     mask_ = (mask_ & ~kHwDiv) | (v ? kHwDiv : 0);
   }
 
+  bool HasLpae() const {
+    return (mask_ & kHwLpae) != 0;
+  }
+
+  void SetHasLpae(bool v) {
+    mask_ = (mask_ & ~kHwLpae) | (v ? kHwLpae : 0);
+  }
+
   std::string GetFeatureString() const;
 
   // Other features in here.