diff options
| author | 2015-11-19 14:08:40 -0500 | |
|---|---|---|
| committer | 2015-12-15 15:48:39 -0500 | |
| commit | 7b3e4f99b25c31048a33a08688557b133ad345ab (patch) | |
| tree | 446ce2d9b4684120c35fad9c097ea2f760f0797c /compiler/optimizing | |
| parent | 089ff4886aa9b5e7cec04d2ef5cdeb9d68e5dc43 (diff) | |
X86: Use locked add rather than mfence
Java semantics for memory ordering can be satisfied using
lock addl $0,0(SP)
rather than mfence. The locked add synchronizes the memory caches, but
doesn't affect device memory.
Timing of a micro-benchmark with either an mfence or a lock add $0,0(sp) in a loop
with 600000000 iterations:
time ./mfence
real 0m5.411s
user 0m5.408s
sys 0m0.000s
time ./locked_add
real 0m3.552s
user 0m3.550s
sys 0m0.000s
Implement this as an instruction-set feature, lock_add. It is off by
default (mfence is used) and is enabled for the atom & silvermont variants.
Generation of an mfence can still be forced by a parameter to MemoryFence.
Change-Id: I5cb4fded61f4cbbd7b7db42a1b6902e43e458911
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
Diffstat (limited to 'compiler/optimizing')
| -rw-r--r-- | compiler/optimizing/code_generator_x86.cc | 2 | ||||
| -rw-r--r-- | compiler/optimizing/code_generator_x86.h | 14 | ||||
| -rw-r--r-- | compiler/optimizing/code_generator_x86_64.cc | 2 | ||||
| -rw-r--r-- | compiler/optimizing/code_generator_x86_64.h | 13 | ||||
| -rw-r--r-- | compiler/optimizing/intrinsics_x86.cc | 2 | ||||
| -rw-r--r-- | compiler/optimizing/intrinsics_x86_64.cc | 2 |
6 files changed, 31 insertions, 4 deletions
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 469dd49a8e..7a5b8dbe46 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -4156,7 +4156,7 @@ void CodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) { */ switch (kind) { case MemBarrierKind::kAnyAny: { - __ mfence(); + MemoryFence(); break; } case MemBarrierKind::kAnyStore: diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 712179920b..f0ead0356d 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -17,6 +17,7 @@ #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_ +#include "arch/x86/instruction_set_features_x86.h" #include "code_generator.h" #include "dex/compiler_enums.h" #include "driver/compiler_options.h" @@ -506,6 +507,19 @@ class CodeGeneratorX86 : public CodeGenerator { // artReadBarrierForRootSlow. void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); + // Ensure that prior stores complete to memory before subsequent loads. + // The locked add implementation will avoid serializing device memory, but will + // touch (but not change) the top of the stack. + // The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores. + void MemoryFence(bool non_temporal = false) { + if (!non_temporal && isa_features_.PrefersLockedAddSynchronization()) { + assembler_.lock()->addl(Address(ESP, 0), Immediate(0)); + } else { + assembler_.mfence(); + } + } + + private: // Factored implementation of GenerateFieldLoadWithBakerReadBarrier // and GenerateArrayLoadWithBakerReadBarrier. 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 1fee192e36..7cea0a7160 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -3968,7 +3968,7 @@ void InstructionCodeGeneratorX86_64::GenerateMemoryBarrier(MemBarrierKind kind) */ switch (kind) { case MemBarrierKind::kAnyAny: { - __ mfence(); + codegen_->MemoryFence(); break; } case MemBarrierKind::kAnyStore: diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 7351fed0fa..9397d8f8af 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -17,6 +17,7 @@ #ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_ #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_ +#include "arch/x86_64/instruction_set_features_x86_64.h" #include "code_generator.h" #include "dex/compiler_enums.h" #include "driver/compiler_options.h" @@ -423,6 +424,18 @@ class CodeGeneratorX86_64 : public CodeGenerator { int64_t v, HInstruction* instruction); + // Ensure that prior stores complete to memory before subsequent loads. + // The locked add implementation will avoid serializing device memory, but will + // touch (but not change) the top of the stack. The locked add should not be used for + // ordering non-temporal stores. 
+ void MemoryFence(bool force_mfence = false) { + if (!force_mfence && isa_features_.PrefersLockedAddSynchronization()) { + assembler_.lock()->addl(Address(CpuRegister(RSP), 0), Immediate(0)); + } else { + assembler_.mfence(); + } + } + private: struct PcRelativeDexCacheAccessInfo { PcRelativeDexCacheAccessInfo(const DexFile& dex_file, uint32_t element_off) diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index fd454d8322..74ade7c420 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -2005,7 +2005,7 @@ static void GenUnsafePut(LocationSummary* locations, } if (is_volatile) { - __ mfence(); + codegen->MemoryFence(); } if (type == Primitive::kPrimNot) { diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index ac9b24503c..d519034263 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -2059,7 +2059,7 @@ static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool } if (is_volatile) { - __ mfence(); + codegen->MemoryFence(); } if (type == Primitive::kPrimNot) { |