author Mark Mendell <mark.p.mendell@intel.com> 2015-11-19 14:08:40 -0500
committer Mark Mendell <mark.p.mendell@intel.com> 2015-12-15 15:48:39 -0500
commit 7b3e4f99b25c31048a33a08688557b133ad345ab (patch)
tree 446ce2d9b4684120c35fad9c097ea2f760f0797c /compiler/optimizing
parent 089ff4886aa9b5e7cec04d2ef5cdeb9d68e5dc43 (diff)
X86: Use locked add rather than mfence
Java semantics for memory ordering can be satisfied using
lock addl $0, 0(SP) rather than mfence. The locked add synchronizes
the memory caches, but doesn't affect device memory.

Timing on a microbenchmark with an mfence or lock addl $0, 0(SP) in a
loop with 600000000 iterations:

time ./mfence
real    0m5.411s
user    0m5.408s
sys     0m0.000s

time ./locked_add
real    0m3.552s
user    0m3.550s
sys     0m0.000s

Implement this as an instruction-set feature, lock_add. It is off by
default (mfence is used), and enabled for the atom and silvermont
variants.

Generation of mfence can be forced by a parameter to MemoryFence.

Change-Id: I5cb4fded61f4cbbd7b7db42a1b6902e43e458911
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
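For reference, a minimal sketch of such a microbenchmark (the benchmark
source is not part of this commit; this hypothetical reconstruction assumes
GCC/Clang inline asm on x86-64, where the stack pointer is RSP):

// micro.cc -- hypothetical reconstruction of the benchmark described above.
// Build one binary per fence: g++ -O2 -DUSE_MFENCE micro.cc -o mfence
//                             g++ -O2 micro.cc -o locked_add
#include <cstdint>

int main() {
  for (int64_t i = 0; i < 600000000LL; ++i) {
#ifdef USE_MFENCE
    __asm__ __volatile__("mfence" ::: "memory");
#else
    // Locked add of 0 to the top of the stack: the value is unchanged, but
    // the LOCK prefix gives a full StoreLoad barrier for cacheable memory.
    __asm__ __volatile__("lock addl $0, (%%rsp)" ::: "memory", "cc");
#endif
  }
  return 0;
}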
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_x86.cc      2
-rw-r--r--  compiler/optimizing/code_generator_x86.h      14
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc   2
-rw-r--r--  compiler/optimizing/code_generator_x86_64.h   13
-rw-r--r--  compiler/optimizing/intrinsics_x86.cc          2
-rw-r--r--  compiler/optimizing/intrinsics_x86_64.cc       2
6 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 469dd49a8e..7a5b8dbe46 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -4156,7 +4156,7 @@ void CodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) {
*/
switch (kind) {
case MemBarrierKind::kAnyAny: {
- __ mfence();
+ MemoryFence();
break;
}
case MemBarrierKind::kAnyStore:
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 712179920b..f0ead0356d 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -17,6 +17,7 @@
#ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_
#define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_H_
+#include "arch/x86/instruction_set_features_x86.h"
#include "code_generator.h"
#include "dex/compiler_enums.h"
#include "driver/compiler_options.h"
@@ -506,6 +507,19 @@ class CodeGeneratorX86 : public CodeGenerator {
// artReadBarrierForRootSlow.
void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root);
+ // Ensure that prior stores complete to memory before subsequent loads.
+ // The locked add implementation will avoid serializing device memory, but will
+ // touch (but not change) the top of the stack.
+ // The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores.
+ void MemoryFence(bool non_temporal = false) {
+ if (!non_temporal && isa_features_.PrefersLockedAddSynchronization()) {
+ assembler_.lock()->addl(Address(ESP, 0), Immediate(0));
+ } else {
+ assembler_.mfence();
+ }
+ }
+
+
private:
// Factored implementation of GenerateFieldLoadWithBakerReadBarrier
// and GenerateArrayLoadWithBakerReadBarrier.
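Outside of ART's assembler, the two strategies the new MemoryFence() chooses
between look roughly like this (a sketch only, assuming GCC/Clang inline asm;
the 32-bit code above uses ESP, the 64-bit variant below RSP):

// Sketch, not ART code: the fence emitted on each MemoryFence() path.
inline void FenceLockedAdd() {
  // Adds 0 to the top of the stack (a location the thread owns), leaving it
  // unchanged; the LOCK prefix orders prior stores before subsequent loads.
#if defined(__x86_64__)
  __asm__ __volatile__("lock addl $0, (%%rsp)" ::: "memory", "cc");
#else
  __asm__ __volatile__("lock addl $0, (%%esp)" ::: "memory", "cc");
#endif
}

inline void FenceMfence() {
  // Full memory fence; also orders non-temporal stores and device memory.
  __asm__ __volatile__("mfence" ::: "memory");
}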
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 1fee192e36..7cea0a7160 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -3968,7 +3968,7 @@ void InstructionCodeGeneratorX86_64::GenerateMemoryBarrier(MemBarrierKind kind)
*/
switch (kind) {
case MemBarrierKind::kAnyAny: {
- __ mfence();
+ codegen_->MemoryFence();
break;
}
case MemBarrierKind::kAnyStore:
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 7351fed0fa..9397d8f8af 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -17,6 +17,7 @@
#ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_
#define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_
+#include "arch/x86_64/instruction_set_features_x86_64.h"
#include "code_generator.h"
#include "dex/compiler_enums.h"
#include "driver/compiler_options.h"
@@ -423,6 +424,18 @@ class CodeGeneratorX86_64 : public CodeGenerator {
int64_t v,
HInstruction* instruction);
+ // Ensure that prior stores complete to memory before subsequent loads.
+ // The locked add implementation will avoid serializing device memory, but will
+ // touch (but not change) the top of the stack. The locked add should not be used for
+ // ordering non-temporal stores.
+ void MemoryFence(bool force_mfence = false) {
+ if (!force_mfence && isa_features_.PrefersLockedAddSynchronization()) {
+ assembler_.lock()->addl(Address(CpuRegister(RSP), 0), Immediate(0));
+ } else {
+ assembler_.mfence();
+ }
+ }
+
private:
struct PcRelativeDexCacheAccessInfo {
PcRelativeDexCacheAccessInfo(const DexFile& dex_file, uint32_t element_off)
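The non_temporal/force_mfence escape hatch exists because a locked add does
not order non-temporal stores. An illustration of that caveat (an assumed
example using SSE2 intrinsics, not code from this commit):

// MOVNTI-style stores bypass the ordinary write-back path, so only an
// mfence (or sfence) guarantees they are visible before the flag store.
#include <emmintrin.h>

void PublishNonTemporal(int* slot, volatile int* flag) {
  _mm_stream_si32(slot, 42);  // non-temporal store
  _mm_mfence();               // a locked add would NOT be sufficient here
  *flag = 1;                  // readers that see flag == 1 also see *slot
}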
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index fd454d8322..74ade7c420 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -2005,7 +2005,7 @@ static void GenUnsafePut(LocationSummary* locations,
}
if (is_volatile) {
- __ mfence();
+ codegen->MemoryFence();
}
if (type == Primitive::kPrimNot) {
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index ac9b24503c..d519034263 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2059,7 +2059,7 @@ static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool
}
if (is_volatile) {
- __ mfence();
+ codegen->MemoryFence();
}
if (type == Primitive::kPrimNot) {
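Putting it together, the volatile-put call sites above now reduce to the
following shape (a simplified runtime sketch; the feature query replaces
X86InstructionSetFeatures::PrefersLockedAddSynchronization() with a plain
bool, and the names are illustrative, not ART's):

// Sketch of the dispatch GenUnsafePut now performs for a volatile put.
struct CodegenSketch {
  bool prefers_locked_add;  // true on atom & silvermont variants

  void MemoryFence(bool force_mfence = false) {
    if (!force_mfence && prefers_locked_add) {
      __asm__ __volatile__("lock addl $0, (%%rsp)" ::: "memory", "cc");
    } else {
      __asm__ __volatile__("mfence" ::: "memory");
    }
  }
};

void VolatilePutInt(CodegenSketch* codegen, volatile int* addr, int value) {
  *addr = value;            // the store itself
  codegen->MemoryFence();   // kAnyAny barrier: store ordered before later loads
}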