X86/X86_64: Switch to locked add from mfence
I finally received answers about the performance of a locked add
versus mfence for Java memory semantics: a locked add has been faster
than mfence on every processor since the Pentium 4. Accordingly,
synchronization now always uses a locked add, and the choice is no
longer modeled as an instruction set feature.
Also add support in the optimizing compiler for the kNTStoreStore
barrier type, which is needed after non-temporal moves.
Change-Id: Ib47c2fd64c2ff2128ad677f1f39c73444afb8e94
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
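
For context, both idioms implement the StoreLoad barrier that Java
volatile semantics require on x86. A minimal standalone sketch of the
two options (GCC/Clang inline assembly, not ART code; the function
names are hypothetical):

    // mfence serializes all loads and stores, including weakly-ordered
    // ones, which makes it the more expensive choice on most cores.
    inline void storeload_fence_mfence() {
      asm volatile("mfence" ::: "memory");
    }

    // Any locked read-modify-write is a full barrier for ordinary
    // write-back memory. Adding 0 to the top of the stack touches a
    // cache line that is almost always already exclusive in L1, so it
    // is cheap and has no visible side effect.
    inline void storeload_fence_locked_add() {
    #if defined(__x86_64__)
      asm volatile("lock addl $0, (%%rsp)" ::: "memory", "cc");
    #else
      asm volatile("lock addl $0, (%%esp)" ::: "memory", "cc");
    #endif
    }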
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 304cf08..1fd0cbe 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -4263,8 +4263,10 @@
// nop
break;
}
- default:
- LOG(FATAL) << "Unexpected memory barrier " << kind;
+ case MemBarrierKind::kNTStoreStore:
+ // Non-Temporal Store/Store needs an explicit fence.
+ MemoryFence(/* non-temporal */ true);
+ break;
}
}
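
For readers without the full file: after this hunk, the x86
GenerateMemoryBarrier switch reads roughly as below (the kAnyAny and
nop cases are reconstructed from the context lines above; treat this
as a sketch, not the verbatim file):

    void CodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) {
      // On x86's TSO model only StoreLoad (kAnyAny) needs a real fence;
      // LoadAny/AnyStore/StoreStore ordering comes for free.
      switch (kind) {
        case MemBarrierKind::kAnyAny: {
          MemoryFence();
          break;
        }
        case MemBarrierKind::kAnyStore:
        case MemBarrierKind::kLoadAny:
        case MemBarrierKind::kStoreStore: {
          // nop
          break;
        }
        case MemBarrierKind::kNTStoreStore:
          // Non-Temporal Store/Store needs an explicit fence.
          MemoryFence(/* non-temporal */ true);
          break;
      }
    }

Replacing the default case also makes the switch exhaustive over
MemBarrierKind, so a future addition to the enum will produce a
compiler warning here rather than a runtime LOG(FATAL).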
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 69a6253..b87bf45 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -538,7 +538,7 @@
// touch (but not change) the top of the stack.
// The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores.
void MemoryFence(bool non_temporal = false) {
- if (!non_temporal && isa_features_.PrefersLockedAddSynchronization()) {
+ if (!non_temporal) {
assembler_.lock()->addl(Address(ESP, 0), Immediate(0));
} else {
assembler_.mfence();
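
Call sites are unchanged; only the feature test inside MemoryFence is
gone. A hypothetical call-site sketch (codegen standing in for a
CodeGeneratorX86*):

    // StoreLoad barrier for a Java volatile access: cheap locked add.
    codegen->MemoryFence();

    // After MOVNTI/MOVNTDQ non-temporal stores: force the real mfence,
    // since a locked add does not order non-temporal stores.
    codegen->MemoryFence(/* non_temporal */ true);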
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 056b69b..225f547 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -4058,8 +4058,10 @@
// nop
break;
}
- default:
- LOG(FATAL) << "Unexpected memory barier " << kind;
+ case MemBarrierKind::kNTStoreStore:
+ // Non-Temporal Store/Store needs an explicit fence.
+ MemoryFence(/* non-temporal */ true);
+ break;
}
}
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index d7ce7c6..ce805cf 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -509,10 +509,10 @@
// Ensure that prior stores complete to memory before subsequent loads.
// The locked add implementation will avoid serializing device memory, but will
- // touch (but not change) the top of the stack. The locked add should not be used for
- // ordering non-temporal stores.
+ // touch (but not change) the top of the stack.
+ // The 'force_mfence' parameter should be used to ensure ordering of non-temporal stores.
void MemoryFence(bool force_mfence = false) {
- if (!force_mfence && isa_features_.PrefersLockedAddSynchronization()) {
+ if (!force_mfence) {
assembler_.lock()->addl(Address(CpuRegister(RSP), 0), Immediate(0));
} else {
assembler_.mfence();
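
The mfence path survives because a locked RMW only orders ordinary
write-back accesses; non-temporal stores bypass the cache and need a
fencing instruction. A standalone sketch of the pattern kNTStoreStore
protects (plain SSE2 intrinsics, not ART code):

    #include <emmintrin.h>  // _mm_stream_si32, _mm_mfence

    void publish_nt(int* data, int* ready_flag) {
      _mm_stream_si32(data, 42);  // non-temporal store, bypasses cache
      _mm_mfence();               // a locked add would NOT order this
      *ready_flag = 1;            // ordinary store, now ordered after it
    }

An sfence would also suffice for store/store ordering, but reusing
MemoryFence(true) keeps the code generator to a single fence helper.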
diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc
index b97a8db..0093e82 100644
--- a/runtime/arch/x86/instruction_set_features_x86.cc
+++ b/runtime/arch/x86/instruction_set_features_x86.cc
@@ -45,11 +45,6 @@
"silvermont",
};
-static constexpr const char* x86_variants_prefer_locked_add_sync[] = {
- "atom",
- "silvermont",
-};
-
static constexpr const char* x86_variants_with_popcnt[] = {
"silvermont",
};
@@ -69,10 +64,6 @@
bool has_AVX = false;
bool has_AVX2 = false;
- bool prefers_locked_add = FindVariantInArray(x86_variants_prefer_locked_add_sync,
- arraysize(x86_variants_prefer_locked_add_sync),
- variant);
-
bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt,
arraysize(x86_variants_with_popcnt),
variant);
@@ -86,10 +77,10 @@
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -101,16 +92,13 @@
bool has_SSE4_2 = (bitmap & kSse4_2Bitfield) != 0;
bool has_AVX = (bitmap & kAvxBitfield) != 0;
bool has_AVX2 = (bitmap & kAvxBitfield) != 0;
- bool prefers_locked_add = (bitmap & kPrefersLockedAdd) != 0;
bool has_POPCNT = (bitmap & kPopCntBitfield) != 0;
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2,
- has_AVX, has_AVX2, prefers_locked_add,
- has_POPCNT);
+ has_AVX, has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2,
- has_AVX, has_AVX2, prefers_locked_add,
- has_POPCNT);
+ has_AVX, has_AVX2, has_POPCNT);
}
}
@@ -147,9 +135,6 @@
const bool has_AVX2 = true;
#endif
- // No #define for memory synchronization preference.
- const bool prefers_locked_add = false;
-
#ifndef __POPCNT__
const bool has_POPCNT = false;
#else
@@ -158,10 +143,10 @@
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -174,8 +159,6 @@
bool has_SSE4_2 = false;
bool has_AVX = false;
bool has_AVX2 = false;
- // No cpuinfo for memory synchronization preference.
- const bool prefers_locked_add = false;
bool has_POPCNT = false;
std::ifstream in("/proc/cpuinfo");
@@ -217,10 +200,10 @@
}
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
@@ -245,7 +228,6 @@
(has_SSE4_2_ == other_as_x86->has_SSE4_2_) &&
(has_AVX_ == other_as_x86->has_AVX_) &&
(has_AVX2_ == other_as_x86->has_AVX2_) &&
- (prefers_locked_add_ == other_as_x86->prefers_locked_add_) &&
(has_POPCNT_ == other_as_x86->has_POPCNT_);
}
@@ -256,7 +238,6 @@
(has_SSE4_2_ ? kSse4_2Bitfield : 0) |
(has_AVX_ ? kAvxBitfield : 0) |
(has_AVX2_ ? kAvx2Bitfield : 0) |
- (prefers_locked_add_ ? kPrefersLockedAdd : 0) |
(has_POPCNT_ ? kPopCntBitfield : 0);
}
@@ -292,11 +273,6 @@
} else {
result += ",-avx2";
}
- if (prefers_locked_add_) {
- result += ",lock_add";
- } else {
- result += ",-lock_add";
- }
if (has_POPCNT_) {
result += ",popcnt";
} else {
@@ -313,7 +289,6 @@
bool has_SSE4_2 = has_SSE4_2_;
bool has_AVX = has_AVX_;
bool has_AVX2 = has_AVX2_;
- bool prefers_locked_add = prefers_locked_add_;
bool has_POPCNT = has_POPCNT_;
for (auto i = features.begin(); i != features.end(); i++) {
std::string feature = Trim(*i);
@@ -337,10 +312,6 @@
has_AVX2 = true;
} else if (feature == "-avx2") {
has_AVX2 = false;
- } else if (feature == "lock_add") {
- prefers_locked_add = true;
- } else if (feature == "-lock_add") {
- prefers_locked_add = false;
} else if (feature == "popcnt") {
has_POPCNT = true;
} else if (feature == "-popcnt") {
@@ -352,10 +323,10 @@
}
if (x86_64) {
return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
} else {
return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT);
+ has_AVX2, has_POPCNT);
}
}
diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h
index 1819654..2aa8ae6 100644
--- a/runtime/arch/x86/instruction_set_features_x86.h
+++ b/runtime/arch/x86/instruction_set_features_x86.h
@@ -60,8 +60,6 @@
bool HasSSE4_1() const { return has_SSE4_1_; }
- bool PrefersLockedAddSynchronization() const { return prefers_locked_add_; }
-
bool HasPopCnt() const { return has_POPCNT_; }
protected:
@@ -77,16 +75,13 @@
bool x86_64, std::string* error_msg) const;
X86InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2,
- bool has_AVX, bool has_AVX2,
- bool prefers_locked_add,
- bool has_POPCNT)
+ bool has_AVX, bool has_AVX2, bool has_POPCNT)
: InstructionSetFeatures(smp),
has_SSSE3_(has_SSSE3),
has_SSE4_1_(has_SSE4_1),
has_SSE4_2_(has_SSE4_2),
has_AVX_(has_AVX),
has_AVX2_(has_AVX2),
- prefers_locked_add_(prefers_locked_add),
has_POPCNT_(has_POPCNT) {
}
@@ -99,8 +94,7 @@
kSse4_2Bitfield = 8,
kAvxBitfield = 16,
kAvx2Bitfield = 32,
- kPrefersLockedAdd = 64,
- kPopCntBitfield = 128,
+ kPopCntBitfield = 64,
};
const bool has_SSSE3_; // x86 128bit SIMD - Supplemental SSE.
@@ -108,7 +102,6 @@
const bool has_SSE4_2_; // x86 128bit SIMD SSE4.2.
const bool has_AVX_; // x86 256bit SIMD AVX.
const bool has_AVX2_; // x86 256bit SIMD AVX 2.0.
- const bool prefers_locked_add_; // x86 use locked add for memory synchronization.
const bool has_POPCNT_; // x86 population count
DISALLOW_COPY_AND_ASSIGN(X86InstructionSetFeatures);
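
The bit renumbering explains the test-value changes below: with
kPrefersLockedAdd (64) gone, kPopCntBitfield drops from 128 to 64 and
the packed bitmaps shrink. A quick worked check (the low-bit names are
assumed; only the bits from 8 upward appear in this hunk):

    kSmpBitfield (1) + kSsse3Bitfield (2)                  =  3  // was 67 with lock_add = 64
    1 + 2 + kSse4_1Bitfield (4) + kSse4_2Bitfield (8) + 64 = 79  // was 207 with popcnt = 128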
diff --git a/runtime/arch/x86/instruction_set_features_x86_test.cc b/runtime/arch/x86/instruction_set_features_x86_test.cc
index a062c12..9e154c6 100644
--- a/runtime/arch/x86/instruction_set_features_x86_test.cc
+++ b/runtime/arch/x86/instruction_set_features_x86_test.cc
@@ -27,7 +27,7 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_features->GetFeatureString().c_str());
EXPECT_EQ(x86_features->AsBitmap(), 1U);
}
@@ -40,9 +40,9 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt",
+ EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_features->AsBitmap(), 67U);
+ EXPECT_EQ(x86_features->AsBitmap(), 3U);
// Build features for a 32-bit x86 default processor.
std::unique_ptr<const InstructionSetFeatures> x86_default_features(
@@ -50,7 +50,7 @@
ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_default_features->GetFeatureString().c_str());
EXPECT_EQ(x86_default_features->AsBitmap(), 1U);
@@ -60,9 +60,9 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt",
+ EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_64_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_64_features->AsBitmap(), 67U);
+ EXPECT_EQ(x86_64_features->AsBitmap(), 3U);
EXPECT_FALSE(x86_64_features->Equals(x86_features.get()));
EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get()));
@@ -77,9 +77,9 @@
ASSERT_TRUE(x86_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_features->Equals(x86_features.get()));
- EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt",
+ EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt",
x86_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_features->AsBitmap(), 207U);
+ EXPECT_EQ(x86_features->AsBitmap(), 79U);
// Build features for a 32-bit x86 default processor.
std::unique_ptr<const InstructionSetFeatures> x86_default_features(
@@ -87,7 +87,7 @@
ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86);
EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_default_features->GetFeatureString().c_str());
EXPECT_EQ(x86_default_features->AsBitmap(), 1U);
@@ -97,9 +97,9 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt",
+ EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt",
x86_64_features->GetFeatureString().c_str());
- EXPECT_EQ(x86_64_features->AsBitmap(), 207U);
+ EXPECT_EQ(x86_64_features->AsBitmap(), 79U);
EXPECT_FALSE(x86_64_features->Equals(x86_features.get()));
EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get()));
diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64.h b/runtime/arch/x86_64/instruction_set_features_x86_64.h
index aba7234..0840f89 100644
--- a/runtime/arch/x86_64/instruction_set_features_x86_64.h
+++ b/runtime/arch/x86_64/instruction_set_features_x86_64.h
@@ -74,10 +74,9 @@
private:
X86_64InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2,
- bool has_AVX, bool has_AVX2, bool prefers_locked_add,
- bool has_POPCNT)
+ bool has_AVX, bool has_AVX2, bool has_POPCNT)
: X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX,
- has_AVX2, prefers_locked_add, has_POPCNT) {
+ has_AVX2, has_POPCNT) {
}
friend class X86InstructionSetFeatures;
diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
index 78aeacf..f2b2cd8 100644
--- a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
+++ b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc
@@ -27,7 +27,7 @@
ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg;
EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64);
EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get()));
- EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt",
+ EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt",
x86_64_features->GetFeatureString().c_str());
EXPECT_EQ(x86_64_features->AsBitmap(), 1U);
}
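
To sanity-check the performance claim on a particular machine, a
throwaway micro-benchmark along these lines can compare the two fences
(standalone C++, x86-64 only, not part of this change; absolute
numbers will vary by core):

    #include <chrono>
    #include <cstdio>

    template <typename Fence>
    static double NsPerOp(Fence fence, long iters) {
      auto start = std::chrono::steady_clock::now();
      for (long i = 0; i < iters; ++i) {
        fence();  // measure the fence in a tight loop
      }
      std::chrono::duration<double, std::nano> dt =
          std::chrono::steady_clock::now() - start;
      return dt.count() / iters;
    }

    int main() {
      const long iters = 100000000;
      std::printf("mfence:     %.2f ns/op\n", NsPerOp(
          [] { asm volatile("mfence" ::: "memory"); }, iters));
      std::printf("locked add: %.2f ns/op\n", NsPerOp(
          [] { asm volatile("lock addl $0, (%%rsp)" ::: "memory", "cc"); },
          iters));
      return 0;
    }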