Implement Intel QuasiAtomics.

Don't use striped locks for 64-bit atomics on x86.
Replace QuasiAtomic::Swap with QuasiAtomic::Write, which matches our
current use of Swap and is closer to Intel's implementation.
Report that MIPS doesn't support 64-bit compare-and-exchange in
AtomicLong.VMSupportsCS8.
Set the SSE2 flag for host and target Intel ART builds, as our codegen
assumes it.
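
A minimal usage sketch of the revised interface (illustrative only, not
part of this change; the local variable names below are placeholders):

  QuasiAtomic::Startup();  // sets up the striped mutexes used on the fallback path
  volatile int64_t field = 0;
  QuasiAtomic::Write64(&field, 42);          // 64-bit store without tearing
  int64_t v = QuasiAtomic::Read64(&field);   // 64-bit load without tearing
  bool swapped = QuasiAtomic::Cas64(v, v + 1, &field);  // true on success, not 0
  if (QuasiAtomic::LongAtomicsUseMutexes()) {
    // Mutex fallback in use (e.g. MIPS); AtomicLong.VMSupportsCS8 reports false.
  }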

Change-Id: Ic1cd5c3b06838e42c6f94e0dd91e77a2d0bb5868
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 2556fa2..f2f38e1 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -87,7 +87,11 @@
 
 ART_HOST_CFLAGS := $(art_cflags) -DANDROID_SMP=1 -DART_BASE_ADDRESS=$(IMG_HOST_BASE_ADDRESS)
 # The host GCC isn't necessarily new enough to support -Wthread-safety (GCC 4.4).
-ART_HOST_CFLAGS := $(filter-out -Wthread-safety,$(ART_HOST_CFLAGS))
+ART_HOST_CFLAGS := $(filter-out -Wthread-safety,$(ART_HOST_CFLAGS)) -msse2
 
 ART_TARGET_CFLAGS := $(art_cflags) -DART_TARGET -DART_BASE_ADDRESS=$(IMG_TARGET_BASE_ADDRESS)
+# Append -msse2 after the := assignment above so it is not clobbered.
+ifeq ($(TARGET_ARCH),x86)
+ART_TARGET_CFLAGS += -msse2
+endif
 ifeq ($(TARGET_CPU_SMP),true)
diff --git a/src/atomic.cc b/src/atomic.cc
index 5bbbb4f..e7bab09 100644
--- a/src/atomic.cc
+++ b/src/atomic.cc
@@ -16,84 +16,94 @@
 
 #include "atomic.h"
 
-#include <pthread.h>
+#define NEED_SWAP_MUTEXES !defined(__arm__) && !defined(__i386__)
 
+#if NEED_SWAP_MUTEXES
 #include <vector>
-
 #include "base/mutex.h"
 #include "base/stl_util.h"
 #include "base/stringprintf.h"
 #include "thread.h"
-
-#if defined(__APPLE__)
-#include <libkern/OSAtomic.h>
-#endif
-#if defined(__arm__)
-#include <machine/cpu-features.h>
 #endif
 
 namespace art {
 
-#if defined(HAVE_MACOSX_IPC)
-#define NEED_MAC_QUASI_ATOMICS 1
+#if NEED_SWAP_MUTEXES
+// We stripe across a bunch of different mutexes to reduce contention.
+static const size_t kSwapMutexCount = 32;
+static std::vector<Mutex*>* gSwapMutexes;
 
-#elif defined(__i386__) || defined(__x86_64__)
-#define NEED_PTHREADS_QUASI_ATOMICS 1
-
-#elif defined(__mips__)
-#define NEED_PTHREADS_QUASI_ATOMICS 1
-
-#elif defined(__arm__)
-
-#if defined(__ARM_HAVE_LDREXD)
-#define NEED_ARM_LDREXD_QUASI_ATOMICS 1
-#else
-#define NEED_PTHREADS_QUASI_ATOMICS 1
+static Mutex& GetSwapMutex(const volatile int64_t* addr) {
+  return *(*gSwapMutexes)[((unsigned)(void*)(addr) >> 3U) % kSwapMutexCount];
+}
 #endif
 
-#else
-#error "QuasiAtomic unsupported on this platform"
+void QuasiAtomic::Startup() {
+#if NEED_SWAP_MUTEXES
+  gSwapMutexes = new std::vector<Mutex*>;
+  for (size_t i = 0; i < kSwapMutexCount; ++i) {
+    gSwapMutexes->push_back(new Mutex(StringPrintf("QuasiAtomic stripe %d", i).c_str()));
+  }
 #endif
-
-// *****************************************************************************
-
-#if NEED_ARM_LDREXD_QUASI_ATOMICS
-
-static inline int64_t QuasiAtomicSwap64Impl(int64_t new_value, volatile int64_t* addr) {
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Swap64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "strexd     %1, %4, %H4, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "r" (new_value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-  return prev;
 }
 
-int64_t QuasiAtomic::Swap64(int64_t new_value, volatile int64_t* addr) {
-  return QuasiAtomicSwap64Impl(new_value, addr);
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t new_value, volatile int64_t* addr) {
-  ANDROID_MEMBAR_STORE();
-  int64_t old_value = QuasiAtomicSwap64Impl(new_value, addr);
-  ANDROID_MEMBAR_FULL();
-  return old_value;
+void QuasiAtomic::Shutdown() {
+#if NEED_SWAP_MUTEXES
+  STLDeleteElements(gSwapMutexes);
+  delete gSwapMutexes;
+#endif
 }
 
 int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
   int64_t value;
+#if defined(__arm__)
+  // Exclusive loads are defined not to tear, so clearing the exclusive state isn't necessary. If
+  // we have LPAE (such as on Cortex-A15) then an ldrd would suffice.
   __asm__ __volatile__("@ QuasiAtomic::Read64\n"
       "ldrexd     %0, %H0, [%1]"
       : "=&r" (value)
       : "r" (addr));
+#elif defined(__i386__)
+  __asm__ __volatile__(
+      "movq     %1, %0\n"
+      : "=x" (value)
+      : "m" (*addr));
+#else
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  value = *addr;
+#endif
   return value;
 }
 
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+void QuasiAtomic::Write64(volatile int64_t* addr, int64_t value) {
+#if defined(__arm__)
+  // The write is done as a swap so that the cache line is in the exclusive state for the store.
+  // If we know the ARM architecture has LPAE (such as Cortex-A15), this isn't necessary and a
+  // strd will suffice.
+  int64_t prev;
+  int status;
+  do {
+    __asm__ __volatile__("@ QuasiAtomic::Write64\n"
+        "ldrexd     %0, %H0, [%3]\n"
+        "strexd     %1, %4, %H4, [%3]"
+        : "=&r" (prev), "=&r" (status), "+m"(*addr)
+        : "r" (addr), "r" (value)
+        : "cc");
+  } while (__builtin_expect(status != 0, 0));
+#elif defined(__i386__)
+  __asm__ __volatile__(
+      "movq     %1, %0"
+      : "=m" (*addr)
+      : "x" (value));
+#else
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  *addr = value;
+#endif
+}
+
+
+bool QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+#if defined(__arm__)
   int64_t prev;
   int status;
   do {
@@ -108,103 +118,37 @@
         : "cc");
   } while (__builtin_expect(status != 0, 0));
-  return prev != old_value;
+  return prev == old_value;
-}
-
-#endif
-
-// *****************************************************************************
-
-#if NEED_MAC_QUASI_ATOMICS
-
-static inline int64_t QuasiAtomicSwap64Impl(int64_t value, volatile int64_t* addr) {
-  int64_t old_value;
-  do {
-    old_value = *addr;
-  } while (QuasiAtomic::Cas64(old_value, value, addr));
-  return old_value;
-}
-
-int64_t QuasiAtomic::Swap64(int64_t value, volatile int64_t* addr) {
-  return QuasiAtomicSwap64Impl(value, addr);
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t value, volatile int64_t* addr) {
-  ANDROID_MEMBAR_STORE();
-  int64_t old_value = QuasiAtomicSwap64Impl(value, addr);
-  // TUNING: barriers can be avoided on some architectures.
-  ANDROID_MEMBAR_FULL();
-  return old_value;
-}
-
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  return OSAtomicAdd64Barrier(0, const_cast<volatile int64_t*>(addr));
-}
-
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-  return OSAtomicCompareAndSwap64Barrier(old_value, new_value, const_cast<int64_t*>(addr)) == 0;
-}
-
-#endif
-
-// *****************************************************************************
-
-#if NEED_PTHREADS_QUASI_ATOMICS
-
-// In the absence of a better implementation, we implement the 64-bit atomic
-// operations through mutex locking.
-
-// We stripe across a bunch of different mutexes to reduce contention.
-static const size_t kSwapLockCount = 32;
-static std::vector<Mutex*>* gSwapLocks;
-
-void QuasiAtomic::Startup() {
-  gSwapLocks = new std::vector<Mutex*>;
-  for (size_t i = 0; i < kSwapLockCount; ++i) {
-    gSwapLocks->push_back(new Mutex(StringPrintf("QuasiAtomic stripe %d", i).c_str()));
-  }
-}
-
-void QuasiAtomic::Shutdown() {
-  STLDeleteElements(gSwapLocks);
-  delete gSwapLocks;
-}
-
-static inline Mutex& GetSwapLock(const volatile int64_t* addr) {
-  return *(*gSwapLocks)[((unsigned)(void*)(addr) >> 3U) % kSwapLockCount];
-}
-
-int64_t QuasiAtomic::Swap64(int64_t value, volatile int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  int64_t old_value = *addr;
-  *addr = value;
-  return old_value;
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t value, volatile int64_t* addr) {
-  // Same as QuasiAtomicSwap64 - mutex handles barrier.
-  return QuasiAtomic::Swap64(value, addr);
-}
-
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  if (*addr == old_value) {
-    *addr  = new_value;
-    return 0;
-  }
-  return 1;
-}
-
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  return *addr;
-}
-
+#elif defined(__i386__)
+  // cmpxchg8b implicitly uses %ebx which is also the PIC register.
+  int8_t status;
+  __asm__ __volatile__ (
+      "pushl          %%ebx\n"
+      "movl           (%3), %%ebx\n"
+      "movl           4(%3), %%ecx\n"
+      "lock cmpxchg8b %1\n"
+      "sete           %0\n"
+      "popl           %%ebx"
+      : "=R" (status), "+m" (*addr)
+      : "A"(old_value), "D" (&new_value)
+      : "%ecx"
+      );
+  return status != 0;
 #else
-
-// The other implementations don't need any special setup.
-void QuasiAtomic::Startup() {}
-void QuasiAtomic::Shutdown() {}
-
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  if (*addr == old_value) {
+    *addr = new_value;
+    return true;
+  }
+  return false;
 #endif
+}
+
+bool QuasiAtomic::LongAtomicsUseMutexes() {
+#if NEED_SWAP_MUTEXES
+  return true;
+#else
+  return false;
+#endif
+}
 
 }  // namespace art
diff --git a/src/atomic.h b/src/atomic.h
index c69a9d1..d340dc5 100644
--- a/src/atomic.h
+++ b/src/atomic.h
@@ -20,8 +20,6 @@
 #include <stdint.h>
 
 #include "base/macros.h"
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
 
 namespace art {
 
@@ -31,28 +29,24 @@
 // non-quasiatomic operations on the same address, nor about
 // quasiatomic operations that are performed on partially-overlapping
 // memory.
-//
-// Only the "Sync" functions provide a memory barrier.
 class QuasiAtomic {
  public:
   static void Startup();
 
   static void Shutdown();
 
-  // Swaps the 64-bit value at "addr" with "value".  Returns the previous
-  // value. No memory barriers.
-  static int64_t Swap64(int64_t value, volatile int64_t* addr);
-
-  // Swaps the 64-bit value at "addr" with "value".  Returns the previous
-  // value. Provides memory barriers.
-  static int64_t Swap64Sync(int64_t value, volatile int64_t* addr);
-
-  // Reads the 64-bit value at "addr".
+  // Reads the 64-bit value at "addr" without tearing.
   static int64_t Read64(volatile const int64_t* addr);
 
-  // If the value at "addr" is equal to "old_value", replace it with "new_value"
-  // and return 0. Otherwise, don't swap, and return nonzero.
-  static int Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+  // Writes the 64-bit "value" to "addr" without tearing.
+  static void Write64(volatile int64_t* addr, int64_t value);
+
+  // Atomically compare the value at "addr" to "old_value"; if they are equal, replace it with
+  // "new_value" and return true. Otherwise, don't swap, and return false.
+  static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+
+  // Does the architecture provide reasonable atomic long operations or do we fall back on mutexes?
+  static bool LongAtomicsUseMutexes();
 
  private:
   DISALLOW_COPY_AND_ASSIGN(QuasiAtomic);
diff --git a/src/jdwp/jdwp_handler.cc b/src/jdwp/jdwp_handler.cc
index bd50c61..cb13695 100644
--- a/src/jdwp/jdwp_handler.cc
+++ b/src/jdwp/jdwp_handler.cc
@@ -1842,7 +1842,7 @@
      * so waitForDebugger() doesn't return if we stall for a bit here.
      */
     Dbg::GoActive();
-    QuasiAtomic::Swap64(0, &last_activity_time_ms_);
+    QuasiAtomic::Write64(&last_activity_time_ms_, 0);
   }
 
   /*
@@ -1912,7 +1912,7 @@
    * the initial setup.  Only update if this is a non-DDMS packet.
    */
   if (pHeader->cmdSet != kJDWPDdmCmdSet) {
-    QuasiAtomic::Swap64(MilliTime(), &last_activity_time_ms_);
+    QuasiAtomic::Write64(&last_activity_time_ms_, MilliTime());
   }
 
   /* tell the VM that GC is okay again */
diff --git a/src/native/java_util_concurrent_atomic_AtomicLong.cc b/src/native/java_util_concurrent_atomic_AtomicLong.cc
index 7caa23f..bf92e12 100644
--- a/src/native/java_util_concurrent_atomic_AtomicLong.cc
+++ b/src/native/java_util_concurrent_atomic_AtomicLong.cc
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
+#include "atomic.h"
 #include "jni_internal.h"
-#include "object.h"
 
 namespace art {
 
 static jboolean AtomicLong_VMSupportsCS8(JNIEnv*, jclass) {
-  return JNI_TRUE;
+  return QuasiAtomic::LongAtomicsUseMutexes() ? JNI_FALSE : JNI_TRUE;
 }
 
 static JNINativeMethod gMethods[] = {
diff --git a/src/native/sun_misc_Unsafe.cc b/src/native/sun_misc_Unsafe.cc
index 5dc32b0..cb06a0b 100644
--- a/src/native/sun_misc_Unsafe.cc
+++ b/src/native/sun_misc_Unsafe.cc
@@ -27,7 +27,7 @@
   volatile int32_t* address = reinterpret_cast<volatile int32_t*>(raw_addr);
   // Note: android_atomic_release_cas() returns 0 on success, not failure.
   int result = android_atomic_release_cas(expectedValue, newValue, address);
-  return (result == 0);
+  return (result == 0) ? JNI_TRUE : JNI_FALSE;
 }
 
 static jboolean Unsafe_compareAndSwapLong(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong expectedValue, jlong newValue) {
@@ -36,8 +36,8 @@
   byte* raw_addr = reinterpret_cast<byte*>(obj) + offset;
   volatile int64_t* address = reinterpret_cast<volatile int64_t*>(raw_addr);
-  // Note: android_atomic_cmpxchg() returns 0 on success, not failure.
-  int result = QuasiAtomic::Cas64(expectedValue, newValue, address);
-  return (result == 0);
+  // Note: QuasiAtomic::Cas64() returns true on success, false on failure.
+  bool success = QuasiAtomic::Cas64(expectedValue, newValue, address);
+  return success ? JNI_TRUE : JNI_FALSE;
 }
 
 static jboolean Unsafe_compareAndSwapObject(JNIEnv* env, jobject, jobject javaObj, jlong offset, jobject javaExpectedValue, jobject javaNewValue) {
@@ -53,7 +53,7 @@
   if (result == 0) {
     Runtime::Current()->GetHeap()->WriteBarrierField(obj, MemberOffset(offset), newValue);
   }
-  return (result == 0);
+  return (result == 0) ? JNI_TRUE : JNI_FALSE;
 }
 
 static jint Unsafe_getInt(JNIEnv* env, jobject, jobject javaObj, jlong offset) {
diff --git a/src/object.h b/src/object.h
index 07bcde1..f02e312 100644
--- a/src/object.h
+++ b/src/object.h
@@ -315,7 +315,7 @@
     int64_t* addr = reinterpret_cast<int64_t*>(raw_addr);
     if (UNLIKELY(is_volatile)) {
       ANDROID_MEMBAR_STORE();
-      QuasiAtomic::Swap64(new_value, addr);
+      QuasiAtomic::Write64(addr, new_value);
       // Post-store barrier not required due to use of atomic op or mutex.
     } else {
       *addr = new_value;