[ARM] barriers: improve xchg, bitops and atomic SMP barriers

Mathieu Desnoyers pointed out that the ARM barriers were lacking:

- cmpxchg, xchg and atomic add return need memory barriers on
  architectures which can reorder the relative order in which memory
  read/writes can be seen between CPUs, which seems to include recent
  ARM architectures. Those barriers are currently missing on ARM.

- test_and_xxx_bit were missing SMP barriers.

So put these barriers in.  Provide separate atomic_add/atomic_sub
operations which do not require barriers.

Reported-Reviewed-and-Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index ee99723..16b52f3 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -44,11 +44,29 @@
 	: "cc");
 }
 
+static inline void atomic_add(int i, atomic_t *v)
+{
+	unsigned long tmp;
+	int result;
+
+	__asm__ __volatile__("@ atomic_add\n"
+"1:	ldrex	%0, [%2]\n"
+"	add	%0, %0, %3\n"
+"	strex	%1, %0, [%2]\n"
+"	teq	%1, #0\n"
+"	bne	1b"
+	: "=&r" (result), "=&r" (tmp)
+	: "r" (&v->counter), "Ir" (i)
+	: "cc");
+}
+
 static inline int atomic_add_return(int i, atomic_t *v)
 {
 	unsigned long tmp;
 	int result;
 
+	smp_mb();
+
 	__asm__ __volatile__("@ atomic_add_return\n"
 "1:	ldrex	%0, [%2]\n"
 "	add	%0, %0, %3\n"
@@ -59,14 +77,34 @@
 	: "r" (&v->counter), "Ir" (i)
 	: "cc");
 
+	smp_mb();
+
 	return result;
 }
 
+static inline void atomic_sub(int i, atomic_t *v)
+{
+	unsigned long tmp;
+	int result;
+
+	__asm__ __volatile__("@ atomic_sub\n"
+"1:	ldrex	%0, [%2]\n"
+"	sub	%0, %0, %3\n"
+"	strex	%1, %0, [%2]\n"
+"	teq	%1, #0\n"
+"	bne	1b"
+	: "=&r" (result), "=&r" (tmp)
+	: "r" (&v->counter), "Ir" (i)
+	: "cc");
+}
+
 static inline int atomic_sub_return(int i, atomic_t *v)
 {
 	unsigned long tmp;
 	int result;
 
+	smp_mb();
+
 	__asm__ __volatile__("@ atomic_sub_return\n"
 "1:	ldrex	%0, [%2]\n"
 "	sub	%0, %0, %3\n"
@@ -77,6 +115,8 @@
 	: "r" (&v->counter), "Ir" (i)
 	: "cc");
 
+	smp_mb();
+
 	return result;
 }
 
@@ -84,6 +124,8 @@
 {
 	unsigned long oldval, res;
 
+	smp_mb();
+
 	do {
 		__asm__ __volatile__("@ atomic_cmpxchg\n"
 		"ldrex	%1, [%2]\n"
@@ -95,6 +137,8 @@
 		    : "cc");
 	} while (res);
 
+	smp_mb();
+
 	return oldval;
 }
 
@@ -135,6 +179,7 @@
 
 	return val;
 }
+#define atomic_add(i, v)	(void) atomic_add_return(i, v)
 
 static inline int atomic_sub_return(int i, atomic_t *v)
 {
@@ -148,6 +193,7 @@
 
 	return val;
 }
+#define atomic_sub(i, v)	(void) atomic_sub_return(i, v)
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
@@ -187,10 +233,8 @@
 }
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
-#define atomic_add(i, v)	(void) atomic_add_return(i, v)
-#define atomic_inc(v)		(void) atomic_add_return(1, v)
-#define atomic_sub(i, v)	(void) atomic_sub_return(i, v)
-#define atomic_dec(v)		(void) atomic_sub_return(1, v)
+#define atomic_inc(v)		atomic_add(1, v)
+#define atomic_dec(v)		atomic_sub(1, v)
 
 #define atomic_inc_and_test(v)	(atomic_add_return(1, v) == 0)
 #define atomic_dec_and_test(v)	(atomic_sub_return(1, v) == 0)
@@ -200,11 +244,10 @@
 
 #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0)
 
-/* Atomic operations are already serializing on ARM */
-#define smp_mb__before_atomic_dec()	barrier()
-#define smp_mb__after_atomic_dec()	barrier()
-#define smp_mb__before_atomic_inc()	barrier()
-#define smp_mb__after_atomic_inc()	barrier()
+#define smp_mb__before_atomic_dec()	smp_mb()
+#define smp_mb__after_atomic_dec()	smp_mb()
+#define smp_mb__before_atomic_inc()	smp_mb()
+#define smp_mb__after_atomic_inc()	smp_mb()
 
 #include <asm-generic/atomic.h>
 #endif