sh: GUSA atomic rollback support.

This implements kernel-level atomic rollback built on top of gUSA,
as an alternative non-IRQ based atomicity method. This is generally
a faster method for platforms that are lacking the LL/SC pairs that
SH-4A and later use, and is only supportable on legacy cores.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2dc3b17..f645f84 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -692,7 +692,7 @@
 
 config GUSA
 	def_bool y
-	depends on !SMP
+	depends on !SMP && SUPERH32
 	help
 	  This enables support for gUSA (general UserSpace Atomicity).
 	  This is the default implementation for both UP and non-ll/sc
@@ -704,6 +704,16 @@
 	  This should only be disabled for special cases where alternate
 	  atomicity implementations exist.
 
+config GUSA_RB
+	bool "Implement atomic operations by roll-back (gRB) (EXPERIMENTAL)"
+	depends on GUSA && CPU_SH3 || (CPU_SH4 && !CPU_SH4A)
+	help
+	  Enabling this option will allow the kernel to implement some
+	  atomic operations using a software implemention of load-locked/
+	  store-conditional (LLSC). On machines which do not have hardware
+	  LLSC, this should be more efficient than the other alternative of
+	  disabling insterrupts around the atomic sequence.
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S
index 0d12a12..4004073 100644
--- a/arch/sh/kernel/cpu/sh3/entry.S
+++ b/arch/sh/kernel/cpu/sh3/entry.S
@@ -13,8 +13,9 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpu/mmu_context.h>
 #include <asm/unistd.h>
+#include <asm/cpu/mmu_context.h>
+#include <asm/page.h>
 
 ! NOTE:
 ! GNU as (as of 2.9.1) changes bf/s into bt/s and bra, when the address
@@ -409,6 +410,27 @@
 	! Using k0, k1 for scratch registers (r0_bank1, r1_bank),
 	! save all registers onto stack.
 	!
+
+#ifdef CONFIG_GUSA
+	! Check for roll back gRB (User and Kernel)
+	mov	r15, k0
+	shll	k0
+	bf/s	1f
+	 shll	k0
+	bf/s	1f
+	 stc	spc, k1
+	stc	r0_bank, k0
+	cmp/hs	k0, k1		! test k1 (saved PC) >= k0 (saved r0)
+	bt/s	2f
+	 stc	r1_bank, k1
+
+	add	#-2, k0
+	add	r15, k0
+	ldc	k0, spc		! PC = saved r0 + r15 - 2
+2:	mov	k1, r15		! SP = r1
+1:
+#endif
+
 	stc	ssr, k0		! Is it from kernel space?
 	shll	k0		! Check MD bit (bit30) by shifting it into...
 	shll	k0		!       ...the T bit
diff --git a/arch/sh/kernel/entry-common.S b/arch/sh/kernel/entry-common.S
index 397ac71..926b2e7 100644
--- a/arch/sh/kernel/entry-common.S
+++ b/arch/sh/kernel/entry-common.S
@@ -176,25 +176,6 @@
 	jmp	@r1
 	 lds	r0, pr
 work_resched:
-#if defined(CONFIG_GUSA) && !defined(CONFIG_PREEMPT)
-	! gUSA handling
-	mov.l	@(OFF_SP,r15), r0	! get user space stack pointer
-	mov	r0, r1
-	shll	r0
-	bf/s	1f
-	 shll	r0
-	bf/s	1f
-	 mov	#OFF_PC, r0
-	! 				  SP >= 0xc0000000 : gUSA mark
-	mov.l	@(r0,r15), r2		! get user space PC (program counter)
-	mov.l	@(OFF_R0,r15), r3	! end point
-	cmp/hs	r3, r2			! r2 >= r3? 
-	bt	1f
-	add	r3, r1			! rewind point #2
-	mov.l	r1, @(r0,r15)		! reset PC to rewind point #2
-	!
-1:
-#endif
 	mov.l	1f, r1
 	jsr	@r1				! schedule
 	 nop
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index b483248..9ab1926 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -322,25 +322,6 @@
 	unlazy_fpu(prev, task_pt_regs(prev));
 #endif
 
-#if defined(CONFIG_GUSA) && defined(CONFIG_PREEMPT)
-	{
-		struct pt_regs *regs;
-
-		preempt_disable();
-		regs = task_pt_regs(prev);
-		if (user_mode(regs) && regs->regs[15] >= 0xc0000000) {
-			int offset = (int)regs->regs[15];
-
-			/* Reset stack pointer: clear critical region mark */
-			regs->regs[15] = regs->regs[1];
-			if (regs->pc < regs->regs[0])
-				/* Go to rewind point */
-				regs->pc = regs->regs[0] + offset;
-		}
-		preempt_enable_no_resched();
-	}
-#endif
-
 #ifdef CONFIG_MMU
 	/*
 	 * Restore the kernel mode register
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index ca754fd..f6b5fbf 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -507,24 +507,6 @@
 						ctrl_inw(regs->pc - 4));
 				break;
 		}
-#ifdef CONFIG_GUSA
-	} else {
-		/* gUSA handling */
-		preempt_disable();
-
-		if (regs->regs[15] >= 0xc0000000) {
-			int offset = (int)regs->regs[15];
-
-			/* Reset stack pointer: clear critical region mark */
-			regs->regs[15] = regs->regs[1];
-			if (regs->pc < regs->regs[0])
-				/* Go to rewind point #1 */
-				regs->pc = regs->regs[0] + offset -
-					instruction_size(ctrl_inw(regs->pc-4));
-		}
-
-		preempt_enable_no_resched();
-#endif
 	}
 
 	/* Set up the stack frame */
diff --git a/include/asm-sh/atomic-grb.h b/include/asm-sh/atomic-grb.h
new file mode 100644
index 0000000..4c5b7db
--- /dev/null
+++ b/include/asm-sh/atomic-grb.h
@@ -0,0 +1,169 @@
+#ifndef __ASM_SH_ATOMIC_GRB_H
+#define __ASM_SH_ATOMIC_GRB_H
+
+static inline void atomic_add(int i, atomic_t *v)
+{
+	int tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   add     %2,   %0      \n\t" /* add */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (i)
+		: "memory" , "r0", "r1");
+}
+
+static inline void atomic_sub(int i, atomic_t *v)
+{
+	int tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov     r15,  r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   sub     %2,   %0      \n\t" /* sub */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (i)
+		: "memory" , "r0", "r1");
+}
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+	int tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   add     %2,   %0      \n\t" /* add */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (i)
+		: "memory" , "r0", "r1");
+
+	return tmp;
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+	int tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   sub     %2,   %0      \n\t" /* sub */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (i)
+		: "memory", "r0", "r1");
+
+	return tmp;
+}
+
+static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
+{
+	int tmp;
+	unsigned int _mask = ~mask;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   and     %2,   %0      \n\t" /* add */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (_mask)
+		: "memory" , "r0", "r1");
+}
+
+static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
+{
+	int tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   or      %2,   %0      \n\t" /* or */
+		"   mov.l   %0,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "+r"  (v)
+		: "r"   (mask)
+		: "memory" , "r0", "r1");
+}
+
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+	int ret;
+
+	__asm__ __volatile__ (
+		"   .align 2		\n\t"
+		"   mova     1f,  r0	\n\t"
+		"   nop			\n\t"
+		"   mov     r15,  r1	\n\t"
+		"   mov    #-8,  r15	\n\t"
+		"   mov.l   @%1,  %0	\n\t"
+		"   cmp/eq   %2,  %0	\n\t"
+		"   bf	     1f		\n\t"
+		"   mov.l    %3, @%1	\n\t"
+		"1: mov      r1,  r15	\n\t"
+		: "=&r" (ret)
+		: "r" (v), "r" (old), "r" (new)
+		: "memory" , "r0", "r1" , "t");
+
+	return ret;
+}
+
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int ret;
+	unsigned long tmp;
+
+	__asm__ __volatile__ (
+		"   .align 2		\n\t"
+		"   mova    1f,   r0	\n\t"
+		"   nop			\n\t"
+		"   mov    r15,   r1	\n\t"
+		"   mov    #-12,  r15	\n\t"
+		"   mov.l  @%2,   %1	\n\t"
+		"   mov	    %1,   %0    \n\t"
+		"   cmp/eq  %4,   %0	\n\t"
+		"   bt/s    1f		\n\t"
+		"    add    %3,   %1	\n\t"
+		"   mov.l   %1,  @%2	\n\t"
+		"1: mov     r1,   r15	\n\t"
+		: "=&r" (ret), "=&r" (tmp)
+		: "r" (v), "r" (a), "r" (u)
+		: "memory" , "r0", "r1" , "t");
+
+	return ret != u;
+}
+#endif /* __ASM_SH_ATOMIC_GRB_H */
diff --git a/include/asm-sh/atomic.h b/include/asm-sh/atomic.h
index e12570b..c043ef0 100644
--- a/include/asm-sh/atomic.h
+++ b/include/asm-sh/atomic.h
@@ -17,7 +17,9 @@
 #include <linux/compiler.h>
 #include <asm/system.h>
 
-#ifdef CONFIG_CPU_SH4A
+#if defined(CONFIG_GUSA_RB)
+#include <asm/atomic-grb.h>
+#elif defined(CONFIG_CPU_SH4A)
 #include <asm/atomic-llsc.h>
 #else
 #include <asm/atomic-irq.h>
@@ -44,6 +46,7 @@
 #define atomic_inc(v) atomic_add(1,(v))
 #define atomic_dec(v) atomic_sub(1,(v))
 
+#ifndef CONFIG_GUSA_RB
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int ret;
@@ -58,8 +61,6 @@
 	return ret;
 }
 
-#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
-
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
@@ -73,6 +74,9 @@
 
 	return ret != u;
 }
+#endif
+
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
 /* Atomic operations are already serializing on SH */
diff --git a/include/asm-sh/bitops-grb.h b/include/asm-sh/bitops-grb.h
new file mode 100644
index 0000000..a5907b9
--- /dev/null
+++ b/include/asm-sh/bitops-grb.h
@@ -0,0 +1,169 @@
+#ifndef __ASM_SH_BITOPS_GRB_H
+#define __ASM_SH_BITOPS_GRB_H
+
+static inline void set_bit(int nr, volatile void * addr)
+{
+	int	mask;
+	volatile unsigned int *a = addr;
+	unsigned long tmp;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+                "   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+                "   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+                "   mov.l  @%1,   %0      \n\t" /* load  old value */
+                "   or      %2,   %0      \n\t" /* or */
+                "   mov.l   %0,   @%1     \n\t" /* store new value */
+                "1: mov     r1,   r15     \n\t" /* LOGOUT */
+                : "=&r" (tmp),
+                  "+r"  (a)
+                : "r"   (mask)
+                : "memory" , "r0", "r1");
+}
+
+static inline void clear_bit(int nr, volatile void * addr)
+{
+	int	mask;
+	volatile unsigned int *a = addr;
+        unsigned long tmp;
+
+	a += nr >> 5;
+        mask = ~(1 << (nr & 0x1f));
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+                "   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+                "   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+                "   mov.l  @%1,   %0      \n\t" /* load  old value */
+                "   and     %2,   %0      \n\t" /* and */
+                "   mov.l   %0,   @%1     \n\t" /* store new value */
+                "1: mov     r1,   r15     \n\t" /* LOGOUT */
+                : "=&r" (tmp),
+                  "+r"  (a)
+                : "r"   (mask)
+                : "memory" , "r0", "r1");
+}
+
+static inline void change_bit(int nr, volatile void * addr)
+{
+        int     mask;
+        volatile unsigned int *a = addr;
+        unsigned long tmp;
+
+        a += nr >> 5;
+        mask = 1 << (nr & 0x1f);
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+                "   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+                "   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
+                "   mov.l  @%1,   %0      \n\t" /* load  old value */
+                "   xor     %2,   %0      \n\t" /* xor */
+                "   mov.l   %0,   @%1     \n\t" /* store new value */
+                "1: mov     r1,   r15     \n\t" /* LOGOUT */
+                : "=&r" (tmp),
+                  "+r"  (a)
+                : "r"   (mask)
+                : "memory" , "r0", "r1");
+}
+
+static inline int test_and_set_bit(int nr, volatile void * addr)
+{
+        int     mask, retval;
+	volatile unsigned int *a = addr;
+        unsigned long tmp;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+                "   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+                "   mov   #-14,   r15     \n\t" /* LOGIN: r15 = size */
+                "   mov.l  @%2,   %0      \n\t" /* load old value */
+                "   mov     %0,   %1      \n\t"
+                "   tst     %1,   %3      \n\t" /* T = ((*a & mask) == 0) */
+                "   mov    #-1,   %1      \n\t" /* retvat = -1 */
+                "   negc    %1,   %1      \n\t" /* retval = (mask & *a) != 0 */
+                "   or      %3,   %0      \n\t"
+                "   mov.l   %0,  @%2      \n\t" /* store new value */
+                "1: mov     r1,  r15      \n\t" /* LOGOUT */
+                : "=&r" (tmp),
+                  "=&r" (retval),
+                  "+r"  (a)
+                : "r"   (mask)
+                : "memory" , "r0", "r1" ,"t");
+
+        return retval;
+}
+
+static inline int test_and_clear_bit(int nr, volatile void * addr)
+{
+        int     mask, retval,not_mask;
+        volatile unsigned int *a = addr;
+        unsigned long tmp;
+
+        a += nr >> 5;
+        mask = 1 << (nr & 0x1f);
+
+	not_mask = ~mask;
+
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov   #-14,   r15     \n\t" /* LOGIN */
+		"   mov.l  @%2,   %0      \n\t" /* load old value */
+                "   mov     %0,   %1      \n\t" /* %1 = *a */
+                "   tst     %1,   %3      \n\t" /* T = ((*a & mask) == 0) */
+		"   mov    #-1,   %1      \n\t" /* retvat = -1 */
+                "   negc    %1,   %1      \n\t" /* retval = (mask & *a) != 0 */
+                "   and     %4,   %0      \n\t"
+                "   mov.l   %0,  @%2      \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (tmp),
+		  "=&r" (retval),
+		  "+r"  (a)
+		: "r"   (mask),
+		  "r"   (not_mask)
+		: "memory" , "r0", "r1", "t");
+
+        return retval;
+}
+
+static inline int test_and_change_bit(int nr, volatile void * addr)
+{
+        int     mask, retval;
+        volatile unsigned int *a = addr;
+        unsigned long tmp;
+
+        a += nr >> 5;
+        mask = 1 << (nr & 0x1f);
+
+        __asm__ __volatile__ (
+                "   .align 2              \n\t"
+                "   mova    1f,   r0      \n\t" /* r0 = end point */
+                "   mov    r15,   r1      \n\t" /* r1 = saved sp */
+                "   mov   #-14,   r15     \n\t" /* LOGIN */
+                "   mov.l  @%2,   %0      \n\t" /* load old value */
+                "   mov     %0,   %1      \n\t" /* %1 = *a */
+                "   tst     %1,   %3      \n\t" /* T = ((*a & mask) == 0) */
+                "   mov    #-1,   %1      \n\t" /* retvat = -1 */
+                "   negc    %1,   %1      \n\t" /* retval = (mask & *a) != 0 */
+                "   xor     %3,   %0      \n\t"
+                "   mov.l   %0,  @%2      \n\t" /* store new value */
+                "1: mov     r1,   r15     \n\t" /* LOGOUT */
+                : "=&r" (tmp),
+                  "=&r" (retval),
+                  "+r"  (a)
+                : "r"   (mask)
+                : "memory" , "r0", "r1", "t");
+
+        return retval;
+}
+#endif /* __ASM_SH_BITOPS_GRB_H */
diff --git a/include/asm-sh/bitops-irq.h b/include/asm-sh/bitops-irq.h
new file mode 100644
index 0000000..653a127
--- /dev/null
+++ b/include/asm-sh/bitops-irq.h
@@ -0,0 +1,91 @@
+#ifndef __ASM_SH_BITOPS_IRQ_H
+#define __ASM_SH_BITOPS_IRQ_H
+
+static inline void set_bit(int nr, volatile void *addr)
+{
+	int	mask;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	*a |= mask;
+	local_irq_restore(flags);
+}
+
+static inline void clear_bit(int nr, volatile void *addr)
+{
+	int	mask;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	*a &= ~mask;
+	local_irq_restore(flags);
+}
+
+static inline void change_bit(int nr, volatile void *addr)
+{
+	int	mask;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	*a ^= mask;
+	local_irq_restore(flags);
+}
+
+static inline int test_and_set_bit(int nr, volatile void *addr)
+{
+	int	mask, retval;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	retval = (mask & *a) != 0;
+	*a |= mask;
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+static inline int test_and_clear_bit(int nr, volatile void *addr)
+{
+	int	mask, retval;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	retval = (mask & *a) != 0;
+	*a &= ~mask;
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+static inline int test_and_change_bit(int nr, volatile void *addr)
+{
+	int	mask, retval;
+	volatile unsigned int *a = addr;
+	unsigned long flags;
+
+	a += nr >> 5;
+	mask = 1 << (nr & 0x1f);
+	local_irq_save(flags);
+	retval = (mask & *a) != 0;
+	*a ^= mask;
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+#endif /* __ASM_SH_BITOPS_IRQ_H */
diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h
index a7bd81a..b6ba5a6 100644
--- a/include/asm-sh/bitops.h
+++ b/include/asm-sh/bitops.h
@@ -11,97 +11,18 @@
 /* For __swab32 */
 #include <asm/byteorder.h>
 
-static inline void set_bit(int nr, volatile void * addr)
-{
-	int	mask;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
+#ifdef CONFIG_GUSA_RB
+#include <asm/bitops-grb.h>
+#else
+#include <asm/bitops-irq.h>
+#endif
 
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	*a |= mask;
-	local_irq_restore(flags);
-}
 
 /*
  * clear_bit() doesn't provide any barrier for the compiler.
  */
 #define smp_mb__before_clear_bit()	barrier()
 #define smp_mb__after_clear_bit()	barrier()
-static inline void clear_bit(int nr, volatile void * addr)
-{
-	int	mask;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
-
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	*a &= ~mask;
-	local_irq_restore(flags);
-}
-
-static inline void change_bit(int nr, volatile void * addr)
-{
-	int	mask;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
-
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	*a ^= mask;
-	local_irq_restore(flags);
-}
-
-static inline int test_and_set_bit(int nr, volatile void * addr)
-{
-	int	mask, retval;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
-
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	retval = (mask & *a) != 0;
-	*a |= mask;
-	local_irq_restore(flags);
-
-	return retval;
-}
-
-static inline int test_and_clear_bit(int nr, volatile void * addr)
-{
-	int	mask, retval;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
-
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	retval = (mask & *a) != 0;
-	*a &= ~mask;
-	local_irq_restore(flags);
-
-	return retval;
-}
-
-static inline int test_and_change_bit(int nr, volatile void * addr)
-{
-	int	mask, retval;
-	volatile unsigned int *a = addr;
-	unsigned long flags;
-
-	a += nr >> 5;
-	mask = 1 << (nr & 0x1f);
-	local_irq_save(flags);
-	retval = (mask & *a) != 0;
-	*a ^= mask;
-	local_irq_restore(flags);
-
-	return retval;
-}
 
 #include <asm-generic/bitops/non-atomic.h>
 
diff --git a/include/asm-sh/cmpxchg-grb.h b/include/asm-sh/cmpxchg-grb.h
new file mode 100644
index 0000000..e2681ab
--- /dev/null
+++ b/include/asm-sh/cmpxchg-grb.h
@@ -0,0 +1,70 @@
+#ifndef __ASM_SH_CMPXCHG_GRB_H
+#define __ASM_SH_CMPXCHG_GRB_H
+
+static inline unsigned long xchg_u32(volatile u32 *m, unsigned long val)
+{
+	unsigned long retval;
+
+	__asm__ __volatile__ (
+		"   .align 2              \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   nop                   \n\t"
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-4,   r15     \n\t" /* LOGIN */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   mov.l   %2,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (retval),
+		  "+r"  (m)
+		: "r"   (val)
+		: "memory", "r0", "r1");
+
+	return retval;
+}
+
+static inline unsigned long xchg_u8(volatile u8 *m, unsigned long val)
+{
+	unsigned long retval;
+
+	__asm__ __volatile__ (
+		"   .align  2             \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-6,   r15     \n\t" /* LOGIN */
+		"   mov.b  @%1,   %0      \n\t" /* load  old value */
+		"   extu.b  %0,   %0      \n\t" /* extend as unsigned */
+		"   mov.b   %2,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (retval),
+		  "+r"  (m)
+		: "r"   (val)
+		: "memory" , "r0", "r1");
+
+	return retval;
+}
+
+static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
+					  unsigned long new)
+{
+	unsigned long retval;
+
+	__asm__ __volatile__ (
+		"   .align  2             \n\t"
+		"   mova    1f,   r0      \n\t" /* r0 = end point */
+		"   nop                   \n\t"
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
+		"   mov    #-8,   r15     \n\t" /* LOGIN */
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */
+		"   cmp/eq  %0,   %2      \n\t"
+		"   bf            1f      \n\t" /* if not equal */
+		"   mov.l   %2,   @%1     \n\t" /* store new value */
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */
+		: "=&r" (retval),
+		  "+r"  (m)
+		: "r"   (new)
+		: "memory" , "r0", "r1", "t");
+
+	return retval;
+}
+
+#endif /* __ASM_SH_CMPXCHG_GRB_H */
diff --git a/include/asm-sh/cmpxchg-irq.h b/include/asm-sh/cmpxchg-irq.h
new file mode 100644
index 0000000..43049ec
--- /dev/null
+++ b/include/asm-sh/cmpxchg-irq.h
@@ -0,0 +1,40 @@
+#ifndef __ASM_SH_CMPXCHG_IRQ_H
+#define __ASM_SH_CMPXCHG_IRQ_H
+
+static inline unsigned long xchg_u32(volatile u32 *m, unsigned long val)
+{
+	unsigned long flags, retval;
+
+	local_irq_save(flags);
+	retval = *m;
+	*m = val;
+	local_irq_restore(flags);
+	return retval;
+}
+
+static inline unsigned long xchg_u8(volatile u8 *m, unsigned long val)
+{
+	unsigned long flags, retval;
+
+	local_irq_save(flags);
+	retval = *m;
+	*m = val & 0xff;
+	local_irq_restore(flags);
+	return retval;
+}
+
+static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
+	unsigned long new)
+{
+	__u32 retval;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	retval = *m;
+	if (retval == old)
+		*m = new;
+	local_irq_restore(flags);       /* implies memory barrier  */
+	return retval;
+}
+
+#endif /* __ASM_SH_CMPXCHG_IRQ_H */
diff --git a/include/asm-sh/system.h b/include/asm-sh/system.h
index ad3d2a6..969f3d4 100644
--- a/include/asm-sh/system.h
+++ b/include/asm-sh/system.h
@@ -68,27 +68,11 @@
 
 #define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
-static inline unsigned long xchg_u32(volatile u32 *m, unsigned long val)
-{
-	unsigned long flags, retval;
-
-	local_irq_save(flags);
-	retval = *m;
-	*m = val;
-	local_irq_restore(flags);
-	return retval;
-}
-
-static inline unsigned long xchg_u8(volatile u8 *m, unsigned long val)
-{
-	unsigned long flags, retval;
-
-	local_irq_save(flags);
-	retval = *m;
-	*m = val & 0xff;
-	local_irq_restore(flags);
-	return retval;
-}
+#ifdef CONFIG_GUSA_RB
+#include <asm/cmpxchg-grb.h>
+#else
+#include <asm/cmpxchg-irq.h>
+#endif
 
 extern void __xchg_called_with_bad_pointer(void);
 
@@ -115,20 +99,6 @@
 #define xchg(ptr,x)	\
 	((__typeof__(*(ptr)))__xchg((ptr),(unsigned long)(x), sizeof(*(ptr))))
 
-static inline unsigned long __cmpxchg_u32(volatile int * m, unsigned long old,
-	unsigned long new)
-{
-	__u32 retval;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	retval = *m;
-	if (retval == old)
-		*m = new;
-	local_irq_restore(flags);       /* implies memory barrier  */
-	return retval;
-}
-
 /* This function doesn't exist, so you'll get a linker error
  * if something tries to do an invalid cmpxchg(). */
 extern void __cmpxchg_called_with_bad_pointer(void);