[SPARC64]: Fix missing load-twin usage in Niagara-1 memcpy.

For the case where the source is not aligned modulo 8
we don't use load-twins to suck the data in and this
kills performance since normal loads allocate in the
L1 cache (unlike load-twin) and thus big memcpys swipe
the entire L1 D-cache.

We need to allocate a register window to implement this
properly, but that actually simplifies a lot of things
as a nice side-effect.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/lib/NGcopy_to_user.S b/arch/sparc64/lib/NGcopy_to_user.S
index 34112d5..6ea01c5 100644
--- a/arch/sparc64/lib/NGcopy_to_user.S
+++ b/arch/sparc64/lib/NGcopy_to_user.S
@@ -1,6 +1,6 @@
 /* NGcopy_to_user.S: Niagara optimized copy to userspace.
  *
- * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
  */
 
 #define EX_ST(x)		\
@@ -8,8 +8,8 @@
 	.section .fixup;	\
 	.align 4;		\
 99:	wr	%g0, ASI_AIUS, %asi;\
-	retl;			\
-	 mov	1, %o0;		\
+	ret;			\
+	 restore %g0, 1, %o0;	\
 	.section __ex_table,"a";\
 	.align 4;		\
 	.word 98b, 99b;		\
@@ -23,7 +23,7 @@
 #define FUNC_NAME		NGcopy_to_user
 #define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
 #define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
-#define EX_RETVAL(x)		0
+#define EX_RETVAL(x)		%g0
 
 #ifdef __KERNEL__
 	/* Writing to %asi is _expensive_ so we hardcode it.