Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!

diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile
new file mode 100644
index 0000000..b5681e3
--- /dev/null
+++ b/arch/sh/lib/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for SuperH-specific library files.
+#
+
+lib-y  = delay.o memset.o memmove.o memchr.o \
+	 checksum.o strcasecmp.o strlen.o div64.o udivdi3.o \
+	 div64-generic.o
+
+memcpy-y			:= memcpy.o
+memcpy-$(CONFIG_CPU_SH4)	:= memcpy-sh4.o
+
+lib-y	+= $(memcpy-y)
+
diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S
new file mode 100644
index 0000000..7c50dfe
--- /dev/null
+++ b/arch/sh/lib/checksum.S
@@ -0,0 +1,385 @@
+/* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $
+ *
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IP/TCP/UDP checksumming routines
+ *
+ * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Tom May, <ftom@netcom.com>
+ *              Pentium Pro/II routines:
+ *              Alexander Kjeldaas <astor@guardian.no>
+ *              Finn Arne Gangstad <finnag@guardian.no>
+ *		Lots of code moved from tcp.c and ip.c; see those files
+ *		for more names.
+ *
+ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ *			     handling.
+ *		Andi Kleen,  added zeroing on error,
+ *			     converted to pure assembler
+ *
+ * SuperH version:  Copyright (C) 1999  Niibe Yutaka
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/errno.h>
+#include <linux/linkage.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*	
+ * unsigned int csum_partial(const unsigned char *buf, int len,
+ *                           unsigned int sum);
+ */
+
+.text
+ENTRY(csum_partial)
+	  /*
+	   * Experiments with Ethernet and SLIP connections show that buff
+	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
+	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+	   * alignment for the unrolled loop.
+	   */
+	mov	r5, r1
+	mov	r4, r0
+	tst	#2, r0		! Check alignment.
+	bt	2f		! Jump if alignment is ok.
+	!
+	add	#-2, r5		! Alignment uses up two bytes.
+	cmp/pz	r5		!
+	bt/s	1f		! Jump if we had at least two bytes.
+	 clrt
+	bra	6f
+	 add	#2, r5		! r5 was < 2.  Deal with it.
+1:
+	mov	r5, r1		! Save new len for later use.
+	mov.w	@r4+, r0
+	extu.w	r0, r0
+	addc	r0, r6
+	bf	2f
+	add	#1, r6
+2:
+	mov	#-5, r0
+	shld	r0, r5
+	tst	r5, r5
+	bt/s	4f		! if it's =0, go to 4f
+	 clrt
+	.align	2
+3:
+	mov.l	@r4+, r0
+	mov.l	@r4+, r2
+	mov.l	@r4+, r3
+	addc	r0, r6
+	mov.l	@r4+, r0
+	addc	r2, r6
+	mov.l	@r4+, r2
+	addc	r3, r6
+	mov.l	@r4+, r3
+	addc	r0, r6
+	mov.l	@r4+, r0
+	addc	r2, r6
+	mov.l	@r4+, r2
+	addc	r3, r6
+	addc	r0, r6
+	addc	r2, r6
+	movt	r0
+	dt	r5
+	bf/s	3b
+	 cmp/eq	#1, r0
+	! here, we know r5==0
+	addc	r5, r6			! add carry to r6
+4:
+	mov	r1, r0
+	and	#0x1c, r0
+	tst	r0, r0
+	bt/s	6f
+	 mov	r0, r5
+	shlr2	r5
+	mov	#0, r2
+5:
+	addc	r2, r6
+	mov.l	@r4+, r2
+	movt	r0
+	dt	r5
+	bf/s	5b
+	 cmp/eq	#1, r0
+	addc	r2, r6
+	addc	r5, r6		! r5==0 here, so it means add carry-bit
+6:
+	mov	r1, r5
+	mov	#3, r0
+	and	r0, r5
+	tst	r5, r5
+	bt	9f		! if it's =0 go to 9f
+	mov	#2, r1
+	cmp/hs  r1, r5
+	bf	7f
+	mov.w	@r4+, r0
+	extu.w	r0, r0
+	cmp/eq	r1, r5
+	bt/s	8f
+	 clrt
+	shll16	r0
+	addc	r0, r6
+7:
+	mov.b	@r4+, r0
+	extu.b	r0, r0
+#ifndef	__LITTLE_ENDIAN__
+	shll8	r0
+#endif
+8:
+	addc	r0, r6
+	mov	#0, r0
+	addc	r0, r6 
+9:
+	rts
+	 mov	r6, r0
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, 
+					int sum, int *src_err_ptr, int *dst_err_ptr)
+ */ 
+
+/*
+ * Copy from src while checksumming, otherwise like csum_partial.
+ *
+ * The macros SRC and DST specify the type of access for the instruction,
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
+ *	  them all but there's no guarantee.
+ */
+
+#define SRC(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6001f	;	\
+	.previous
+
+#define DST(...)			\
+	9999: __VA_ARGS__ ;		\
+	.section __ex_table, "a";	\
+	.long 9999b, 6002f	;	\
+	.previous
+
+!
+! r4:	const char *SRC
+! r5:	char *DST
+! r6:	int LEN
+! r7:	int SUM
+!
+! on stack:
+! int *SRC_ERR_PTR
+! int *DST_ERR_PTR
+!
+ENTRY(csum_partial_copy_generic)
+	mov.l	r5,@-r15
+	mov.l	r6,@-r15
+
+	mov	#3,r0		! Check src and dest are equally aligned
+	mov	r4,r1
+	and	r0,r1
+	and	r5,r0
+	cmp/eq	r1,r0
+	bf	3f		! Different alignments, use slow version
+	tst	#1,r0		! Check dest word aligned
+	bf	3f		! If not, do it the slow way
+
+	mov	#2,r0
+	tst	r0,r5		! Check dest alignment. 
+	bt	2f		! Jump if alignment is ok.
+	add	#-2,r6		! Alignment uses up two bytes.
+	cmp/pz	r6		! Jump if we had at least two bytes.
+	bt/s	1f
+	 clrt
+	bra	4f
+	 add	#2,r6		! r6 was < 2.	Deal with it.
+
+3:	! Handle different src and dest alignments.
+	! This is not common, so simple byte by byte copy will do.
+	mov	r6,r2
+	shlr	r6
+	tst	r6,r6
+	bt	4f
+	clrt
+	.align	2
+5:
+SRC(	mov.b	@r4+,r1 	)
+SRC(	mov.b	@r4+,r0		)
+	extu.b	r1,r1
+DST(	mov.b	r1,@r5		)
+DST(	mov.b	r0,@(1,r5)	)
+	extu.b	r0,r0
+	add	#2,r5
+
+#ifdef	__LITTLE_ENDIAN__
+	shll8	r0
+#else
+	shll8	r1
+#endif
+	or	r1,r0
+
+	addc	r0,r7
+	movt	r0
+	dt	r6
+	bf/s	5b
+	 cmp/eq	#1,r0
+	mov	#0,r0
+	addc	r0, r7
+
+	mov	r2, r0
+	tst	#1, r0
+	bt	7f
+	bra	5f
+	 clrt
+
+	! src and dest equally aligned, but to a two byte boundary.
+	! Handle first two bytes as a special case
+	.align	2
+1:	
+SRC(	mov.w	@r4+,r0		)
+DST(	mov.w	r0,@r5		)
+	add	#2,r5
+	extu.w	r0,r0
+	addc	r0,r7
+	mov	#0,r0
+	addc	r0,r7
+2:
+	mov	r6,r2
+	mov	#-5,r0
+	shld	r0,r6
+	tst	r6,r6
+	bt/s	2f
+	 clrt
+	.align	2
+1:	
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@r5		)
+DST(	mov.l	r1,@(4,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(8,r5)	)
+DST(	mov.l	r1,@(12,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0 	)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(16,r5)	)
+DST(	mov.l	r1,@(20,r5)	)
+	addc	r1,r7
+
+SRC(	mov.l	@r4+,r0		)
+SRC(	mov.l	@r4+,r1		)
+	addc	r0,r7
+DST(	mov.l	r0,@(24,r5)	)
+DST(	mov.l	r1,@(28,r5)	)
+	addc	r1,r7
+	add	#32,r5
+	movt	r0
+	dt	r6
+	bf/s	1b
+	 cmp/eq	#1,r0
+	mov	#0,r0
+	addc	r0,r7
+
+2:	mov	r2,r6
+	mov	#0x1c,r0
+	and	r0,r6
+	cmp/pl	r6
+	bf/s	4f
+	 clrt
+	shlr2	r6
+3:	
+SRC(	mov.l	@r4+,r0	)
+	addc	r0,r7
+DST(	mov.l	r0,@r5	)
+	add	#4,r5
+	movt	r0
+	dt	r6
+	bf/s	3b
+	 cmp/eq	#1,r0
+	mov	#0,r0
+	addc	r0,r7
+4:	mov	r2,r6
+	mov	#3,r0
+	and	r0,r6
+	cmp/pl	r6
+	bf	7f
+	mov	#2,r1
+	cmp/hs	r1,r6
+	bf	5f
+SRC(	mov.w	@r4+,r0	)
+DST(	mov.w	r0,@r5	)
+	extu.w	r0,r0
+	add	#2,r5
+	cmp/eq	r1,r6
+	bt/s	6f
+	 clrt
+	shll16	r0
+	addc	r0,r7
+5:	
+SRC(	mov.b	@r4+,r0	)
+DST(	mov.b	r0,@r5	)
+	extu.b	r0,r0
+#ifndef	__LITTLE_ENDIAN__
+	shll8	r0
+#endif
+6:	addc	r0,r7
+	mov	#0,r0
+	addc	r0,r7
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"							
+
+6001:
+	mov.l	@(8,r15),r0			! src_err_ptr
+	mov	#-EFAULT,r1
+	mov.l	r1,@r0
+
+	! zero the complete destination - computing the rest
+	! is too much work 
+	mov.l	@(4,r15),r5		! dst
+	mov.l	@r15,r6			! len
+	mov	#0,r7
+1:	mov.b	r7,@r5
+	dt	r6
+	bf/s	1b
+	 add	#1,r5
+	mov.l	8000f,r0
+	jmp	@r0
+	 nop
+	.align	2
+8000:	.long	5000b
+
+6002:
+	mov.l	@(12,r15),r0			! dst_err_ptr
+	mov	#-EFAULT,r1
+	mov.l	r1,@r0
+	mov.l	8001f,r0
+	jmp	@r0
+	 nop
+	.align	2
+8001:	.long	5000b
+
+.previous
+	add	#8,r15
+	rts
+	 mov	r7,r0
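
For reference, a minimal C model of what csum_partial computes follows. This is
a hypothetical sketch, not part of the patch: 16-bit words are folded into a
32-bit accumulator with end-around carry (one's complement addition), shown
byte-at-a-time for a little-endian layout. The assembly earns its speed by
summing 32 bits per addc in an unrolled loop and special-casing alignment.

	static unsigned int csum_partial_ref(const unsigned char *buf, int len,
					     unsigned int sum)
	{
		while (len >= 2) {
			unsigned int w = buf[0] | (buf[1] << 8); /* LE 16-bit load */

			sum += w;
			if (sum < w)		/* carry out: wrap it back in */
				sum++;
			buf += 2;
			len -= 2;
		}
		if (len) {			/* odd trailing byte */
			sum += buf[0];
			if (sum < (unsigned int)buf[0])
				sum++;
		}
		return sum;
	}
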
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
new file mode 100644
index 0000000..50b3603
--- /dev/null
+++ b/arch/sh/lib/delay.c
@@ -0,0 +1,41 @@
+/*
+ *	Precise Delay Loops for SuperH
+ *
+ *	Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+void __delay(unsigned long loops)
+{
+	__asm__ __volatile__(
+		"tst	%0, %0\n\t"
+		"1:\t"
+		"bf/s	1b\n\t"
+		" dt	%0"
+		: "=r" (loops)
+		: "0" (loops)
+		: "t");
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	__asm__("dmulu.l	%0, %2\n\t"
+		"sts	mach, %0"
+		: "=r" (xloops)
+		: "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
+		: "macl", "mach");
+	__delay(xloops * HZ);
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00000005);  /* 2**32 / 1000000000 (rounded up) */
+}
+
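
Numerically, __const_udelay keeps only the upper 32 bits of a 32x32-bit
multiply (dmulu.l leaves them in mach), so the udelay path reduces to
usecs * loops_per_jiffy * HZ / 10^6 with no run-time division: 0x10c6 is
2^32/10^6 rounded down. A hypothetical C restatement (udelay_loops is an
illustrative name, not a kernel symbol):

	static unsigned long udelay_loops(unsigned long usecs,
					  unsigned long loops_per_jiffy,
					  unsigned long hz)
	{
		unsigned long xloops = usecs * 0x000010c6UL; /* usecs * 2^32/10^6 */

		/* high half of the product, as dmulu.l + sts mach do */
		xloops = (unsigned long)(((unsigned long long)xloops
					  * loops_per_jiffy) >> 32);
		return xloops * hz;	/* busy-wait loop count for __delay() */
	}
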
diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c
new file mode 100644
index 0000000..c02473a
--- /dev/null
+++ b/arch/sh/lib/div64-generic.c
@@ -0,0 +1,19 @@
+/*
+ * Generic __div64_32 wrapper for __xdiv64_32.
+ */
+
+#include <linux/types.h>
+
+extern u64 __xdiv64_32(u64 n, u32 d);
+
+u64 __div64_32(u64 *xp, u32 y)
+{
+	u64 rem;
+	u64 q = __xdiv64_32(*xp, y);
+
+	rem = *xp - q * y;
+	*xp = q;
+
+	return rem;
+}
+
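
The wrapper exists so callers get the usual divide-in-place contract (the shape
the kernel's do_div() expects): the dividend is replaced by the quotient and
the remainder is returned. A hypothetical call:

	#include <linux/types.h>

	extern u64 __div64_32(u64 *xp, u32 y);

	static u64 example(void)
	{
		u64 ns = 1000000123ULL;
		u64 rem = __div64_32(&ns, 1000000); /* ns -> 1000, rem -> 123 */

		return rem;
	}
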
diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S
new file mode 100644
index 0000000..eefc275
--- /dev/null
+++ b/arch/sh/lib/div64.S
@@ -0,0 +1,46 @@
+/*	
+ * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d); 
+ */
+
+#include <linux/linkage.h>
+
+.text
+ENTRY(__xdiv64_32)
+#ifdef  __LITTLE_ENDIAN__
+	mov	r4, r0
+	mov	r5, r1
+#else
+	mov	r4, r1
+	mov	r5, r0
+#endif
+	cmp/hs	r6, r1
+	bf.s	1f
+	 mov	#0, r2
+
+	mov	r1, r2
+	mov	#0, r3
+	div0u
+	.rept	32
+	rotcl	r2
+	div1	r6, r3
+	.endr
+	rotcl	r2
+	mul.l	r6, r2
+	sts	macl, r3
+	sub	r3, r1
+1:
+	div0u
+	.rept	32
+	rotcl	r0
+	div1	r6, r1
+	.endr
+#ifdef  __LITTLE_ENDIAN__
+	mov	r2, r1
+	rts
+	 rotcl	r0
+#else
+	rotcl	r0
+	mov	r0, r1
+	rts
+	 mov	r2, r0
+#endif
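
Each .rept 32 block above performs one div1 step per dividend bit. A
hypothetical C model of the same shape (restoring long division, upper half
first; fixed-width unsigned int stands in for SH's 32-bit unsigned long):

	static unsigned long long xdiv64_32_ref(unsigned long long n, unsigned int d)
	{
		unsigned int hi = n >> 32, lo = (unsigned int)n;
		unsigned int q_hi = 0, rem;
		int i;

		if (hi >= d) {			/* quotient needs more than 32 bits */
			q_hi = hi / d;		/* upper quotient word */
			hi = hi % d;		/* remainder flows into the low half */
		}
		rem = hi;
		for (i = 0; i < 32; i++) {	/* one iteration per div1 above */
			unsigned int top = rem >> 31;

			rem = (rem << 1) | (lo >> 31);
			lo <<= 1;
			if (top || rem >= d) {	/* 33-bit compare via the shifted-out bit */
				rem -= d;	/* wraps correctly when top was set */
				lo |= 1;	/* record a quotient bit */
			}
		}
		return ((unsigned long long)q_hi << 32) | lo;
	}
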
diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S
new file mode 100644
index 0000000..bc6036a
--- /dev/null
+++ b/arch/sh/lib/memchr.S
@@ -0,0 +1,26 @@
+/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memchr" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memchr(const void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+ENTRY(memchr)
+	tst	r6,r6
+	bt/s	2f
+	 exts.b	r5,r5
+1:	mov.b	@r4,r1
+	cmp/eq	r1,r5
+	bt/s	3f
+	 dt	r6
+	bf/s	1b
+	 add	#1,r4
+2:	mov	#0,r4
+3:	rts
+	 mov	r4,r0
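
A hypothetical C equivalent of the loop above, for reference; as in the
assembly (exts.b plus the sign-extending mov.b load), both sides of the
comparison hold the same byte value:

	#include <stddef.h>

	static void *memchr_ref(const void *s, int c, size_t n)
	{
		const unsigned char *p = s;

		while (n--) {
			if (*p == (unsigned char)c)
				return (void *)p;
			p++;
		}
		return NULL;
	}
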
diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S
new file mode 100644
index 0000000..55f2274
--- /dev/null
+++ b/arch/sh/lib/memcpy-sh4.S
@@ -0,0 +1,800 @@
+/*
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2002  STMicroelectronics Ltd
+ *   Modified from memcpy.S and micro-optimised for SH4
+ *   Stuart Menefy (stuart.menefy@st.com)
+ *
+ */
+#include <linux/linkage.h>
+#include <linux/config.h>
+
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ *
+ * It is assumed that there is no overlap between src and dst.
+ * If there is an overlap, then the results are undefined.
+ */
+
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase1:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-1,r5		!  79 EX
+	mov	r4,r2		!   5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
+	add	#-4,r5		!  50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll16	r3		! 103 EX
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shll8	r3		! 102 EX		! Oxxx
+
+	shlr8	r6		! 106 EX		! xNML
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! ONML
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
+	mov	r7,r3		!   5 MT (latency=0)	! OPQR
+
+	cmp/hi	r2,r0		!  57 MT
+	shlr16	r3		! 107 EX
+
+	shlr8	r3		! 106 EX		! xxxO
+	mov	r1,r6		!   5 MT (latency=0)
+
+	shll8	r6		! 102 EX		! LMNx
+	mov	r1,r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! LMNO
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#endif
+	! Finally, copy a byte at once, if necessary
+
+	add	#4,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase3:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-3,r5		! 79 EX
+	mov	r4,r2		!  5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
+	add	#-4,r5		! 50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll8	r3		! 102 EX		! QPOx
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shlr16	r6		! 107 EX
+
+	shlr8	r6		! 106 EX		! xxxN
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! QPON
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov	r1,r3		! OPQR
+	shlr8	r3		! xOPQ
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll16	r6
+	shll8	r6		! Nxxx
+	or	r6,r3		! NOPQ
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+
+	! Finally, copy a byte at once, if necessary
+
+	add	#6,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+ENTRY(memcpy)
+
+	! Calculate the invariants which will be used in the remainder
+	! of the code:
+	!
+	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+	!
+	!
+
+	! Short circuit the common case of src, dst and len being 32 bit aligned
+	! and test for zero length move
+
+	mov	r6, r0		!   5 MT (0 cycle latency)
+	or	r4, r0		!  82 EX
+
+	or	r5, r0		!  82 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	99f		! 111 BR		(zero len)
+	 tst	#3, r0		!  87 MT
+
+	mov	r4, r0		!   5 MT (0 cycle latency)
+	add	r6, r0		!  49 EX
+
+	mov	#16, r1		!   6 EX
+	bt/s	.Lcase00	! 111 BR		(aligned)
+
+	 sub	r4, r5		!  75 EX
+
+	! Arguments are not nicely long word aligned or zero len.
+	! Check for small copies, and if so do a simple byte at a time copy.
+	!
+	! Deciding on an exact value of 'small' is not easy, as the point at which
+	! using the optimised routines becomes worthwhile varies (these are the
+	! cycle counts for different sizes using byte-at-a-time vs. optimised):
+	!	size	byte-at-time	long	word	byte
+	!	16	42		39-40	46-50	50-55
+	!	24	58		43-44	54-58	62-67
+	!	36	82		49-50	66-70	80-85
+	! However the penalty for getting it 'wrong' is much higher for long word
+	! aligned data (and this is more common), so use a value of 16.
+
+	cmp/gt	r6,r1		!  56 MT
+
+	add	#-1,r5		!  50 EX
+	bf/s	6f		! 108 BR		(not small)
+
+	 mov	r5, r3		!   5 MT (latency=0)
+	shlr	r6		! 104 EX
+
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+	bf/s	4f		! 111 BR
+
+	 add	#-1,r3		!  50 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	98f		! 110 BR
+	 mov.b	r1,@-r0		!  29 LS
+
+	! 4 cycles, 2 bytes per iteration
+3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.b	r1,@-r0		!  29 LS
+	bf/s	3b		! 111 BR
+
+	 mov.b	r2,@-r0		!  29 LS
+98:
+	rts
+	 nop
+
+99:	rts
+	 mov	r4, r0
+
+	! Size is not small, so it's worthwhile looking for optimisations.
+	! First align destination to a long word boundary.
+	!
+	! r5 = normal value -1
+
+6:	tst	#3, r0		!  87 MT
+        mov	#3, r3		!   6 EX
+
+	bt/s	2f		! 111 BR
+	 and	r0,r3		!  78 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	dt	r3		!  67 EX
+	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
+
+	add	#-1, r6		!  79 EX
+	bf/s	1b		! 109 BR
+
+	 mov.b	r1,@-r0		!  28 LS
+
+2:	add	#1, r5		!  79 EX
+
+	! Now select the appropriate bulk transfer code based on relative
+	! alignment of src and dst.
+
+	mov	r0, r3		!   5 MT (latency=0)
+
+	mov	r5, r0		!   5 MT (latency=0)
+	tst	#1, r0		!  87 MT
+
+	bf/s	1f		! 111 BR
+	 mov	#64, r7		!   6 EX
+
+	! bit 0 clear
+
+	cmp/ge	r7, r6		!  55 MT
+
+	bt/s	2f		! 111 BR
+	 tst	#2, r0		!  87 MT
+
+	! small
+	bt/s	.Lcase0
+	 mov	r3, r0
+
+	bra	.Lcase2
+	 nop
+
+	! big
+2:	bt/s	.Lcase0b
+	 mov	r3, r0
+
+	bra	.Lcase2b
+	 nop
+
+	! bit 0 set
+1:	tst	#2, r0		! 87 MT
+
+	bt/s	.Lcase1
+	 mov	r3, r0
+
+	bra	.Lcase3
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+
+	! src, dst and size are all long word aligned
+	! size is non-zero
+
+	.balign	32
+.Lcase00:
+	mov	#64, r1		!   6 EX
+	mov	r5, r3		!   5 MT (latency=0)
+
+	cmp/gt	r6, r1		!  56 MT
+	add	#-4, r5		!  50 EX
+
+	bf	.Lcase00b	! 108 BR		(big loop)
+	shlr2	r6		! 105 EX
+
+	shlr	r6		! 104 EX
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+	bf/s	4f		! 111 BR
+	 add	#-8, r3		!  50 EX
+
+	tst	r6, r6		!  86 MT
+	bt/s	5f		! 110 BR
+
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+5:	rts
+	 nop
+
+
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+	.balign	32
+.Lcase0:
+	add	#-4, r5		!  50 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	mov	#4, r2		!   6 EX
+
+	add	#11, r7		!  50 EX
+	tst	r2, r6		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r1, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+	add	#3,r5		!  50 EX
+
+	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	.balign	32
+.Lcase0b:
+	add	#-4, r5		!  50 EX
+
+.Lcase00b:
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! copy initial words until cache line aligned
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	tst	#4, r0		!  87 MT
+
+	mov	r5, r6		!   5 MT (latency=0)
+	add	#-4, r6		!  50 EX
+
+	bt/s	4f		! 111 BR
+	 add	#8, r3		!  50 EX
+
+	tst	#0x18, r0	!  87 MT
+
+	bt/s	1f		! 109 BR
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
+	cmp/eq	r3, r0		!  54 MT
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r7, @-r0	!  30 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1c, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	! 16 cycles, 32 bytes per iteration
+2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
+	add	#-0x20, r1	! 50 EX
+	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
+	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
+	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
+	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
+	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
+	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
+	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
+	movca.l	r0,@r1		! 40 LS (latency=3-7)
+	mov.l	r3,@(0x04,r1)	! 33 LS
+	mov.l	r6,@(0x08,r1)	! 33 LS
+	mov.l	r7,@(0x0c,r1)	! 33 LS
+
+	mov.l	r8,@(0x10,r1)	! 33 LS
+	add	#-0x20, r5	! 50 EX
+
+	mov.l	r9,@(0x14,r1)	! 33 LS
+	cmp/eq	r2,r1		! 54 MT
+
+	mov.l	r10,@(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11,@(0x1c,r1)	!  33 LS
+
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+	sub	r4, r1		!  75 EX		(len remaining)
+
+	! number of trailing bytes is non-zero
+	!
+	! invariants restored (r5 already decremented by 4)
+	! also r1=num bytes remaining
+
+	mov	#4, r2		!   6 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	add	#0x1c, r5	!  50 EX		(back to -4)
+	cmp/hs	r2, r1		!  58 MT
+
+	bf/s	5f		! 108 BR
+	 add	 #11, r7	!  50 EX
+
+	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+	tst	r2, r1		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	cmp/hs	r2, r1		!  58 MT
+
+	bt/s	5f		! 111 BR
+	 mov.l	r6,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r6, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+5:	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+	add	#3,r5		!  50 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+
+	.balign	32
+.Lcase2:
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+2:	mov	r5, r6		!   5 MT (latency=0)
+	add	#-2,r5		!  50 EX
+
+	mov	r4,r2		!   5 MT (latency=0)
+	add	#-4,r6		!  50 EX
+
+	add	#7,r2		!  50 EX
+3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+
+	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
+	cmp/hi	r2,r0		!  57 MT
+
+	mov.w	r1,@-r0		!  29 LS
+	bt/s	3b		! 111 BR
+
+	 mov.w	r3,@-r0		!  29 LS
+
+	bra	10f
+	 nop
+
+
+	.balign	32
+.Lcase2b:
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	add	#-2, r5		!  50 EX
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! Copy a short word one at a time until we are cache line aligned
+	!   Normal values: r0, r2, r3, r4
+	!   Unused: r1, r6, r7
+	!   Mod: r5 (=r5-2)
+	!
+	add	#2, r3		!  50 EX
+
+2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+	cmp/eq	r3,r0		!  54 MT
+
+	bf/s	2b		! 111 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5 (=r5-2)
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1e, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	mov.l	r12, @-r15	!  30 LS
+
+	! 17 cycles, 32 bytes per iteration
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
+	add	#-0x20, r1	!  50 EX
+
+	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
+
+	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
+	shll16	r0		! 103 EX			JI..
+
+	mov.l	@r5+, r7	!  15 LS (latency=2)
+	xtrct	r3, r0		!  48 EX			LKJI
+
+	mov.l	@r5+, r8	!  15 LS (latency=2)
+	xtrct	r6, r3		!  48 EX			PONM
+
+	mov.l	@r5+, r9	!  15 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@r5+, r10	!  15 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@r5+, r11	!  15 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.w	@r5+, r12	!  15 LS (latency=2)
+	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r3, @(0x04,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r6, @(0x08,r1)	!  33 LS
+
+	mov.l	r7, @(0x0c,r1)	!  33 LS
+
+	mov.l	r8, @(0x10,r1)	!  33 LS
+	add	#-0x40, r5	!  50 EX
+
+	mov.l	r9, @(0x14,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x1c,r1)	!  33 LS
+#else
+2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
+	add	#-2, r5		!  50 EX
+
+	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
+	add	#-4, r1		!  50 EX
+
+	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
+	shll16	r0		! 103 EX
+
+	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
+	xtrct	r3, r0		!  48 EX
+
+	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
+	xtrct	r6, r3		!  48 EX
+
+	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.w	@(0x02,r5), r12	!  18 LS (latency=2)
+	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	add	#-0x1c, r1	!  50 EX
+
+	mov.l	r3, @(0x1c,r1)	!  33 LS
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r6, @(0x18,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r7, @(0x14,r1)	!  33 LS
+
+	mov.l	r8, @(0x10,r1)	!  33 LS
+	add	#-0x3e, r5	!  50 EX
+
+	mov.l	r9, @(0x0c,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x08,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x04,r1)	!  33 LS
+#endif
+
+	mov.l	@r15+, r12
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+
+	add	#0x1e, r5	!  50 EX
+
+	! Finish off a short word at a time
+	! r5 must be invariant - 2
+10:	mov	r4,r2		!   5 MT (latency=0)
+	add	#1,r2		!  50 EX
+
+	cmp/hi	r2, r0		!  57 MT
+	bf/s	1f		! 109 BR
+
+	 add	#2, r2		!  50 EX
+
+3:	mov.w	@(r0,r5),r1	!  20 LS
+	cmp/hi	r2,r0		!  57 MT
+
+	bt/s	3b		! 109 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+1:
+
+	!
+	! Finally, copy the last byte if necessary
+	cmp/eq	r4,r0		!  54 MT
+	bt/s	9b
+	 add	#1,r5
+	mov.b	@(r0,r5),r1
+	rts
+	 mov.b	r1,@-r0
+
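
The cache-line loop in .Lcase0b is the heart of the SH-4 tuning: movca.l
allocates the destination cache line without first fetching its old contents,
and a full 32-byte line is held in eight registers so that source and
destination lines falling into the same cache set cannot thrash. A hypothetical
C outline of that structure (the movca.l semantics have no portable C
equivalent):

	#include <stddef.h>

	static void copy_lines(unsigned int *dst, const unsigned int *src,
			       size_t lines)		/* 32-byte units */
	{
		while (lines--) {
			/* read the whole line into temporaries first ... */
			unsigned int a = src[0], b = src[1], c = src[2], d = src[3];
			unsigned int e = src[4], f = src[5], g = src[6], h = src[7];

			/* ... then store it; the first store above is movca.l */
			dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
			dst[4] = e; dst[5] = f; dst[6] = g; dst[7] = h;
			src += 8;
			dst += 8;
		}
	}
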
diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S
new file mode 100644
index 0000000..232fab3
--- /dev/null
+++ b/arch/sh/lib/memcpy.S
@@ -0,0 +1,227 @@
+/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $
+ *
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ * No overlap between the memory of DST and that of SRC is assumed.
+ */
+
+#include <linux/linkage.h>
+ENTRY(memcpy)
+	tst	r6,r6
+	bt/s	9f		! if n=0, do nothing
+	 mov	r4,r0
+	sub	r4,r5		! From here, r5 has the distance to r0
+	add	r6,r0		! From here, r0 points to the end of the copy
+	mov	#12,r1
+	cmp/gt	r6,r1
+	bt/s	7f		! if it's too small, copy a byte at once
+	 add	#-1,r5
+	add	#1,r5
+	!			From here, r6 is free
+	!
+	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+	!
+	!
+	mov	r5,r1
+	mov	#3,r2
+	and	r2,r1
+	shll2	r1
+	mov	r0,r3		! Save the value on R0 to R3
+	mova	jmptable,r0
+	add	r1,r0
+	mov.l	@r0,r1
+	jmp	@r1
+	 mov	r3,r0		! and back to R0
+	.balign	4
+jmptable:
+	.long	case0
+	.long	case1
+	.long	case2
+	.long	case3
+
+	! copy a byte at once
+7:	mov	r4,r2
+	add	#1,r2
+8:
+	cmp/hi	r2,r0
+	mov.b	@(r0,r5),r1
+	bt/s	8b			! while (r0>r2)
+	 mov.b	r1,@-r0
+9:
+	rts
+	 nop
+
+case0:
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-4,r5
+	add	#3,r5
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+	add	#-3,r5
+2:	! Second, copy a long word at once
+	mov	r4,r2
+	add	#7,r2
+3:	mov.l	@(r0,r5),r1
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r1,@-r0
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#3,r5
+	bra	8b
+	 add	#-6,r2
+
+case1:
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r5
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+2:	! Second, read a long word and write a long word at once
+	mov.l	@(r0,r5),r1
+	add	#-4,r5
+	mov	r4,r2
+	add	#7,r2
+	!
+#ifdef __LITTLE_ENDIAN__
+3:	mov	r1,r3		! RQPO
+	shll16	r3
+	shll8	r3		! Oxxx
+	mov.l	@(r0,r5),r1	! NMLK
+	mov	r1,r6
+	shlr8	r6		! xNML
+	or	r6,r3		! ONML
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#else
+3:	mov	r1,r3		! OPQR
+	shlr16	r3
+	shlr8	r3		! xxxO
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll8	r6		! LMNx
+	or	r6,r3		! LMNO
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#4,r5
+	bra	8b
+	 add	#-6,r2
+
+case2:
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+	! First, align to word boundary
+	tst	#1,r0
+	bt/s	2f
+	 add	#-1,r5
+	mov.b	@(r0,r5),r1
+	mov.b	r1,@-r0
+	!
+2:	! Second, read a word and write a word at once
+	add	#-1,r5
+	mov	r4,r2
+	add	#3,r2
+	!
+3:	mov.w	@(r0,r5),r1
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.w	r1,@-r0
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#1,r5
+	mov.b	@(r0,r5),r1
+	rts
+	 mov.b	r1,@-r0
+
+case3:
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r5
+1:	dt	r3
+	mov.b	@(r0,r5),r1
+	bf/s	1b
+	 mov.b	r1,@-r0
+	!
+2:	! Second, read a long word and write a long word at once
+	add	#-2,r5
+	mov.l	@(r0,r5),r1
+	add	#-4,r5
+	mov	r4,r2
+	add	#7,r2
+	!
+#ifdef __LITTLE_ENDIAN__
+3:	mov	r1,r3		! RQPO
+	shll8	r3		! QPOx
+	mov.l	@(r0,r5),r1	! NMLK
+	mov	r1,r6
+	shlr16	r6
+	shlr8	r6		! xxxN
+	or	r6,r3		! QPON
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#else
+3:	mov	r1,r3		! OPQR
+	shlr8	r3		! xOPQ
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll16	r6
+	shll8	r6		! Nxxx
+	or	r6,r3		! NOPQ
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r4,r0
+	bt/s	9b
+	 add	#6,r5
+	bra	8b
+	 add	#-6,r2
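
The case1/case3 inner loops read only aligned long words and stitch each
destination word together from two consecutive source words. A hypothetical
forward-direction C sketch of case1 on little-endian (the assembly runs the
same merge from the end of the buffer backwards):

	#include <stddef.h>

	static void copy_off1_le(unsigned int *dst, const unsigned int *src_aligned,
				 size_t words)	/* src = (char *)src_aligned + 1 */
	{
		unsigned int prev = *src_aligned++;

		while (words--) {
			unsigned int cur = *src_aligned++;

			/* drop one byte of prev, take one byte of cur */
			*dst++ = (prev >> 8) | (cur << 24);
			prev = cur;
		}
	}
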
diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S
new file mode 100644
index 0000000..5a2211f
--- /dev/null
+++ b/arch/sh/lib/memmove.S
@@ -0,0 +1,254 @@
+/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $
+ *
+ * "memmove" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ * void *memmove(void *dst, const void *src, size_t n);
+ * The memory areas may overlap.
+ */
+
+#include <linux/linkage.h>
+ENTRY(memmove)
+	! if dest > src, call memcpy (it copies in decreasing order)
+	cmp/hi	r5,r4
+	bf	1f
+	mov.l	2f,r0
+	jmp	@r0
+	 nop
+	.balign 4
+2:	.long	memcpy
+1:
+	sub	r5,r4		! From here, r4 has the distance to r0
+	tst	r6,r6
+	bt/s	9f		! if n=0, do nothing
+	 mov	r5,r0
+	add	r6,r5
+	mov	#12,r1
+	cmp/gt	r6,r1
+	bt/s	8f		! if it's too small, copy a byte at once
+	 add	#-1,r4
+	add	#1,r4
+	!
+	!                [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0+r4-->  [ ...  ]       r0    --> [ ...  ]
+	!	           :                        :
+	!	         [ ...  ]                 [ ...  ]
+	!			        r5    -->
+	!
+	mov	r4,r1
+	mov	#3,r2
+	and	r2,r1
+	shll2	r1
+	mov	r0,r3		! Save the value on R0 to R3
+	mova	jmptable,r0
+	add	r1,r0
+	mov.l	@r0,r1
+	jmp	@r1
+	 mov	r3,r0		! and back to R0
+	.balign	4
+jmptable:
+	.long	case0
+	.long	case1
+	.long	case2
+	.long	case3
+
+	! copy a byte at once
+8:	mov.b	@r0+,r1
+	cmp/hs	r5,r0
+	bf/s	8b			! while (r0<r5)
+	 mov.b	r1,@(r0,r4)
+	add	#1,r4
+9:
+	add	r4,r0
+	rts
+	 sub	r6,r0
+
+case_none:
+	bra	8b
+	 add	#-1,r4
+
+case0:
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4
+	mov	#4,r2
+	sub	r3,r2
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, copy a long word at once
+	add	#-3,r4
+	add	#-3,r5
+3:	mov.l	@r0+,r1
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r1,@(r0,r4)
+	add	#3,r5
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#4,r4
+	bra	8b
+	 add	#-1,r4
+
+case3:
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4
+	mov	#4,r2
+	sub	r3,r2
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a long word and write a long word at once
+	add	#-2,r4
+	mov.l	@(r0,r4),r1
+	add	#-7,r5
+	add	#-4,r4
+	!
+#ifdef __LITTLE_ENDIAN__
+	shll8	r1
+3:	mov	r1,r3		! JIHG
+	shlr8	r3		! xJIH
+	mov.l	@r0+,r1		! NMLK
+	mov	r1,r2
+	shll16	r2
+	shll8	r2		! Kxxx
+	or	r2,r3		! KJIH
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#else
+	shlr8	r1
+3:	mov	r1,r3		! GHIJ
+	shll8	r3		! HIJx
+	mov.l	@r0+,r1		! KLMN
+	mov	r1,r2
+	shlr16	r2
+	shlr8	r2		! xxxK
+	or	r2,r3		! HIJK
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#endif
+	add	#7,r5
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#7,r4
+	add	#-3,r0
+	bra	8b
+	 add	#-1,r4
+
+case2:
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+	! First, align to word boundary
+	tst	#1,r0
+	bt/s	2f
+	 add	#-1,r4
+	mov.b	@r0+,r1
+	mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a word and write a word at once
+	add	#-1,r4
+	add	#-1,r5
+	!
+3:	mov.w	@r0+,r1
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.w	r1,@(r0,r4)
+	add	#1,r5
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#2,r4
+	mov.b	@r0,r1
+	mov.b	r1,@(r0,r4)
+	bra	9b
+	 add	#1,r0
+
+case1:
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+	! First, align to long word boundary
+	mov	r0,r3
+	and	r2,r3
+	tst	r3,r3
+	bt/s	2f
+	 add	#-1,r4
+	mov	#4,r2
+	sub	r3,r2
+1:	dt	r2
+	mov.b	@r0+,r1
+	bf/s	1b
+	 mov.b	r1,@(r0,r4)
+	!
+2:	! Second, read a long word and write a long word at once
+	mov.l	@(r0,r4),r1
+	add	#-7,r5
+	add	#-4,r4
+	!
+#ifdef __LITTLE_ENDIAN__
+	shll16	r1
+	shll8	r1
+3:	mov	r1,r3		! JIHG
+	shlr16	r3
+	shlr8	r3		! xxxJ
+	mov.l	@r0+,r1		! NMLK
+	mov	r1,r2
+	shll8	r2		! MLKx
+	or	r2,r3		! MLKJ
+	cmp/hs	r5,r0
+	bf/s	3b
+	 mov.l	r3,@(r0,r4)
+#else
+	shlr16	r1
+	shlr8	r1
+3:	mov	r1,r3		! GHIJ
+	shll16	r3
+	shll8	r3		! Jxxx
+	mov.l	@r0+,r1		! KLMN
+	mov	r1,r2
+	shlr8	r2		! xKLM
+	or	r2,r3		! JKLM
+	cmp/hs	r5,r0
+	bf/s	3b		! while(r0<r5)
+	 mov.l	r3,@(r0,r4)
+#endif
+	add	#7,r5
+	!
+	! Third, copy a byte at once, if necessary
+	cmp/eq	r5,r0
+	bt/s	9b
+	 add	#5,r4
+	add	#-3,r0
+	bra	8b
+	 add	#-1,r4
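
The dispatch at the top is the whole trick: the SH memcpy above copies from the
end of the buffer downwards, so it is already safe whenever dst is above src,
and only the remaining overlap case needs an ascending copy. A hypothetical C
restatement (byte-by-byte for brevity):

	#include <stddef.h>
	#include <string.h>

	static void *memmove_ref(void *dst, const void *src, size_t n)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		if (d > s)
			return memcpy(dst, src, n);	/* descending copy is safe */
		while (n--)
			*d++ = *s++;			/* ascending copy is safe */
		return dst;
	}
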
diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S
new file mode 100644
index 0000000..9567009
--- /dev/null
+++ b/arch/sh/lib/memset.S
@@ -0,0 +1,57 @@
+/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
+ *
+ * "memset" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ *
+ */
+
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+
+ENTRY(memset)
+	tst	r6,r6
+	bt/s	5f		! if n=0, do nothing
+	 add	r6,r4
+	mov	#12,r0
+	cmp/gt	r6,r0
+	bt/s	4f		! if it's too small, set a byte at once
+	 mov	r4,r0
+	and	#3,r0
+	cmp/eq	#0,r0
+	bt/s	2f		! It's aligned
+	 sub	r0,r6
+1:
+	dt	r0
+	bf/s	1b
+	 mov.b	r5,@-r4
+2:				! make VVVV
+	swap.b	r5,r0		!   V0
+	or	r0,r5		!   VV
+	swap.w	r5,r0		! VV00
+	or	r0,r5		! VVVV
+	!
+	mov	r6,r0
+	shlr2	r0
+	shlr	r0		! r0 = r6 >> 3
+3:
+	dt	r0
+	mov.l	r5,@-r4		! set 8 bytes at once
+	bf/s	3b
+	 mov.l	r5,@-r4
+	!
+	mov	#7,r0
+	and	r0,r6
+	tst	r6,r6
+	bt	5f
+	! fill bytes
+4:
+	dt	r6
+	bf/s	4b
+	 mov.b	r5,@-r4
+5:
+	rts
+	 mov	r4,r0
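
The "make VVVV" block widens the fill byte to a full word so the main loop can
store eight bytes per iteration (two mov.l stores). A hypothetical C equivalent
of the widening step:

	static unsigned int spread_byte(int c)
	{
		unsigned int v = (unsigned char)c;	/* 000V */

		v |= v << 8;				/* 00VV (swap.b + or) */
		v |= v << 16;				/* VVVV (swap.w + or) */
		return v;
	}
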
diff --git a/arch/sh/lib/strcasecmp.c b/arch/sh/lib/strcasecmp.c
new file mode 100644
index 0000000..4e57a21
--- /dev/null
+++ b/arch/sh/lib/strcasecmp.c
@@ -0,0 +1,26 @@
+/*
+ *  linux/arch/alpha/lib/strcasecmp.c
+ */
+
+#include <linux/string.h>
+
+
+/* We handle nothing here except the C locale.  Since this is used in
+   only one place, on strings known to contain only 7 bit ASCII, this
+   is ok.  */
+
+int strcasecmp(const char *a, const char *b)
+{
+	int ca, cb;
+
+	do {
+		ca = *a++ & 0xff;
+		cb = *b++ & 0xff;
+		if (ca >= 'A' && ca <= 'Z')
+			ca += 'a' - 'A';
+		if (cb >= 'A' && cb <= 'Z')
+			cb += 'a' - 'A';
+	} while (ca == cb && ca != '\0');
+
+	return ca - cb;
+}
diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S
new file mode 100644
index 0000000..f8ab296
--- /dev/null
+++ b/arch/sh/lib/strlen.S
@@ -0,0 +1,70 @@
+/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $
+ *
+ * "strlen" implementation of SuperH
+ *
+ * Copyright (C) 1999  Kaz Kojima
+ *
+ */
+
+/* size_t strlen (const char *s)  */
+
+#include <linux/linkage.h>
+ENTRY(strlen)
+	mov	r4,r0
+	and	#3,r0
+	tst	r0,r0
+	bt/s	1f
+	 mov	#0,r2
+
+	add	#-1,r0
+	shll2	r0
+	shll	r0
+	braf	r0
+	 nop
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2
+
+	mov.b	@r4+,r1
+	tst	r1,r1
+	bt	8f
+	add	#1,r2	
+
+1:
+	mov	#0,r3
+2:
+	mov.l	@r4+,r1
+	cmp/str	r3,r1
+	bf/s	2b
+	 add	#4,r2
+
+	add	#-4,r2
+#ifndef __LITTLE_ENDIAN__
+	swap.b	r1,r1
+	swap.w	r1,r1
+	swap.b	r1,r1
+#endif
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt/s	8f
+	 shlr8	r1
+	add	#1,r2
+	extu.b	r1,r0
+	tst	r0,r0
+	bt	8f
+	add	#1,r2
+8:
+	rts
+	 mov	r2,r0
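
The 2: loop relies on cmp/str, which sets T when any corresponding bytes of its
two operands are equal; with r3 = 0 that detects a zero byte anywhere in the
word. A hypothetical C analogue, using the classic bit trick in place of
cmp/str:

	#include <stddef.h>
	#include <stdint.h>

	static int has_zero_byte(unsigned int w)	/* what cmp/str r3,r1 tests */
	{
		return ((w - 0x01010101U) & ~w & 0x80808080U) != 0;
	}

	static size_t strlen_ref(const char *s)
	{
		const char *p = s;

		while ((uintptr_t)p & 3) {		/* byte steps up to alignment */
			if (*p == '\0')
				return p - s;
			p++;
		}
		while (!has_zero_byte(*(const unsigned int *)p))
			p += 4;				/* word steps, as the 2: loop */
		while (*p)				/* locate the byte in the word */
			p++;
		return p - s;
	}
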
diff --git a/arch/sh/lib/udivdi3.c b/arch/sh/lib/udivdi3.c
new file mode 100644
index 0000000..68f038b
--- /dev/null
+++ b/arch/sh/lib/udivdi3.c
@@ -0,0 +1,16 @@
+/*
+ * Simple __udivdi3 function which doesn't use FPU.
+ */
+
+#include <linux/types.h>
+
+extern u64 __xdiv64_32(u64 n, u32 d);
+extern void panic(const char * fmt, ...);
+
+u64 __udivdi3(u64 n, u64 d)
+{
+	if (d & ~0xffffffffULL)		/* divisor must fit in 32 bits */
+		panic("Need true 64-bit/64-bit division");
+	return __xdiv64_32(n, (u32)d);
+}
+
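
gcc lowers any 64-by-64 unsigned division in 32-bit SH code to a call to
__udivdi3, so defining it here keeps such expressions linking. A hypothetical
call site by way of illustration; with the implementation above it panics
unless the divisor fits in 32 bits (as it does here):

	#include <linux/types.h>

	static u64 bytes_to_mib(u64 bytes)
	{
		return bytes / (1024 * 1024); /* becomes __udivdi3(bytes, 1 << 20) */
	}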