MIPS: Outline udelay and fix a few issues.

Outlining fixes the issue were on certain CPUs such as the R10000 family
the delay loop would need an extra cycle if it overlaps a cacheline
boundary.

The rewrite also fixes build errors with GCC 4.4 which was changed in
way incompatible with the kernel's inline assembly.

Relying on pure C for computation of the delay value removes the need for
explicit.  The price we pay is a slight slowdown of the computation - to
be fixed on another day.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
diff --git a/arch/mips/include/asm/delay.h b/arch/mips/include/asm/delay.h
index b0bccd2..a07e51b 100644
--- a/arch/mips/include/asm/delay.h
+++ b/arch/mips/include/asm/delay.h
@@ -11,94 +11,12 @@
 #ifndef _ASM_DELAY_H
 #define _ASM_DELAY_H
 
-#include <linux/param.h>
-#include <linux/smp.h>
+extern void __delay(unsigned int loops);
+extern void __ndelay(unsigned int ns);
+extern void __udelay(unsigned int us);
 
-#include <asm/compiler.h>
-#include <asm/war.h>
-
-static inline void __delay(unsigned long loops)
-{
-	if (sizeof(long) == 4)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	subu	%0, 1					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops));
-	else if (sizeof(long) == 8 && !DADDI_WAR)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	dsubu	%0, 1					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops));
-	else if (sizeof(long) == 8 && DADDI_WAR)
-		__asm__ __volatile__ (
-		"	.set	noreorder				\n"
-		"	.align	3					\n"
-		"1:	bnez	%0, 1b					\n"
-		"	dsubu	%0, %2					\n"
-		"	.set	reorder					\n"
-		: "=r" (loops)
-		: "0" (loops), "r" (1));
-}
-
-
-/*
- * Division by multiplication: you don't have to worry about
- * loss of precision.
- *
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-
-static inline void __udelay(unsigned long usecs, unsigned long lpj)
-{
-	unsigned long hi, lo;
-
-	/*
-	 * The rates of 128 is rounded wrongly by the catchall case
-	 * for 64-bit.  Excessive precission?  Probably ...
-	 */
-#if defined(CONFIG_64BIT) && (HZ == 128)
-	usecs *= 0x0008637bd05af6c7UL;		/* 2**64 / (1000000 / HZ) */
-#elif defined(CONFIG_64BIT)
-	usecs *= (0x8000000000000000UL / (500000 / HZ));
-#else /* 32-bit junk follows here */
-	usecs *= (unsigned long) (((0x8000000000000000ULL / (500000 / HZ)) +
-	                           0x80000000ULL) >> 32);
-#endif
-
-	if (sizeof(long) == 4)
-		__asm__("multu\t%2, %3"
-		: "=h" (usecs), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-	else if (sizeof(long) == 8 && !R4000_WAR)
-		__asm__("dmultu\t%2, %3"
-		: "=h" (usecs), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-	else if (sizeof(long) == 8 && R4000_WAR)
-		__asm__("dmultu\t%3, %4\n\tmfhi\t%0"
-		: "=r" (usecs), "=h" (hi), "=l" (lo)
-		: "r" (usecs), "r" (lpj)
-		: GCC_REG_ACCUM);
-
-	__delay(usecs);
-}
-
-#define __udelay_val cpu_data[raw_smp_processor_id()].udelay_val
-
-#define udelay(usecs) __udelay((usecs), __udelay_val)
+#define ndelay(ns) __udelay(ns)
+#define udelay(us) __udelay(us)
 
 /* make sure "usecs *= ..." in udelay do not overflow. */
 #if HZ >= 1000