[ARM] 3256/1: Make the function-returning ldm's use sp as the base register

Patch from Catalin Marinas

If the low interrupt latency mode is enabled on the CPU (a configuration
available from ARMv6 onwards), the ldm/stm instructions are no longer
atomic. An ldm instruction that restores both sp and pc can be
interrupted immediately after sp has been updated but before pc has been
loaded. When this happens, the CPU restores the base register to its
value before the ldm instruction, but if the base register is not sp,
the interrupt routine will corrupt the stack and the restarted ldm
instruction will load garbage. With sp as the base register, the
roll-back restores sp itself, so the interrupt routine pushes its frame
below the saved words and they survive intact.
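As an illustration (a sketch, not code taken verbatim from the patch),
compare the two epilogue forms on the full-descending kernel stack:

	ldmea	fp, {fp, sp, pc}	@ old: sp is loaded with the sp
					@ value from function entry, which
					@ sits just above the saved words;
					@ an interrupt taken before pc is
					@ loaded pushes on top of them

	ldmfd	sp, {fp, sp, pc}	@ new: on an interrupted ldm the
					@ base register roll-back restores
					@ sp to the bottom of the saved
					@ frame, so the interrupt routine
					@ pushes below the words that the
					@ restarted ldm will reload

Note that ldmea is the pre-decrement (ldmdb) form while ldmfd is the
post-increment (ldmia) form, so the two expect different base values:
fp points just above the three words being reloaded, while sp points at
the lowest of them.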

Note that future ARM cores might always run in low interrupt latency
mode.
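
The csum_partial_copy* routines keep their save_regs/load_regs pairing,
with the frame set-up moved into save_regs so that load_regs can use sp
as the base register. As a sketch, the macros in
arch/arm/lib/csumpartialcopy.S now expand to the following (the slot
annotations are mine, not part of the patch):

	mov	ip, sp			@ ip = sp on entry
	stmfd	sp!, {r1, r4 - r8, fp, ip, lr, pc}
	sub	fp, ip, #4		@ fp -> saved pc slot (APCS);
					@ sp -> saved r1, the lowest slot

	ldmfd	sp, {r1, r4 - r8, fp, sp, pc}
					@ no writeback: sp is restored
					@ from the saved ip (the sp on
					@ entry) and pc from the saved
					@ lr; the saved pc word is skipped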

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c
index 9299dfc25..1ec3f7f 100644
--- a/arch/arm/kernel/fiq.c
+++ b/arch/arm/kernel/fiq.c
@@ -101,7 +101,7 @@
 	ldmia	%1, {r8 - r14}\n\
 	msr	cpsr_c, %0	@ return to SVC mode\n\
 	mov	r0, r0\n\
-	ldmea	fp, {fp, sp, pc}"
+	ldmfd	sp, {fp, sp, pc}"
 	: "=&r" (tmp)
 	: "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE));
 }
@@ -119,7 +119,7 @@
 	stmia	%1, {r8 - r14}\n\
 	msr	cpsr_c, %0	@ return to SVC mode\n\
 	mov	r0, r0\n\
-	ldmea	fp, {fp, sp, pc}"
+	ldmfd	sp, {fp, sp, pc}"
 	: "=&r" (tmp)
 	: "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE));
 }
diff --git a/arch/arm/lib/csumpartialcopy.S b/arch/arm/lib/csumpartialcopy.S
index 990ee63..21effe0 100644
--- a/arch/arm/lib/csumpartialcopy.S
+++ b/arch/arm/lib/csumpartialcopy.S
@@ -18,11 +18,13 @@
  */
 
 		.macro	save_regs
+		mov	ip, sp
 		stmfd	sp!, {r1, r4 - r8, fp, ip, lr, pc}
+		sub	fp, ip, #4
 		.endm
 
-		.macro	load_regs,flags
-		LOADREGS(\flags,fp,{r1, r4 - r8, fp, sp, pc})
+		.macro	load_regs
+		ldmfd	sp, {r1, r4 - r8, fp, sp, pc}
 		.endm
 
 		.macro	load1b, reg1
diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S
index 4a4609c..c50e8f5 100644
--- a/arch/arm/lib/csumpartialcopygeneric.S
+++ b/arch/arm/lib/csumpartialcopygeneric.S
@@ -23,7 +23,7 @@
 sum	.req	r3
 
 .Lzero:		mov	r0, sum
-		load_regs	ea
+		load_regs
 
 		/*
 		 * Align an unaligned destination pointer.  We know that
@@ -87,9 +87,7 @@
 		b	.Ldone
 
 FN_ENTRY
-		mov	ip, sp
 		save_regs
-		sub	fp, ip, #4
 
 		cmp	len, #8			@ Ensure that we have at least
 		blo	.Lless8			@ 8 bytes to copy.
@@ -163,7 +161,7 @@
 		ldr	sum, [sp, #0]		@ dst
 		tst	sum, #1
 		movne	r0, r0, ror #8
-		load_regs	ea
+		load_regs
 
 .Lsrc_not_aligned:
 		adc	sum, sum, #0		@ include C from dst alignment
diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S
index 333bca2..c3b93e2 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -18,11 +18,13 @@
 		.text
 
 		.macro	save_regs
+		mov	ip, sp
 		stmfd	sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc}
+		sub	fp, ip, #4
 		.endm
 
-		.macro	load_regs,flags
-		ldm\flags	fp, {r1, r2, r4-r8, fp, sp, pc}
+		.macro	load_regs
+		ldmfd	sp, {r1, r2, r4-r8, fp, sp, pc}
 		.endm
 
 		.macro	load1b,	reg1
@@ -100,5 +102,5 @@
 6002:		teq	r2, r1
 		strneb	r0, [r1], #1
 		bne	6002b
-		load_regs	ea
+		load_regs
 		.previous