[ARM] 4111/1: Allow VFP to work with thread migration on SMP

The current lazy saving of the VFP registers is no longer possible
with thread migration on SMP. This patch implements a per-CPU
vfp-state pointer and the saving of the VFP registers at every context
switch. The registers restoring is still performed in a lazy way.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S
index 7b59554..ca2a5ad 100644
--- a/arch/arm/vfp/entry.S
+++ b/arch/arm/vfp/entry.S
@@ -25,6 +25,7 @@
 do_vfp:
 	enable_irq
  	ldr	r4, .LCvfp
+	ldr	r11, [r10, #TI_CPU]	@ CPU number
 	add	r10, r10, #TI_VFPSTATE	@ r10 = workspace
 	ldr	pc, [r4]		@ call VFP entry point
 
diff --git a/arch/arm/vfp/vfp.h b/arch/arm/vfp/vfp.h
index f279789..54a2ad6 100644
--- a/arch/arm/vfp/vfp.h
+++ b/arch/arm/vfp/vfp.h
@@ -370,3 +370,7 @@
 	u32 (* const fn)(int dd, int dn, int dm, u32 fpscr);
 	u32 flags;
 };
+
+#ifdef CONFIG_SMP
+extern void vfp_save_state(void *location, u32 fpexc);
+#endif
diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S
index e51e667..d4b7b22 100644
--- a/arch/arm/vfp/vfphw.S
+++ b/arch/arm/vfp/vfphw.S
@@ -65,6 +65,7 @@
 @  r2  = faulted PC+4
 @  r9  = successful return
 @  r10 = vfp_state union
+@  r11 = CPU number
 @  lr  = failure return
 
 	.globl	vfp_support_entry
@@ -79,7 +80,7 @@
 	DBGSTR1 "enable %x", r10
 	ldr	r3, last_VFP_context_address
 	orr	r1, r1, #FPEXC_ENABLE	@ user FPEXC has the enable bit set
-	ldr	r4, [r3]		@ last_VFP_context pointer
+	ldr	r4, [r3, r11, lsl #2]	@ last_VFP_context pointer
 	bic	r5, r1, #FPEXC_EXCEPTION @ make sure exceptions are disabled
 	cmp	r4, r10
 	beq	check_for_exception	@ we are returning to the same
@@ -91,7 +92,9 @@
 					@ exceptions, so we can get at the
 					@ rest of it
 
+#ifndef CONFIG_SMP
 	@ Save out the current registers to the old thread state
+	@ No need for SMP since this is not done lazily
 
 	DBGSTR1	"save old state %p", r4
 	cmp	r4, #0
@@ -105,10 +108,11 @@
 	stmia	r4, {r1, r5, r6, r8}	@ save FPEXC, FPSCR, FPINST, FPINST2
 					@ and point r4 at the word at the
 					@ start of the register dump
+#endif
 
 no_old_VFP_process:
 	DBGSTR1	"load state %p", r10
-	str	r10, [r3]		@ update the last_VFP_context pointer
+	str	r10, [r3, r11, lsl #2]	@ update the last_VFP_context pointer
 					@ Load the saved state back into the VFP
 	VFPFLDMIA r10	 		@ reload the working registers while
 					@ FPEXC is in a safe state
@@ -162,6 +166,24 @@
 					@ required. If not, the user code will
 					@ retry the faulted instruction
 
+#ifdef CONFIG_SMP
+	.globl	vfp_save_state
+	.type	vfp_save_state, %function
+vfp_save_state:
+	@ Save the current VFP state
+	@ r0 - save location
+	@ r1 - FPEXC
+	DBGSTR1	"save VFP state %p", r0
+	VFPFMRX	r2, FPSCR		@ current status
+	VFPFMRX	r3, FPINST		@ FPINST (always there, rev0 onwards)
+	tst	r1, #FPEXC_FPV2		@ is there an FPINST2 to read?
+	VFPFMRX	r12, FPINST2, NE	@ FPINST2 if needed - avoids reading
+					@ nonexistant reg on rev0
+	VFPFSTMIA r0 			@ save the working registers
+	stmia	r0, {r1, r2, r3, r12}	@ save FPEXC, FPSCR, FPINST, FPINST2
+	mov	pc, lr
+#endif
+
 last_VFP_context_address:
 	.word	last_VFP_context
 
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 490d9d1..f1e5951 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -28,7 +28,7 @@
 void vfp_support_entry(void);
 
 void (*vfp_vector)(void) = vfp_testing_entry;
-union vfp_state *last_VFP_context;
+union vfp_state *last_VFP_context[NR_CPUS];
 
 /*
  * Dual-use variable.
@@ -41,13 +41,35 @@
 {
 	struct thread_info *thread = v;
 	union vfp_state *vfp;
+	__u32 cpu = thread->cpu;
 
 	if (likely(cmd == THREAD_NOTIFY_SWITCH)) {
+		u32 fpexc = fmrx(FPEXC);
+
+#ifdef CONFIG_SMP
+		/*
+		 * On SMP, if VFP is enabled, save the old state in
+		 * case the thread migrates to a different CPU. The
+		 * restoring is done lazily.
+		 */
+		if ((fpexc & FPEXC_ENABLE) && last_VFP_context[cpu]) {
+			vfp_save_state(last_VFP_context[cpu], fpexc);
+			last_VFP_context[cpu]->hard.cpu = cpu;
+		}
+		/*
+		 * Thread migration, just force the reloading of the
+		 * state on the new CPU in case the VFP registers
+		 * contain stale data.
+		 */
+		if (thread->vfpstate.hard.cpu != cpu)
+			last_VFP_context[cpu] = NULL;
+#endif
+
 		/*
 		 * Always disable VFP so we can lazily save/restore the
 		 * old state.
 		 */
-		fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_ENABLE);
+		fmxr(FPEXC, fpexc & ~FPEXC_ENABLE);
 		return NOTIFY_DONE;
 	}
 
@@ -68,8 +90,8 @@
 	}
 
 	/* flush and release case: Per-thread VFP cleanup. */
-	if (last_VFP_context == vfp)
-		last_VFP_context = NULL;
+	if (last_VFP_context[cpu] == vfp)
+		last_VFP_context[cpu] = NULL;
 
 	return NOTIFY_DONE;
 }
diff --git a/include/asm-arm/fpstate.h b/include/asm-arm/fpstate.h
index 6af4e6b..f31cda5 100644
--- a/include/asm-arm/fpstate.h
+++ b/include/asm-arm/fpstate.h
@@ -35,6 +35,9 @@
 	 */
 	__u32 fpinst;
 	__u32 fpinst2;
+#ifdef CONFIG_SMP
+	__u32 cpu;
+#endif
 };
 
 union vfp_state {