[SPARC64]: Optimized TSB table initialization.

We only need to write an invalid tag every 16 bytes,
so taking advantage of this can save many instructions
compared to the simple memset() call we make now.

A prefetching version is implemented for sun4u
and a block-init store version is implemented for Niagara.

The next trick is to be able to perform an init and
a copy_tsb() in parallel when growing a TSB table.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
index 1b154c8..118baea 100644
--- a/arch/sparc64/kernel/tsb.S
+++ b/arch/sparc64/kernel/tsb.S
@@ -371,3 +371,101 @@
 	retl
 	 TSB_MEMBAR
 	.size		copy_tsb, .-copy_tsb
+
+	/* Set the invalid bit in the tag of every TSB entry.
+	 *
+	 * In:  %o0 = TSB vaddr, %o1 = size in bytes.  Each pass of the
+	 *      loop covers 0x100 bytes and the loop only exits when the
+	 *      remaining count is exactly zero, so the size must be a
+	 *      non-zero multiple of 0x100.
+	 *
+	 * One 8-byte tag is stored at the start of every 16-byte entry;
+	 * the other 8 bytes of each entry are not written.
+	 *
+	 * Clobbers %g1, %o0, %o1 and the integer condition codes.
+	 */
+	.align		32
+	.globl		tsb_init
+	.type		tsb_init,#function
+tsb_init:		/* %o0 = TSB vaddr, %o1 = size in bytes */
+	prefetch	[%o0 + 0x000], #n_writes
+	mov		1, %g1
+	prefetch	[%o0 + 0x040], #n_writes
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1	/* %g1 = 1 << TSB_TAG_INVALID_BIT */
+	prefetch	[%o0 + 0x080], #n_writes
+	/* Each prefetch in the loop is 0xc0 bytes ahead of the four
+	 * stores that follow it.
+	 */
+1:	prefetch	[%o0 + 0x0c0], #n_writes
+	stx		%g1, [%o0 + 0x00]
+	stx		%g1, [%o0 + 0x10]
+	stx		%g1, [%o0 + 0x20]
+	stx		%g1, [%o0 + 0x30]
+	prefetch	[%o0 + 0x100], #n_writes
+	stx		%g1, [%o0 + 0x40]
+	stx		%g1, [%o0 + 0x50]
+	stx		%g1, [%o0 + 0x60]
+	stx		%g1, [%o0 + 0x70]
+	prefetch	[%o0 + 0x140], #n_writes
+	stx		%g1, [%o0 + 0x80]
+	stx		%g1, [%o0 + 0x90]
+	stx		%g1, [%o0 + 0xa0]
+	stx		%g1, [%o0 + 0xb0]
+	prefetch	[%o0 + 0x180], #n_writes
+	stx		%g1, [%o0 + 0xc0]
+	stx		%g1, [%o0 + 0xd0]
+	stx		%g1, [%o0 + 0xe0]
+	stx		%g1, [%o0 + 0xf0]
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0		/* delay slot: next 0x100-byte chunk */
+	retl
+	 nop
+	/* NOTE(review): with these two nops the routine is 32
+	 * instructions long; presumably alignment padding -- confirm.
+	 */
+	nop
+	nop
+	.size		tsb_init, .-tsb_init
+
+	/* Niagara version of tsb_init, built on block-init stores;
+	 * paired with tsb_init by NG_DO_PATCH(tsb_init, NGtsb_init)
+	 * in NGbzero.S.
+	 *
+	 * Same arguments and size requirement as tsb_init.  %asi is
+	 * saved in %g2 and restored on return.  Clobbers %g1, %g2,
+	 * %o0, %o1 and the integer condition codes.
+	 */
+	.globl		NGtsb_init
+	.type		NGtsb_init,#function
+NGtsb_init:
+	rd		%asi, %g2
+	mov		1, %g1
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1
+1:	stxa		%g1, [%o0 + 0x00] %asi
+	stxa		%g1, [%o0 + 0x10] %asi
+	stxa		%g1, [%o0 + 0x20] %asi
+	stxa		%g1, [%o0 + 0x30] %asi
+	stxa		%g1, [%o0 + 0x40] %asi
+	stxa		%g1, [%o0 + 0x50] %asi
+	stxa		%g1, [%o0 + 0x60] %asi
+	stxa		%g1, [%o0 + 0x70] %asi
+	stxa		%g1, [%o0 + 0x80] %asi
+	stxa		%g1, [%o0 + 0x90] %asi
+	stxa		%g1, [%o0 + 0xa0] %asi
+	stxa		%g1, [%o0 + 0xb0] %asi
+	stxa		%g1, [%o0 + 0xc0] %asi
+	stxa		%g1, [%o0 + 0xd0] %asi
+	stxa		%g1, [%o0 + 0xe0] %asi
+	stxa		%g1, [%o0 + 0xf0] %asi
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0
+	/* Block-init stores are not ordered like ordinary stores;
+	 * wait for all of them to complete before the caller can
+	 * start using the table.
+	 */
+	membar		#Sync
+	retl
+	 wr		%g2, 0x0, %asi
+	.size		NGtsb_init, .-NGtsb_init
diff --git a/arch/sparc64/lib/NGbzero.S b/arch/sparc64/lib/NGbzero.S
index fef584f..e86baec 100644
--- a/arch/sparc64/lib/NGbzero.S
+++ b/arch/sparc64/lib/NGbzero.S
@@ -157,6 +157,7 @@
 	NG_DO_PATCH(memset, NGmemset)
 	NG_DO_PATCH(__bzero, NGbzero)
 	NG_DO_PATCH(__clear_user, NGclear_user)
+	NG_DO_PATCH(tsb_init, NGtsb_init)
 	retl
 	 nop
 	.size	niagara_patch_bzero,.-niagara_patch_bzero
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index 1af797a..b2064e2 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -313,7 +313,7 @@
 	}
 
 	/* Mark all tags as invalid.  */
-	memset(new_tsb, 0x40, new_size);
+	tsb_init(new_tsb, new_size);
 
 	/* Ok, we are about to commit the changes.  If we are
 	 * growing an existing TSB the locking is very tricky,