slub: Support 4k kmallocs again to compensate for page allocator slowness

Currently we hand off PAGE_SIZEd kmallocs to the page allocator in the
mistaken belief that the page allocator can handle these allocations
efficiently. However, measurements indicate a slowdown of at least a
factor of 8 (and that is on SMP only; NUMA is much worse) vs. the slub
fastpath, which causes regressions in tbench.

Increase the number of kmalloc caches by one so that we again handle 4k
kmallocs directly from slub. 4k page buffering for the page allocator
will be performed by slub, as is done by slab.

At some point the page allocator fastpath should be fixed. Much of the
kernel would benefit from faster single-page allocation. Once that is
done, the 4k allocs may again be forwarded to the page allocator and this
patch can be reverted.

Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
diff --git a/mm/slub.c b/mm/slub.c
index 644fd0a..4b3895c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2517,11 +2517,11 @@
  *		Kmalloc subsystem
  *******************************************************************/
 
-struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
 #endif
 
 static int __init setup_slub_min_order(char *str)
@@ -2703,7 +2703,7 @@
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
@@ -2720,7 +2720,7 @@
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
@@ -3032,7 +3032,7 @@
 		caches++;
 	}
 
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
 			"kmalloc", 1 << i, GFP_KERNEL);
 		caches++;
@@ -3059,7 +3059,7 @@
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
 			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
 
@@ -3088,7 +3088,7 @@
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
-	if ((s->flags & __PAGE_ALLOC_FALLBACK)
+	if ((s->flags & __PAGE_ALLOC_FALLBACK))
 		return 1;
 
 	if (s->ctor)
@@ -3252,7 +3252,7 @@
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, gfpflags);
 
 	s = get_slab(size, gfpflags);
@@ -3268,7 +3268,7 @@
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, gfpflags);
 
 	s = get_slab(size, gfpflags);