[PATCH] sched: add option to serialize load balancing

Large sched domains can be very expensive to scan.  Add an option SD_SERIALIZE
to the sched domain flags.  If that flag is set then we make sure that no
other such domain is being balanced.

[akpm@osdl.org: build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 978d095..ac58580 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -89,6 +89,7 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
+				| SD_SERIALIZE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index a6e3856..22ed674 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -101,6 +101,7 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_FORK	\
+				| SD_SERIALIZE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 64,			\
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 50c0140..6610495f 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -66,6 +66,7 @@
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_WAKE_IDLE		\
+				| SD_SERIALIZE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 5c8f492..2facec5 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -47,6 +47,7 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
+				| SD_SERIALIZE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1208fea..ea92e5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -648,6 +648,7 @@
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
+#define SD_SERIALIZE		1024	/* Only a single load balancing instance */
 
 #define BALANCE_FOR_MC_POWER	\
 	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index b93bb6c..6c5a6e6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -194,7 +194,8 @@
 	.wake_idx		= 0, /* unused */	\
 	.forkexec_idx		= 0, /* unused */	\
 	.per_cpu_gain		= 100,			\
-	.flags			= SD_LOAD_BALANCE,	\
+	.flags			= SD_LOAD_BALANCE	\
+				| SD_SERIALIZE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 64,			\
 	.nr_balance_failed	= 0,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 0a4a26b..2b2b780 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2880,6 +2880,7 @@
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
+static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
@@ -2909,6 +2910,11 @@
 		if (unlikely(!interval))
 			interval = 1;
 
+		if (sd->flags & SD_SERIALIZE) {
+			if (!spin_trylock(&balancing))
+				goto out;
+		}
+
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(this_cpu, this_rq, sd, idle)) {
 				/*
@@ -2920,6 +2926,9 @@
 			}
 			sd->last_balance = jiffies;
 		}
+		if (sd->flags & SD_SERIALIZE)
+			spin_unlock(&balancing);
+out:
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
 	}