[PATCH] sched: disable preempt in idle tasks

Run idle threads with preempt disabled.

Also corrected a bugs in arm26's cpu_idle (make it actually call schedule()).
How did it ever work before?

Might fix the CPU hotplugging hang which Nigel Cunningham noted.

We think the bug hits if the idle thread is preempted after checking
need_resched() and before going to sleep, then the CPU offlined.

After calling stop_machine_run, the CPU eventually returns from preemption and
into the idle thread and goes to sleep.  The CPU will continue executing
previous idle and have no chance to call play_dead.

By disabling preemption until we are ready to explicitly schedule, this bug is
fixed and the idle threads generally become more robust.

From: alexs <ashepard@u.washington.edu>

  PPC build fix

From: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>

  MIPS build fix

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index ba29827..93dd92c 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -116,13 +116,13 @@
 
 		if (!idle)
 			idle = default_idle;
-		preempt_disable();
 		leds_event(led_idle_start);
 		while (!need_resched())
 			idle();
 		leds_event(led_idle_end);
-		preempt_enable();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 77e2e9c..e55ea95 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -256,7 +256,9 @@
 asmlinkage void __cpuinit secondary_start_kernel(void)
 {
 	struct mm_struct *mm = &init_mm;
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu;
+
+	cpu = smp_processor_id();
 
 	printk("CPU%u: Booted secondary processor\n", cpu);
 
@@ -273,6 +275,7 @@
 	local_flush_tlb_all();
 
 	cpu_init();
+	preempt_disable();
 
 	/*
 	 * Give the platform a chance to do its own initialisation.
diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c
index 9eb9964..15833a0 100644
--- a/arch/arm26/kernel/process.c
+++ b/arch/arm26/kernel/process.c
@@ -74,15 +74,13 @@
 void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
-	preempt_disable();
 	while (1) {
-		while (!need_resched()) {
-			local_irq_disable();
-			if (!need_resched() && !hlt_counter)
-				local_irq_enable();
-		}
+		while (!need_resched())
+			cpu_relax();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 	}
-	schedule();
 }
 
 static char reboot_mode = 'h';
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 957f551..13867f4 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -161,6 +161,7 @@
 	REG_WR(intr_vect, irq_regs[cpu], rw_mask, vect_mask);
 	unmask_irq(IPI_INTR_VECT);
 	unmask_irq(TIMER_INTR_VECT);
+	preempt_disable();
 	local_irq_enable();
 
 	cpu_set(cpu, cpu_online_map);
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 949a0e4..7c80afb 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -218,7 +218,9 @@
 				idle = default_idle;
 			idle();
 		}
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 3001b82..54a4521 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -77,16 +77,20 @@
  */
 void cpu_idle(void)
 {
+	int cpu = smp_processor_id();
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
-			irq_stat[smp_processor_id()].idle_timestamp = jiffies;
+			irq_stat[cpu].idle_timestamp = jiffies;
 
 			if (!frv_dma_inprogress && idle)
 				idle();
 		}
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 27f1fce..fe21adf 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -53,22 +53,18 @@
 #if !defined(CONFIG_H8300H_SIM) && !defined(CONFIG_H8S_SIM)
 void default_idle(void)
 {
-	while(1) {
-		if (!need_resched()) {
-			local_irq_enable();
-			__asm__("sleep");
-			local_irq_disable();
-		}
-		schedule();
-	}
+	local_irq_disable();
+	if (!need_resched()) {
+		local_irq_enable();
+		/* XXX: race here! What if need_resched() gets set now? */
+		__asm__("sleep");
+	} else
+		local_irq_enable();
 }
 #else
 void default_idle(void)
 {
-	while(1) {
-		if (need_resched())
-			schedule();
-	}
+	cpu_relax();
 }
 #endif
 void (*idle)(void) = default_idle;
@@ -81,7 +77,13 @@
  */
 void cpu_idle(void)
 {
-	idle();
+	while (1) {
+		while (!need_resched())
+			idle();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
 }
 
 void machine_restart(char * __unused)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 7a14fdf..5296e28 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -179,7 +179,7 @@
  */
 void cpu_idle(void)
 {
-	int cpu = raw_smp_processor_id();
+	int cpu = smp_processor_id();
 
 	/* endless idle loop with no priority at all */
 	while (1) {
@@ -201,7 +201,9 @@
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
 			idle();
 		}
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 47ec76794..bc5a9d9 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -485,6 +485,7 @@
 	 * things done here to the most necessary things.
 	 */
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 051e050..4c621fc 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -292,7 +292,9 @@
 #ifdef CONFIG_SMP
 		normal_xtp();
 #endif
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 		check_pgt_cache();
 		if (cpu_is_offline(smp_processor_id()))
 			play_dead();
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 400a489..8f44e7d 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -399,6 +399,7 @@
 	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
 	efi_map_pal_code();
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 
 	cpu_idle();
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index ea13a8f..cc4b571 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -104,7 +104,9 @@
 
 			idle();
 		}
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 640d592..b90c541 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -426,6 +426,7 @@
 int __init start_secondary(void *unused)
 {
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		cpu_relax();
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 11b1b90..13d10932 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -102,7 +102,9 @@
 	while (1) {
 		while (!need_resched())
 			idle();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 4fe3d57..dd72577 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -52,7 +52,9 @@
 		while (!need_resched())
 			if (cpu_wait)
 				(*cpu_wait)();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index fcacf1a..25472fc 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -82,7 +82,7 @@
  */
 asmlinkage void start_secondary(void)
 {
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu;
 
 	cpu_probe();
 	cpu_report();
@@ -95,6 +95,8 @@
 	 */
 
 	calibrate_delay();
+	preempt_disable();
+	cpu = smp_processor_id();
 	cpu_data[cpu].udelay_val = loops_per_jiffy;
 
 	prom_smp_finish();
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 7fdca87..f482f78 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -92,7 +92,9 @@
 	while (1) {
 		while (!need_resched())
 			barrier();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 		check_pgt_cache();
 	}
 }
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 5db3be4..a9ecf64 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -463,6 +463,7 @@
 #endif
 
 	smp_cpu_init(slave_id);
+	preempt_disable();
 
 #if 0	/* NOT WORKING YET - see entry.S */
 	istack = (void *)__get_free_pages(GFP_KERNEL,ISTACK_ORDER);
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index d3e4bf7..0130f26 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -694,7 +694,9 @@
 		if (hvlpevent_is_pending())
 			process_iSeries_events();
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
@@ -726,7 +728,9 @@
 		}
 
 		ppc64_runlatch_on();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index e78c393..4854f5e 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -539,7 +539,9 @@
 		lpaca->lppaca.idle = 0;
 		ppc64_runlatch_on();
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 
 		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
 			cpu_die();
@@ -583,7 +585,9 @@
 		lpaca->lppaca.idle = 0;
 		ppc64_runlatch_on();
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 
 		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
 			cpu_die();
diff --git a/arch/ppc/kernel/idle.c b/arch/ppc/kernel/idle.c
index 11e5b44..a6141f0 100644
--- a/arch/ppc/kernel/idle.c
+++ b/arch/ppc/kernel/idle.c
@@ -53,10 +53,6 @@
 		}
 #endif
 	}
-	if (need_resched())
-		schedule();
-	if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
-		cpu_die();
 }
 
 /*
@@ -64,11 +60,22 @@
  */
 void cpu_idle(void)
 {
-	for (;;)
+	int cpu = smp_processor_id();
+
+	for (;;) {
 		if (ppc_md.idle != NULL)
 			ppc_md.idle();
 		else
 			default_idle();
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+		if (need_resched()) {
+			preempt_enable_no_resched();
+			schedule();
+			preempt_disable();
+		}
+
+	}
 }
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_6xx)
diff --git a/arch/ppc/kernel/smp.c b/arch/ppc/kernel/smp.c
index bc5bf11..43b8fc2 100644
--- a/arch/ppc/kernel/smp.c
+++ b/arch/ppc/kernel/smp.c
@@ -341,6 +341,7 @@
 	cpu = smp_processor_id();
         smp_store_cpu_info(cpu);
 	set_dec(tb_ticks_per_jiffy);
+	preempt_disable();
 	cpu_callin_map[cpu] = 1;
 
 	printk("CPU %d done callin...\n", cpu);
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index 8fec274..909ea66 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -61,7 +61,9 @@
 		}
 
 		ppc64_runlatch_on();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
 			cpu_die();
 	}
@@ -77,7 +79,9 @@
 
 		if (need_resched()) {
 			ppc64_runlatch_on();
+			preempt_enable_no_resched();
 			schedule();
+			preempt_disable();
 		}
 
 		if (cpu_is_offline(smp_processor_id()) &&
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 9f3dff6..66ca575 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -102,7 +102,6 @@
 	local_irq_disable();
         if (need_resched()) {
 		local_irq_enable();
-                schedule();
                 return;
         }
 
@@ -139,8 +138,14 @@
 
 void cpu_idle(void)
 {
-	for (;;)
-		default_idle();
+	for (;;) {
+		while (!need_resched())
+			default_idle();
+
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
 }
 
 void show_regs(struct pt_regs *regs)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index e13c87b..5856b3f 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -533,6 +533,7 @@
 {
         /* Setup the cpu */
         cpu_init();
+	preempt_disable();
         /* init per CPU timer */
         init_cpu_timer();
 #ifdef CONFIG_VIRT_TIMER
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 6dce9d0..1cbc26b 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -64,7 +64,9 @@
 				cpu_sleep();
 		}
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 5ecefc0..59e49b1 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -112,7 +112,9 @@
 
 int start_secondary(void *unused)
 {
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu;
+
+	cpu = smp_processor_id();
 
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
@@ -120,6 +122,7 @@
 	smp_store_cpu_info(cpu);
 
 	__smp_slave_init(cpu);
+	preempt_disable();
 	per_cpu_trap_init();
 	
 	atomic_inc(&cpus_booted);
diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c
index efde41c..0c09537 100644
--- a/arch/sh64/kernel/process.c
+++ b/arch/sh64/kernel/process.c
@@ -334,7 +334,9 @@
 			}
 			local_irq_enable();
 		}
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index 29e72b5..c39f4d0 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -120,7 +120,9 @@
 			(*pm_idle)();
 		}
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 		check_pgt_cache();
 	}
 }
@@ -133,7 +135,9 @@
 	/* endless idle loop with no priority at all */
 	while(1) {
 		if(need_resched()) {
+			preempt_enable_no_resched();
 			schedule();
+			preempt_disable();
 			check_pgt_cache();
 		}
 		barrier(); /* or else gcc optimizes... */
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 7d10b03..2f89206 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -74,7 +74,9 @@
 		while (!need_resched())
 			barrier();
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 		check_pgt_cache();
 	}
 }
@@ -93,7 +95,9 @@
 		if (need_resched()) {
 			unidle_me();
 			clear_thread_flag(TIF_POLLING_NRFLAG);
+			preempt_enable_no_resched();
 			schedule();
+			preempt_disable();
 			set_thread_flag(TIF_POLLING_NRFLAG);
 			check_pgt_cache();
 		}
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 5d90ee9..8aca4b1 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -168,6 +168,9 @@
 		rmb();
 
 	cpu_set(cpuid, cpu_online_map);
+
+	/* idle thread is expected to have preempt disabled */
+	preempt_disable();
 }
 
 void cpu_panic(void)
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 9c708c3..39cf247 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -36,11 +36,8 @@
 /* The idle loop.  */
 void default_idle (void)
 {
-	while (1) {
-		while (! need_resched ())
-			asm ("halt; nop; nop; nop; nop; nop" ::: "cc");
-		schedule ();
-	}
+	while (! need_resched ())
+		asm ("halt; nop; nop; nop; nop; nop" ::: "cc");
 }
 
 void (*idle)(void) = default_idle;
@@ -54,7 +51,14 @@
 void cpu_idle (void)
 {
 	/* endless idle loop with no priority at all */
-	(*idle) ();
+	while (1) {
+		while (!need_resched())
+			(*idle) ();
+
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
 }
 
 /*
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index b5a89c0..571f9fe 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -204,7 +204,9 @@
 			idle();
 		}
 
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4b5b088..c4e59bb 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -472,6 +472,7 @@
 	 * things done here to the most necessary things.
 	 */
 	cpu_init();
+	preempt_disable();
 	smp_callin();
 
 	/* otherwise gcc will move up the smp_processor_id before the cpu_init */
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 08ef6d8..6a44b54 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -96,8 +96,9 @@
 	while (1) {
 		while (!need_resched())
 			platform_idle();
-		preempt_enable();
+		preempt_enable_no_resched();
 		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/init/main.c b/init/main.c
index f142d40..27f97f9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -394,14 +394,16 @@
 	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
 	unlock_kernel();
-	preempt_enable_no_resched();
 
 	/*
 	 * The boot idle thread must execute schedule()
 	 * at least one to get things moving:
 	 */
+	preempt_enable_no_resched();
 	schedule();
+	preempt_disable();
 
+	/* Call into cpu_idle with preempt disabled */
 	cpu_idle();
 }