[PATCH] x86_64: Move int 3 handler to debug stack and allow to increase it.

This
- switches the INT3 handler to run on an IST stack (to cope with
  breakpoints set by a kernel debugger on places where the kernel's
  %gs base hasn't been set up, yet); the IST stack used is shared with
  the INT1 handler's
[AK: this also allows setting a kprobe on the interrupt/exception entry
points]
- allows nesting of INT1/INT3 handlers so that one can, with a kernel
  debugger, debug (at least) the user-mode portions of the INT1/INT3
  handling; the nesting isn't actively enabled here since a kernel-
  debugger-free kernel doesn't need it

Signed-Off-By: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index aaa6d38..00a08d1 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -64,5 +64,9 @@
 	DEFINE(pbe_address, offsetof(struct pbe, address));
 	DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
 	DEFINE(pbe_next, offsetof(struct pbe, next));
+	BLANK();
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+	DEFINE(DEBUG_IST, DEBUG_STACK);
+#endif
 	return 0;
 }
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index bd21eba..28b3c8e 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -673,7 +673,10 @@
 
 	/* error code is on the stack already */
 	/* handle NMI like exceptions that can happen everywhere */
-	.macro paranoidentry sym
+#ifndef DEBUG_IST
+# define DEBUG_IST 0
+#endif
+	.macro paranoidentry sym, ist=0
 	SAVE_ALL
 	cld
 	movl $1,%ebx
@@ -683,10 +686,20 @@
 	js    1f
 	swapgs
 	xorl  %ebx,%ebx
-1:	movq %rsp,%rdi
+1:
+	.if \ist
+	movq	%gs:pda_data_offset, %rbp
+	.endif
+	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi
 	movq $-1,ORIG_RAX(%rsp)
+	.if \ist
+	subq	$EXCEPTION_STACK_SIZE, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+	.endif
 	call \sym
+	.if \ist
+	addq	$EXCEPTION_STACK_SIZE, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+	.endif
 	cli
 	.endm
 	
@@ -904,7 +917,7 @@
  	INTR_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8		
-	paranoidentry do_debug
+	paranoidentry do_debug, DEBUG_IST
 	jmp paranoid_exit
 	CFI_ENDPROC
 	.previous .text
@@ -959,7 +972,12 @@
 	CFI_ENDPROC
 
 KPROBE_ENTRY(int3)
-	zeroentry do_int3	
+ 	INTR_FRAME
+ 	pushq $0
+ 	CFI_ADJUST_CFA_OFFSET 8
+ 	paranoidentry do_int3, DEBUG_IST
+ 	jmp paranoid_exit
+ 	CFI_ENDPROC
 	.previous .text
 
 ENTRY(overflow)
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 7b7131d..39e728c 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -145,7 +145,7 @@
 	pda->irqstackptr += IRQSTACKSIZE-64;
 } 
 
-char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 2) * EXCEPTION_STKSZ + DEBUG_STKSZ]
 __attribute__((section(".bss.page_aligned")));
 
 /* May not be marked __init: used by software suspend */
@@ -236,13 +236,27 @@
 	 */
 	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
 		if (cpu) {
-			estacks = (char *)__get_free_pages(GFP_ATOMIC, 
-						   EXCEPTION_STACK_ORDER);
+			static const unsigned int order[N_EXCEPTION_STACKS] = {
+				[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+				[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+			};
+
+			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
 			if (!estacks)
 				panic("Cannot allocate exception stack %ld %d\n",
 				      v, cpu); 
 		}
-		estacks += EXCEPTION_STKSZ;
+		switch (v + 1) {
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		case DEBUG_STACK:
+			cpu_pda[cpu].debugstack = (unsigned long)estacks;
+			estacks += DEBUG_STKSZ;
+			break;
+#endif
+		default:
+			estacks += EXCEPTION_STKSZ;
+			break;
+		}
 		t->ist[v] = (unsigned long)estacks;
 	}
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index bd71dda..1a9094d 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -121,19 +121,31 @@
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 					unsigned *usedp, const char **idp)
 {
-	static const char ids[N_EXCEPTION_STACKS][8] = {
+	static char ids[][8] = {
 		[DEBUG_STACK - 1] = "#DB",
 		[NMI_STACK - 1] = "NMI",
 		[DOUBLEFAULT_STACK - 1] = "#DF",
 		[STACKFAULT_STACK - 1] = "#SS",
 		[MCE_STACK - 1] = "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		[N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+#endif
 	};
 	unsigned k;
 
 	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
 		unsigned long end;
 
-		end = per_cpu(init_tss, cpu).ist[k];
+		switch (k + 1) {
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		case DEBUG_STACK:
+			end = cpu_pda[cpu].debugstack + DEBUG_STKSZ;
+			break;
+#endif
+		default:
+			end = per_cpu(init_tss, cpu).ist[k];
+			break;
+		}
 		if (stack >= end)
 			continue;
 		if (stack >= end - EXCEPTION_STKSZ) {
@@ -143,6 +155,22 @@
 			*idp = ids[k];
 			return (unsigned long *)end;
 		}
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
+			unsigned j = N_EXCEPTION_STACKS - 1;
+
+			do {
+				++j;
+				end -= EXCEPTION_STKSZ;
+				ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+			} while (stack < end - EXCEPTION_STKSZ);
+			if (*usedp & (1U << j))
+				break;
+			*usedp |= 1U << j;
+			*idp = ids[j];
+			return (unsigned long *)end;
+		}
+#endif
 	}
 	return NULL;
 }
@@ -613,6 +641,7 @@
 		io_check_error(reason, regs);
 }
 
+/* runs on IST stack. */
 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
 {
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
@@ -894,7 +923,7 @@
 	set_intr_gate(0,&divide_error);
 	set_intr_gate_ist(1,&debug,DEBUG_STACK);
 	set_intr_gate_ist(2,&nmi,NMI_STACK);
-	set_system_gate(3,&int3);
+ 	set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
 	set_system_gate(4,&overflow);	/* int4 can be called from all */
 	set_intr_gate(5,&bounds);
 	set_intr_gate(6,&invalid_op);