[PATCH] Add the vgetcpu vsyscall

This patch adds a vgetcpu vsyscall, which depending on the CPU RDTSCP
capability uses either the RDTSCP or CPUID to obtain a CPU and node
numbers and pass them to the program.

AK: Lots of changes over Vojtech's original code:
Better prototype for vgetcpu()
It's better to pass the cpu / node numbers as separate arguments
to avoid mistakes when going from SMP to NUMA.
Also add a fast time stamp based cache using a user supplied
argument to speed things more up.
Use fast method from Chuck Ebbert to retrieve node/cpu from
GDT limit instead of CPUID
Made sure RDTSCP init is always executed after node is known.
Drop printk

Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c9739ca..505ec4a 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -371,7 +371,7 @@
 	.quad	0,0			/* TSS */
 	.quad	0,0			/* LDT */
 	.quad   0,0,0			/* three TLS descriptors */ 
-	.quad	0			/* unused */
+	.quad	0x0000f40000000000	/* node/CPU stored in limit */
 gdt_end:	
 	/* asm/segment.h:GDT_ENTRIES must match this */	
 	/* This should be a multiple of the cache line size */
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 97b9e46d..560ed94 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -899,12 +899,8 @@
 time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned cpu = (unsigned long) hcpu;
-	if (action == CPU_ONLINE &&
-		cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
-		unsigned p;
-		p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12);
-		write_rdtscp_aux(p);
-	}
+	if (action == CPU_ONLINE)
+		vsyscall_set_cpu(cpu);
 	return NOTIFY_DONE;
 }
 
@@ -993,6 +989,11 @@
 	if (unsynchronized_tsc())
 		notsc = 1;
 
+ 	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+		vgetcpu_mode = VGETCPU_RDTSCP;
+	else
+		vgetcpu_mode = VGETCPU_LSL;
+
 	if (vxtime.hpet_address && notsc) {
 		timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
 		if (hpet_use_timer)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7c4de31..8d5a514 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -99,6 +99,9 @@
   .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
   vxtime = VVIRT(.vxtime);
 
+  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
+  vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
   .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
   wall_jiffies = VVIRT(.wall_jiffies);
 
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index f603037..902783b 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
 #include <linux/seqlock.h>
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
+#include <linux/getcpu.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
@@ -33,11 +34,15 @@
 #include <asm/fixmap.h>
 #include <asm/errno.h>
 #include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+int __vgetcpu_mode __section_vgetcpu_mode;
 
 #include <asm/unistd.h>
 
@@ -127,9 +132,46 @@
 	return __xtime.tv_sec;
 }
 
-long __vsyscall(2) venosys_0(void)
+/* Fast way to get current CPU and node.
+   This helps to do per node and per CPU caches in user space.
+   The result is not guaranteed without CPU affinity, but usually
+   works out because the scheduler tries to keep a thread on the same
+   CPU.
+
+   tcache must point to a two element sized long array.
+   All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-	return -ENOSYS;
+	unsigned int dummy, p;
+	unsigned long j = 0;
+
+	/* Fast cache - only recompute value once per jiffies and avoid
+	   relatively costly rdtscp/cpuid otherwise.
+	   This works because the scheduler usually keeps the process
+	   on the same CPU and this syscall doesn't guarantee its
+	   results anyways.
+	   We do this here because otherwise user space would do it on
+	   its own in a likely inferior way (no access to jiffies).
+	   If you don't like it pass NULL. */
+	if (tcache && tcache->t0 == (j = __jiffies)) {
+		p = tcache->t1;
+	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		rdtscp(dummy, dummy, p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+	if (tcache) {
+		tcache->t0 = j;
+		tcache->t1 = p;
+	}
+	if (cpu)
+		*cpu = p & 0xfff;
+	if (node)
+		*node = p >> 12;
+	return 0;
 }
 
 long __vsyscall(3) venosys_1(void)
@@ -200,6 +242,43 @@
 
 #endif
 
+static void __cpuinit write_rdtscp_cb(void *info)
+{
+	write_rdtscp_aux((unsigned long)info);
+}
+
+void __cpuinit vsyscall_set_cpu(int cpu)
+{
+	unsigned long *d;
+	unsigned long node = 0;
+#ifdef CONFIG_NUMA
+	node = cpu_to_node[cpu];
+#endif
+	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
+		void *info = (void *)((node << 12) | cpu);
+		/* Can happen on preemptive kernel */
+		if (get_cpu() == cpu)
+			write_rdtscp_cb(info);
+#ifdef CONFIG_SMP
+		else {
+			/* the notifier is unfortunately not executed on the
+			   target CPU */
+			smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
+		}
+#endif
+		put_cpu();
+	}
+
+	/* Store cpu number in limit so that it can be loaded quickly
+	   in user space in vgetcpu.
+	   12 bits for the CPU and 8 bits for the node. */
+	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+	*d = 0x0f40000000000ULL;
+	*d |= cpu;
+	*d |= (node & 0xf) << 12;
+	*d |= (node >> 4) << 48;
+}
+
 static void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
@@ -214,6 +293,7 @@
 			VSYSCALL_ADDR(__NR_vgettimeofday)));
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
 	map_vsyscall();
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);