Merge branch 'perf/bench' into perf/core

Merge reason: Looks mergeable - ready it for the merge window.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bb..eef3bbb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -126,4 +126,11 @@
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_HW_BREAKPOINT
+	bool
+	depends on HAVE_PERF_EVENTS
+	select ANON_INODES
+	select PERF_EVENTS
+
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 72ace95..178084b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,6 +49,7 @@
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_HW_BREAKPOINT
 	select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80c..9f828f8 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e39..7a15588 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <asm/debugreg.h>
 
 /*
  * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@
 			>> PAGE_SHIFT;
 	dump->u_dsize -= dump->u_tsize;
 	dump->u_ssize = 0;
-	dump->u_debugreg[0] = current->thread.debugreg0;
-	dump->u_debugreg[1] = current->thread.debugreg1;
-	dump->u_debugreg[2] = current->thread.debugreg2;
-	dump->u_debugreg[3] = current->thread.debugreg3;
-	dump->u_debugreg[4] = 0;
-	dump->u_debugreg[5] = 0;
-	dump->u_debugreg[6] = current->thread.debugreg6;
-	dump->u_debugreg[7] = current->thread.debugreg7;
+	aout_dump_debugregs(dump);
 
 	if (dump->start_stack < TASK_SIZE)
 		dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37..fdabd84 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
 #define DR_TRAP1	(0x2)		/* db1 */
 #define DR_TRAP2	(0x4)		/* db2 */
 #define DR_TRAP3	(0x8)		/* db3 */
+#define DR_TRAP_BITS	(DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
 #define DR_STEP		(0x4000)	/* single-step */
 #define DR_SWITCH	(0x8000)	/* task switch */
@@ -49,6 +50,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
@@ -67,4 +70,34 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+	/* Zero the control register for HW Breakpoint */
+	set_debugreg(0UL, 7);
+
+	/* Zero-out the individual HW breakpoint address registers */
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+	return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif	/* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 0000000..0675a7c
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+#ifndef	_I386_HW_BREAKPOINT_H
+#define	_I386_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#define	__ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be dealt with at a
+ * higher level, when interacting with the user
+ * (display/resolving).
+ */
+struct arch_hw_breakpoint {
+	char		*name; /* Contains name of the symbol to set bkpt */
+	unsigned long	address;
+	u8		len;
+	u8		type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1		0x40
+#define X86_BREAKPOINT_LEN_2		0x44
+#define X86_BREAKPOINT_LEN_4		0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE	0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8		0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE	0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE	0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW	0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+					 struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+					   unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+				  int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif	/* __KERNEL__ */
+#endif	/* _I386_HW_BREAKPOINT_H */
+
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c978648..6f8ec1c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@
 #include <linux/math64.h>
 #include <linux/init.h>
 
+#define HBP_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -422,6 +423,8 @@
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg0;
-	unsigned long		debugreg1;
-	unsigned long		debugreg2;
-	unsigned long		debugreg3;
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
+	/* Save middle states of ptrace breakpoints */
+	struct perf_event	*ptrace_bps[HBP_NUM];
+	/* Debug status used for traps, single steps, etc... */
+	unsigned long           debugreg6;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cd..4f2e66e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e9..1d2cb38 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 0000000..4d267fb
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,549 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, dr7);
+EXPORT_PER_CPU_SYMBOL(dr7);
+
+/* Per cpu debug address register values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpu
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	unsigned long bp_info;
+
+	bp_info = (len | type) & 0xf;
+	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
+				DR_GLOBAL_SLOWDOWN;
+	return bp_info;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (bp_info & 0xc) | 0x40;
+	*type = (bp_info & 0x3) | 0x80;
+
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (!*slot) {
+			*slot = bp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	set_debugreg(info->address, i);
+	__get_cpu_var(cpu_debugreg[i]) = info->address;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (*slot == bp) {
+			*slot = NULL;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 &= ~encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+	unsigned int len_in_bytes = 0;
+
+	switch (hbp_len) {
+	case X86_BREAKPOINT_LEN_1:
+		len_in_bytes = 1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		len_in_bytes = 2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		len_in_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		len_in_bytes = 8;
+		break;
+#endif
+	}
+	return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	/*
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (info->name)
+		info->address = (unsigned long)
+				kallsyms_lookup_name(info->name);
+	if (info->address)
+		return 0;
+
+	return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
+{
+	/* Len */
+	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_1:
+		*gen_len = HW_BREAKPOINT_LEN_1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		*gen_len = HW_BREAKPOINT_LEN_2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		*gen_len = HW_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		*gen_len = HW_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (x86_type) {
+	case X86_BREAKPOINT_EXECUTE:
+		*gen_type = HW_BREAKPOINT_X;
+		break;
+	case X86_BREAKPOINT_WRITE:
+		*gen_type = HW_BREAKPOINT_W;
+		break;
+	case X86_BREAKPOINT_RW:
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	info->address = bp->attr.bp_addr;
+
+	/* Len */
+	switch (bp->attr.bp_len) {
+	case HW_BREAKPOINT_LEN_1:
+		info->len = X86_BREAKPOINT_LEN_1;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		info->len = X86_BREAKPOINT_LEN_2;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		info->len = X86_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		info->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+				  struct task_struct *tsk)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned int align;
+	int ret;
+
+
+	ret = arch_build_bp_info(bp);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+
+	if (info->type == X86_BREAKPOINT_EXECUTE)
+		/*
+		 * Ptrace-refactoring code
+		 * For now, we'll allow instruction breakpoint only for user-space
+		 * addresses
+		 */
+		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+			info->len != X86_BREAKPOINT_EXECUTE)
+			return ret;
+
+	switch (info->len) {
+	case X86_BREAKPOINT_LEN_1:
+		align = 0;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		align = 7;
+		break;
+#endif
+	default:
+		return ret;
+	}
+
+	if (bp->callback)
+		ret = arch_store_info(bp);
+
+	if (ret < 0)
+		return ret;
+	/*
+	 * Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (info->address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(info->address, info->len))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(info->address, info->len))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per-cpu values because they
+ * may contain cpu-wide breakpoints, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+	int i;
+	int dr7 = 0;
+	struct perf_event *bp;
+	struct arch_hw_breakpoint *info;
+	struct thread_struct *thread = &current->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		bp = thread->ptrace_bps[i];
+
+		if (bp && !bp->attr.disabled) {
+			dump->u_debugreg[i] = bp->attr.bp_addr;
+			info = counter_arch_bp(bp);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		} else {
+			dump->u_debugreg[i] = 0;
+		}
+	}
+
+	dump->u_debugreg[4] = 0;
+	dump->u_debugreg[5] = 0;
+	dump->u_debugreg[6] = current->thread.debugreg6;
+
+	dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		unregister_hw_breakpoint(t->ptrace_bps[i]);
+		t->ptrace_bps[i] = NULL;
+	}
+}
+
+void hw_breakpoint_restore(void)
+{
+	set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+	set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+	set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+	set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__get_cpu_var(dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE is returned if one of the following conditions is true:
+ * i) the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) bits other than trap<n> are set in the DR6 register (such as BD,
+ * BS or BT), indicating that more than one debug condition was met and
+ * do_debug() needs to take further action.
+ *
+ * NOTIFY_STOP is returned in all other cases.
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i, cpu, rc = NOTIFY_STOP;
+	struct perf_event *bp;
+	unsigned long dr7, dr6;
+	unsigned long *dr6_p;
+
+	/* The DR6 value is pointed to by args->err */
+	dr6_p = (unsigned long *)ERR_PTR(args->err);
+	dr6 = *dr6_p;
+
+	/* Do an early return if no trap bits are set in DR6 */
+	if ((dr6 & DR_TRAP_BITS) == 0)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+	/*
+	 * Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.debugreg6 &= ~DR_TRAP_BITS;
+	cpu = get_cpu();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HBP_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/*
+		 * The counter may be concurrently released but that can only
+		 * occur from a call_rcu() path. We can then safely fetch
+		 * the breakpoint, use its callback, touch its counter
+		 * while we are in an rcu_read_lock() path.
+		 */
+		rcu_read_lock();
+
+		bp = per_cpu(bp_per_reg[i], cpu);
+		if (bp)
+			rc = NOTIFY_DONE;
+		/*
+		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
+		 * exception handling
+		 */
+		(*dr6_p) &= ~(DR_TRAP0 << i);
+		/*
+		 * bp can be NULL due to lazy debug register switching
+		 * or due to concurrent perf counter removing.
+		 */
+		if (!bp) {
+			rcu_read_unlock();
+			break;
+		}
+
+		(bp->callback)(bp, args->regs);
+
+		rcu_read_unlock();
+	}
+	if (dr6 & (~DR_TRAP_BITS))
+		rc = NOTIFY_DONE;
+
+	set_debugreg(dr7, 7);
+	put_cpu();
+
+	return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+
+	return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+	/* TODO */
+}
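
For reference, the dr7 encoding used by encode_dr7()/decode_dr7() above can be
checked in isolation. The following user-space sketch is not part of the patch;
it mirrors the same constants and round-trips a 4-byte write watchpoint in
debug address register 0 (the expected value 0xd0202 is a worked example, not
something the kernel prints):

#include <stdio.h>

/* Constants mirrored from asm/debugreg.h and asm/hw_breakpoint.h */
#define DR_CONTROL_SHIFT	16
#define DR_CONTROL_SIZE		4
#define DR_ENABLE_SIZE		2
#define DR_GLOBAL_ENABLE	0x2
#define DR_GLOBAL_SLOWDOWN	0x200

#define X86_BREAKPOINT_LEN_4	0x4c
#define X86_BREAKPOINT_WRITE	0x81

/* Same bit manipulation as encode_dr7() in the patch */
static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
{
	unsigned long bp_info = (len | type) & 0xf;

	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
	return bp_info | (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
	       DR_GLOBAL_SLOWDOWN;
}

/* Same bit manipulation as decode_dr7() in the patch */
static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
{
	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);

	*len = (bp_info & 0xc) | 0x40;
	*type = (bp_info & 0x3) | 0x80;

	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
}

int main(void)
{
	unsigned len, type;
	/* 4-byte write watchpoint in debug address register 0 */
	unsigned long dr7 = encode_dr7(0, X86_BREAKPOINT_LEN_4, X86_BREAKPOINT_WRITE);

	printf("dr7     = %#lx\n", dr7);			/* 0xd0202 */
	printf("enabled = %d\n",
	       decode_dr7(dr7, 0, &len, &type));		/* 2 == global enable */
	printf("len     = %#x, type = %#x\n", len, type);	/* 0x4c / 0x81 again */
	return 0;
}
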
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77..34e86b6 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 
+#include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
 
@@ -434,6 +435,11 @@
 			"resuming...\n");
 	kgdb_arch_handle_exception(args->trapnr, args->signr,
 				   args->err, "c", "", regs);
+	/*
+	 * Reset the BS bit in dr6 (pointed to by args->err) to
+	 * denote completion of processing
+	 */
+	(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 
 	return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c5f1f11..3fe86d7 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -56,6 +56,7 @@
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/insn.h>
+#include <asm/debugreg.h>
 
 void jprobe_return_end(void);
 
@@ -945,8 +946,14 @@
 			ret = NOTIFY_STOP;
 		break;
 	case DIE_DEBUG:
-		if (post_kprobe_handler(args->regs))
+		if (post_kprobe_handler(args->regs)) {
+			/*
+			 * Reset the BS bit in dr6 (pointed to by args->err) to
+			 * denote completion of processing
+			 */
+			(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 			ret = NOTIFY_STOP;
+		}
 		break;
 	case DIE_GPF:
 		/*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d..c843f84 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf2..4a8bb82 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
 				unsigned long addr)
@@ -282,6 +283,7 @@
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
+	hw_breakpoint_disable();
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..744508e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -17,6 +18,7 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@
 	}
 #endif
 
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
+	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
@@ -192,16 +187,6 @@
 	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf7956..d5bd313 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -259,7 +260,12 @@
 
 	task_user_gs(p) = get_user_gs(regs);
 
+	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
+	err = -ENOMEM;
+
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbc..70cf158 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -297,12 +298,16 @@
 
 	p->thread.fs = me->thread.fs;
 	p->thread.gs = me->thread.gs;
+	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
 	savesegment(fs, p->thread.fsindex);
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	err = -ENOMEM;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+
 	return err;
 }
 
@@ -495,6 +501,7 @@
 	 */
 	if (preload_fpu)
 		__math_state_restore();
+
 	return prev_p;
 }
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4f76d2..b25f894 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -34,6 +36,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -249,11 +252,6 @@
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-	return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -378,15 +376,6 @@
 	return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
-		return IA32_PAGE_OFFSET - 3;
-#endif
-	return TASK_SIZE_MAX - 7;
-}
-
 #endif	/* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -566,96 +555,226 @@
 	return ret;
 }
 
-/*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
- */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static void ptrace_triggered(struct perf_event *bp, void *data)
 {
-	switch (n) {
-	case 0:		return child->thread.debugreg0;
-	case 1:		return child->thread.debugreg1;
-	case 2:		return child->thread.debugreg2;
-	case 3:		return child->thread.debugreg3;
-	case 6:		return child->thread.debugreg6;
-	case 7:		return child->thread.debugreg7;
+	int i;
+	struct thread_struct *thread = &(current->thread);
+
+	/*
+	 * Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->ptrace_bps[i] == bp)
+			break;
 	}
+
+	thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
+/*
+ * Walk through every ptrace breakpoint for this thread and
+ * build the dr7 value from their attributes.
+ *
+ */
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
+{
+	int i;
+	int dr7 = 0;
+	struct arch_hw_breakpoint *info;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (bp[i] && !bp[i]->attr.disabled) {
+			info = counter_arch_bp(bp[i]);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		}
+	}
+
+	return dr7;
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long old_dr7;
+	int i, orig_ret = 0, rc = 0;
+	int enabled, second_pass = 0;
+	unsigned len, type;
+	int gen_len, gen_type;
+	struct perf_event *bp;
+
+	data &= ~DR_CONTROL_RESERVED;
+	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+	/*
+	 * Loop through all the hardware breakpoints, making the
+	 * appropriate changes to each.
+	 */
+	for (i = 0; i < HBP_NUM; i++) {
+		enabled = decode_dr7(data, i, &len, &type);
+		bp = thread->ptrace_bps[i];
+
+		if (!enabled) {
+			if (bp) {
+				/*
+				 * Don't unregister the breakpoints right-away,
+				 * unless all register_user_hw_breakpoint()
+				 * requests have succeeded. This prevents
+				 * any window of opportunity for debug
+				 * register grabbing by other users.
+				 */
+				if (!second_pass)
+					continue;
+				thread->ptrace_bps[i] = NULL;
+				unregister_hw_breakpoint(bp);
+			}
+			continue;
+		}
+
+		/*
+		 * We should have at least an inactive breakpoint at this
+		 * slot; otherwise the user is writing dr7 without having
+		 * written the address register first.
+		 */
+		if (!bp) {
+			rc = -EINVAL;
+			break;
+		}
+
+		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+		if (rc)
+			break;
+
+		/*
+		 * This is a temporary thing as bp is unregistered/registered
+		 * to simulate modification
+		 */
+		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
+					       gen_type, bp->callback,
+					       tsk, true);
+		thread->ptrace_bps[i] = NULL;
+
+		if (!bp) { /* incorrect bp, or we have a bug in bp API */
+			rc = -EINVAL;
+			break;
+		}
+		if (IS_ERR(bp)) {
+			rc = PTR_ERR(bp);
+			bp = NULL;
+			break;
+		}
+		thread->ptrace_bps[i] = bp;
+	}
+	/*
+	 * Make a second pass to free the remaining unused breakpoints
+	 * or to restore the original breakpoints if an error occurred.
+	 */
+	if (!second_pass) {
+		second_pass = 1;
+		if (rc < 0) {
+			orig_ret = rc;
+			data = old_dr7;
+		}
+		goto restore;
+	}
+	return ((orig_ret < 0) ? orig_ret : rc);
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+	struct thread_struct *thread = &(tsk->thread);
+	unsigned long val = 0;
+
+	if (n < HBP_NUM) {
+		struct perf_event *bp;
+		bp = thread->ptrace_bps[n];
+		if (!bp)
+			return 0;
+		val = bp->hw.info.address;
+	} else if (n == 6) {
+		val = thread->debugreg6;
+	} else if (n == 7) {
+		val = ptrace_get_dr7(thread->ptrace_bps);
+	}
+	return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+				      unsigned long addr)
+{
+	struct perf_event *bp;
+	struct thread_struct *t = &tsk->thread;
+
+	if (!t->ptrace_bps[nr]) {
+		/*
+		 * Put stub len and type to register (reserve) an inactive but
+		 * correct bp
+		 */
+		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
+						 HW_BREAKPOINT_W,
+						 ptrace_triggered, tsk,
+						 false);
+	} else {
+		bp = t->ptrace_bps[nr];
+		t->ptrace_bps[nr] = NULL;
+		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
+					       bp->attr.bp_type,
+					       bp->callback,
+					       tsk,
+					       bp->attr.disabled);
+	}
+
+	if (!bp)
+		return -EIO;
+	/*
+	 * CHECKME: the previous code returned -EIO if the addr wasn't a
+	 * valid task virtual addr. The new one will return -EINVAL in this
+	 * case.
+	 * -EINVAL may be what we want for in-kernel breakpoint users, but
+	 * -EIO looks better for ptrace, since we refuse a register write
+	 * for the user. And anyway this was the previous behaviour.
+	 */
+	if (IS_ERR(bp))
+		return PTR_ERR(bp);
+
+	t->ptrace_bps[nr] = bp;
+
 	return 0;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-			       int n, unsigned long data)
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
 {
-	int i;
+	struct thread_struct *thread = &(tsk->thread);
+	int rc = 0;
 
-	if (unlikely(n == 4 || n == 5))
+	/* There are no DR4 or DR5 registers */
+	if (n == 4 || n == 5)
 		return -EIO;
 
-	if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-		return -EIO;
-
-	switch (n) {
-	case 0:		child->thread.debugreg0 = data; break;
-	case 1:		child->thread.debugreg1 = data; break;
-	case 2:		child->thread.debugreg2 = data; break;
-	case 3:		child->thread.debugreg3 = data; break;
-
-	case 6:
-		if ((data & ~0xffffffffUL) != 0)
-			return -EIO;
-		child->thread.debugreg6 = data;
-		break;
-
-	case 7:
-		/*
-		 * Sanity-check data. Take one half-byte at once with
-		 * check = (val >> (16 + 4*i)) & 0xf. It contains the
-		 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-		 * 2 and 3 are LENi. Given a list of invalid values,
-		 * we do mask |= 1 << invalid_value, so that
-		 * (mask >> check) & 1 is a correct test for invalid
-		 * values.
-		 *
-		 * R/Wi contains the type of the breakpoint /
-		 * watchpoint, LENi contains the length of the watched
-		 * data in the watchpoint case.
-		 *
-		 * The invalid values are:
-		 * - LENi == 0x10 (undefined), so mask |= 0x0f00.	[32-bit]
-		 * - R/Wi == 0x10 (break on I/O reads or writes), so
-		 *   mask |= 0x4444.
-		 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-		 *   0x1110.
-		 *
-		 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-		 *
-		 * See the Intel Manual "System Programming Guide",
-		 * 15.2.4
-		 *
-		 * Note that LENi == 0x10 is defined on x86_64 in long
-		 * mode (i.e. even for 32-bit userspace software, but
-		 * 64-bit kernel), so the x86_64 mask value is 0x5454.
-		 * See the AMD manual no. 24593 (AMD64 System Programming)
-		 */
-#ifdef CONFIG_X86_32
-#define	DR7_MASK	0x5f54
-#else
-#define	DR7_MASK	0x5554
-#endif
-		data &= ~DR_CONTROL_RESERVED;
-		for (i = 0; i < 4; i++)
-			if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-				return -EIO;
-		child->thread.debugreg7 = data;
-		if (data)
-			set_tsk_thread_flag(child, TIF_DEBUG);
-		else
-			clear_tsk_thread_flag(child, TIF_DEBUG);
-		break;
+	if (n == 6) {
+		thread->debugreg6 = val;
+		goto ret_path;
 	}
+	if (n < HBP_NUM) {
+		rc = ptrace_set_breakpoint_addr(tsk, n, val);
+		if (rc)
+			return rc;
+	}
+	/* All that's left is DR7 */
+	if (n == 7)
+		rc = ptrace_write_dr7(tsk, val);
 
-	return 0;
+ret_path:
+	return rc;
 }
 
 /*
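
The user-visible ptrace semantics are meant to stay the same: a debugger writes
the address slot first and only then enables it in dr7 (the rewritten
ptrace_write_dr7() returns -EINVAL when dr7 enables a slot whose address
register was never written). A rough user-space illustration of that ordering,
using standard ptrace debug-register pokes rather than anything introduced by
this patch:

#include <stddef.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>

/*
 * Illustrative fragment: set a 4-byte write watchpoint on 'addr' in an
 * already-attached, stopped tracee.  DR0 is written before DR7;
 * 0xd0001 = local enable for slot 0, R/W0 = write, LEN0 = 4 bytes.
 */
static int set_write_watchpoint(pid_t pid, unsigned long addr)
{
	if (ptrace(PTRACE_POKEUSER, pid,
		   offsetof(struct user, u_debugreg[0]), addr) < 0)
		return -1;

	if (ptrace(PTRACE_POKEUSER, pid,
		   offsetof(struct user, u_debugreg[7]), 0xd0001UL) < 0)
		return -1;

	return 0;
}
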
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76..fbf3b07c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,15 +799,6 @@
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/*
-		 * Re-enable any watchpoints before delivering the
-		 * signal to user space. The processor register will
-		 * have been cleared if the watchpoint triggered
-		 * inside the kernel.
-		 */
-		if (current->thread.debugreg7)
-			set_debugreg(current->thread.debugreg7, 7);
-
 		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dce..3339917 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned long condition;
+	unsigned long dr6;
 	int si_code;
 
-	get_debugreg(condition, 6);
+	get_debugreg(dr6, 6);
 
 	/* Catch kmemcheck conditions first of all! */
-	if (condition & DR_STEP && kmemcheck_trap(regs))
+	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
 		return;
 
+	/* DR6 may or may not be cleared by the CPU */
+	set_debugreg(0, 6);
 	/*
 	 * The processor cleared BTF, so don't mark that we need it set.
 	 */
 	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
 	tsk->thread.debugctlmsr = 0;
 
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-						SIGTRAP) == NOTIFY_STOP)
+	/* Store the virtualized DR6 value */
+	tsk->thread.debugreg6 = dr6;
+
+	if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+							SIGTRAP) == NOTIFY_STOP)
 		return;
 
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
+	if (regs->flags & X86_VM_MASK) {
+		handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				error_code, 1);
+		return;
 	}
 
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK)
-		goto debug_vm86;
-#endif
-
-	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg6 = condition;
-
 	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
+	 * Single-stepping through system calls: ignore any exceptions in
+	 * kernel space, but re-enable TF when returning to user mode.
+	 *
+	 * We already checked v86 mode above, so we can check for kernel mode
+	 * by just checking the CPL of CS.
 	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+		tsk->thread.debugreg6 &= ~DR_STEP;
+		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+		regs->flags &= ~X86_EFLAGS_TF;
 	}
-
-	si_code = get_si_code(condition);
-	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code, si_code);
-
-	/*
-	 * Disable additional traps. They'll be re-enabled when
-	 * the signal is delivered.
-	 */
-clear_dr7:
-	set_debugreg(0, 7);
+	si_code = get_si_code(tsk->thread.debugreg6);
+	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
-	return;
 
-#ifdef CONFIG_X86_32
-debug_vm86:
-	/* reenable preemption: handle_vm86_trap() might sleep */
-	dec_preempt_count();
-	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	conditional_cli(regs);
-	return;
-#endif
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
-	preempt_conditional_cli(regs);
 	return;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d26..4fc8017 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
+	/*
+	 * If the guest has used debug registers, at least dr7
+	 * will be disabled while returning to the host.
+	 * If we don't have active breakpoints in the host, we don't
+	 * care about the messed up debug address registers. But if
+	 * we have some of them active, restore the old state.
+	 */
+	if (hw_breakpoint_active())
+		hw_breakpoint_restore();
 
 	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd7..11a4ad4 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -540,8 +540,14 @@
 	struct die_args *arg = args;
 
 	if (val == DIE_DEBUG && (arg->err & DR_STEP))
-		if (post_kmmio_handler(arg->err, arg->regs) == 1)
+		if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+			/*
+			 * Reset the BS bit in dr6 (pointed to by args->err) to
+			 * denote completion of processing
+			 */
+			(*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
 			return NOTIFY_STOP;
+		}
 
 	return NOTIFY_DONE;
 }
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f1..0a979f3 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
 #include <asm/mce.h>
 #include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/debugreg.h>
 
 #ifdef CONFIG_X86_32
 static struct saved_context saved_context;
@@ -142,31 +143,6 @@
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
-
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
-		set_debugreg(current->thread.debugreg0, 0);
-		set_debugreg(current->thread.debugreg1, 1);
-		set_debugreg(current->thread.debugreg2, 2);
-		set_debugreg(current->thread.debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-#else
-		/* CONFIG_X86_64 */
-		loaddebug(&current->thread, 0);
-		loaddebug(&current->thread, 1);
-		loaddebug(&current->thread, 2);
-		loaddebug(&current->thread, 3);
-		/* no 4 and 5 */
-		loaddebug(&current->thread, 6);
-		loaddebug(&current->thread, 7);
-#endif
-	}
-
 }
 
 /**
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index af75e07..d8214dc 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -114,6 +114,7 @@
 	unsigned char insn_buf[16];
 	struct insn insn;
 	int insns = 0, c;
+	int warnings = 0;
 
 	parse_args(argc, argv);
 
@@ -151,18 +152,22 @@
 		insn_init(&insn, insn_buf, x86_64);
 		insn_get_length(&insn);
 		if (insn.length != nb) {
-			fprintf(stderr, "Error: %s found a difference at %s\n",
+			warnings++;
+			fprintf(stderr, "Warning: %s found a difference at %s\n",
 				prog, sym);
-			fprintf(stderr, "Error: %s", line);
-			fprintf(stderr, "Error: objdump says %d bytes, but "
+			fprintf(stderr, "Warning: %s", line);
+			fprintf(stderr, "Warning: objdump says %d bytes, but "
 				"insn_get_length() says %d\n", nb,
 				insn.length);
 			if (verbose)
 				dump_insn(stderr, &insn);
-			exit(2);
 		}
 	}
-	fprintf(stderr, "Succeed: decoded and checked %d instructions\n",
-		insns);
+	if (warnings)
+		fprintf(stderr, "Warning: decoded and checked %d"
+			" instructions with %d warnings\n", insns, warnings);
+	else
+		fprintf(stderr, "Success: decoded and checked %d"
+			" instructions\n", insns);
 	return 0;
 }
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 43360c1..47bbdf9 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,13 +137,8 @@
 
 #define FTRACE_MAX_PROFILE_SIZE	2048
 
-struct perf_trace_buf {
-	char	buf[FTRACE_MAX_PROFILE_SIZE];
-	int	recursion;
-};
-
-extern struct perf_trace_buf	*perf_trace_buf;
-extern struct perf_trace_buf	*perf_trace_buf_nmi;
+extern char *perf_trace_buf;
+extern char *perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
new file mode 100644
index 0000000..c9f7f7c
--- /dev/null
+++ b/include/linux/hw_breakpoint.h
@@ -0,0 +1,140 @@
+#ifndef _LINUX_HW_BREAKPOINT_H
+#define _LINUX_HW_BREAKPOINT_H
+
+enum {
+	HW_BREAKPOINT_LEN_1 = 1,
+	HW_BREAKPOINT_LEN_2 = 2,
+	HW_BREAKPOINT_LEN_4 = 4,
+	HW_BREAKPOINT_LEN_8 = 8,
+};
+
+enum {
+	HW_BREAKPOINT_R = 1,
+	HW_BREAKPOINT_W = 2,
+	HW_BREAKPOINT_X = 4,
+};
+
+#ifdef __KERNEL__
+
+#include <linux/perf_event.h>
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
+static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
+{
+	return bp->attr.bp_addr;
+}
+
+static inline int hw_breakpoint_type(struct perf_event *bp)
+{
+	return bp->attr.bp_type;
+}
+
+static inline int hw_breakpoint_len(struct perf_event *bp)
+{
+	return bp->attr.bp_len;
+}
+
+extern struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active);
+
+/* FIXME: only change from the attr, and don't unregister */
+extern struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active);
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+extern struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active);
+
+extern struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active);
+
+extern int register_perf_hw_breakpoint(struct perf_event *bp);
+extern int __register_perf_hw_breakpoint(struct perf_event *bp);
+extern void unregister_hw_breakpoint(struct perf_event *bp);
+extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
+
+extern int reserve_bp_slot(struct perf_event *bp);
+extern void release_bp_slot(struct perf_event *bp);
+
+extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return &bp->hw.info;
+}
+
+#else /* !CONFIG_HAVE_HW_BREAKPOINT */
+
+static inline struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)		{ return NULL; }
+static inline struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)			{ return NULL; }
+static inline struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active)		{ return NULL; }
+static inline struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)		{ return NULL; }
+static inline int
+register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
+static inline int
+__register_perf_hw_breakpoint(struct perf_event *bp) 	{ return -ENOSYS; }
+static inline void unregister_hw_breakpoint(struct perf_event *bp)	{ }
+static inline void
+unregister_wide_hw_breakpoint(struct perf_event **cpu_events)		{ }
+static inline int
+reserve_bp_slot(struct perf_event *bp)			{return -ENOSYS; }
+static inline void release_bp_slot(struct perf_event *bp) 		{ }
+
+static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk)	{ }
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_HW_BREAKPOINT_H */
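
As a usage illustration for the kernel-side API above, here is a minimal module
sketch; it is not part of the patch, and the watched symbol, the callback body
and the IS_ERR() error convention are assumptions made for the example:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/err.h>
#include <linux/hw_breakpoint.h>

static struct perf_event **wp;

/* Matches perf_callback_t: called whenever the watched data is written */
static void wp_triggered(struct perf_event *bp, void *data)
{
	pr_info("hw_breakpoint: watched variable was written\n");
	dump_stack();
}

static int __init wp_init(void)
{
	/* "jiffies" only stands in for whatever symbol you want to watch */
	unsigned long addr = kallsyms_lookup_name("jiffies");

	if (!addr)
		return -ENXIO;

	/* One breakpoint per cpu, armed immediately (active = true) */
	wp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
					 HW_BREAKPOINT_W, wp_triggered, true);
	return IS_ERR(wp) ? PTR_ERR(wp) : 0;
}

static void __exit wp_exit(void)
{
	unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");
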
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7f87563..43adbd7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -18,6 +18,10 @@
 #include <linux/ioctl.h>
 #include <asm/byteorder.h>
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <asm/hw_breakpoint.h>
+#endif
+
 /*
  * User-space ABI bits:
  */
@@ -31,6 +35,7 @@
 	PERF_TYPE_TRACEPOINT			= 2,
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
+	PERF_TYPE_BREAKPOINT			= 5,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
@@ -209,6 +214,15 @@
 		__u32		wakeup_events;	  /* wakeup every n events */
 		__u32		wakeup_watermark; /* bytes before wakeup   */
 	};
+
+	union {
+		struct { /* Hardware breakpoint info */
+			__u64		bp_addr;
+			__u32		bp_type;
+			__u32		bp_len;
+		};
+	};
+
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -478,6 +492,11 @@
 			s64		remaining;
 			struct hrtimer	hrtimer;
 		};
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		union { /* breakpoint */
+			struct arch_hw_breakpoint	info;
+		};
+#endif
 	};
 	atomic64_t			prev_count;
 	u64				sample_period;
@@ -546,6 +565,10 @@
 	void (*func)(struct perf_pending_entry *);
 };
 
+typedef void (*perf_callback_t)(struct perf_event *, void *);
+
+struct perf_sample_data;
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -588,7 +611,7 @@
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
-	struct perf_event_attr	attr;
+	struct perf_event_attr		attr;
 	struct hw_perf_event		hw;
 
 	struct perf_event_context	*ctx;
@@ -637,10 +660,18 @@
 	struct pid_namespace		*ns;
 	u64				id;
 
+	void (*overflow_handler)(struct perf_event *event,
+			int nmi, struct perf_sample_data *data,
+			struct pt_regs *regs);
+
 #ifdef CONFIG_EVENT_PROFILE
 	struct event_filter		*filter;
 #endif
 
+	perf_callback_t			callback;
+
+	perf_callback_t			event_callback;
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -745,6 +776,14 @@
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx, int cpu);
 extern void perf_event_update_userpage(struct perf_event *event);
+extern int perf_event_release_kernel(struct perf_event *event);
+extern struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr,
+				int cpu,
+				pid_t pid,
+				perf_callback_t callback);
+extern u64 perf_event_read_value(struct perf_event *event,
+				 u64 *enabled, u64 *running);
 
 struct perf_sample_data {
 	u64				type;
@@ -821,6 +860,7 @@
 extern void perf_event_init(void);
 extern void perf_tp_event(int event_id, u64 addr, u64 count,
 				 void *record, int entry_size);
+extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
 #define perf_misc_flags(regs)	(user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@ -834,6 +874,8 @@
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
+extern int perf_swevent_get_recursion_context(void);
+extern void perf_swevent_put_recursion_context(int rctx);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -855,11 +897,15 @@
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
+static inline void
+perf_bp_event(struct perf_event *event, void *data)		{ }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
+static inline int  perf_swevent_get_recursion_context(void)  { return -1; }
+static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 
 #endif
 
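
The register_*_hw_breakpoint() wrappers are built on top of the new attr fields
and the kernel counter API declared above. A hedged sketch of a kernel-side
caller filling them in directly; the sample_period/pinned choices and the
callback body are illustrative assumptions, not something this patch mandates:

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/string.h>

/* Placeholder callback with the perf_callback_t signature */
static void bp_hit(struct perf_event *bp, void *data)
{
	/* react to the breakpoint firing */
}

static struct perf_event *create_cpu_watchpoint(unsigned long addr, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type		= PERF_TYPE_BREAKPOINT;
	attr.size		= sizeof(attr);
	attr.bp_addr		= addr;		/* new fields from this patch */
	attr.bp_type		= HW_BREAKPOINT_W;
	attr.bp_len		= HW_BREAKPOINT_LEN_4;
	attr.sample_period	= 1;
	attr.pinned		= 1;

	/* pid == -1: not bound to a task; counts on 'cpu' only */
	return perf_event_create_kernel_counter(&attr, cpu, -1, bp_hit);
}
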
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 4945d1c9..c3417c1 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,17 +724,20 @@
 static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	extern int perf_swevent_get_recursion_context(void);		\
+	extern void perf_swevent_put_recursion_context(int rctx);	\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
-	struct perf_trace_buf *trace_buf;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
 	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *trace_buf;						\
 	char *raw_data;							\
 	int __cpu;							\
+	int rctx;							\
 	int pc;								\
 									\
 	pc = preempt_count();						\
@@ -749,6 +752,11 @@
 		return;							\
 									\
 	local_irq_save(irq_flags);					\
+									\
+	rctx = perf_swevent_get_recursion_context();			\
+	if (rctx < 0)							\
+		goto end_recursion;					\
+									\
 	__cpu = smp_processor_id();					\
 									\
 	if (in_nmi())							\
@@ -759,13 +767,7 @@
 	if (!trace_buf)							\
 		goto end;						\
 									\
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);			\
-	if (trace_buf->recursion++)					\
-		goto end_recursion;					\
-									\
-	barrier();							\
-									\
-	raw_data = trace_buf->buf;					\
+	raw_data = per_cpu_ptr(trace_buf, __cpu);			\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -780,9 +782,9 @@
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
-end_recursion:								\
-	trace_buf->recursion--;						\
 end:									\
+	perf_swevent_put_recursion_context(rctx);			\
+end_recursion:								\
 	local_irq_restore(irq_flags);					\
 									\
 }
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8..6b7ce81 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -95,6 +96,7 @@
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac..3f45e3c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -978,6 +979,10 @@
 	proc_exit_connector(tsk);
 
 	/*
+	 * FIXME: do that only when needed, using sched_exit tracepoint
+	 */
+	flush_ptrace_hw_breakpoint(tsk);
+	/*
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
 	 */
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 0000000..06d372f
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,501 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+	unsigned int pinned;
+	unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * has on this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+	int i;
+	unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+
+	for (i = HBP_NUM - 1; i >= 0; i--) {
+		if (tsk_pinned[i] > 0)
+			return i + 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+	if (cpu >= 0) {
+		slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+		slots->pinned += max_task_bp_pinned(cpu);
+		slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+		return;
+	}
+
+	for_each_online_cpu(cpu) {
+		unsigned int nr;
+
+		nr = per_cpu(nr_cpu_bp_pinned, cpu);
+		nr += max_task_bp_pinned(cpu);
+
+		if (nr > slots->pinned)
+			slots->pinned = nr;
+
+		nr = per_cpu(nr_bp_flexible, cpu);
+
+		if (nr > slots->flexible)
+			slots->flexible = nr;
+	}
+}
+
+/*
+ * Add/remove a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+	int count = 0;
+	struct perf_event *bp;
+	struct perf_event_context *ctx = tsk->perf_event_ctxp;
+	unsigned int *task_bp_pinned;
+	struct list_head *list;
+	unsigned long flags;
+
+	if (WARN_ONCE(!ctx, "No perf context for this task"))
+		return;
+
+	list = &ctx->event_list;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	/*
+	 * The current breakpoint counter is not included in the list
+	 * at the open() callback time
+	 */
+	list_for_each_entry(bp, list, event_entry) {
+		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+			count++;
+	}
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+		return;
+
+	task_bp_pinned = per_cpu(task_bp_pinned, cpu);
+	if (enable) {
+		task_bp_pinned[count]++;
+		if (count > 0)
+			task_bp_pinned[count-1]--;
+	} else {
+		task_bp_pinned[count]--;
+		if (count > 0)
+			task_bp_pinned[count-1]++;
+	}
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+	int cpu = bp->cpu;
+	struct task_struct *tsk = bp->ctx->task;
+
+	/* Pinned counter task profiling */
+	if (tsk) {
+		if (cpu >= 0) {
+			toggle_bp_task_slot(tsk, cpu, enable);
+			return;
+		}
+
+		for_each_online_cpu(cpu)
+			toggle_bp_task_slot(tsk, cpu, enable);
+		return;
+	}
+
+	/* Pinned counter cpu profiling */
+	if (enable)
+		per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+	else
+		per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Constraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoints
+ *          (for this cpu) doesn't use up every register.
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per task
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+	struct bp_busy_slots slots = {0};
+	int ret = 0;
+
+	mutex_lock(&nr_bp_mutex);
+
+	fetch_bp_busy_slots(&slots, bp->cpu);
+
+	/* Flexible counters need to keep at least one slot */
+	if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+		ret = -ENOSPC;
+		goto end;
+	}
+
+	toggle_bp_slot(bp, true);
+
+end:
+	mutex_unlock(&nr_bp_mutex);
+
+	return ret;
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+	mutex_lock(&nr_bp_mutex);
+
+	toggle_bp_slot(bp, false);
+
+	mutex_unlock(&nr_bp_mutex);
+}
+
+int __register_perf_hw_breakpoint(struct perf_event *bp)
+{
+	int ret;
+
+	ret = reserve_bp_slot(bp);
+	if (ret)
+		return ret;
+
+	/*
+	 * Ptrace breakpoints can be temporary perf events, only
+	 * meant to reserve a slot. In that case they are created disabled and
+	 * we don't want to check the params right now (as we put a null addr).
+	 * But perf tools create events as disabled and we do want to check
+	 * the params for them.
+	 * This is a quick hack that will be removed soon, once we remove
+	 * the tmp breakpoints from ptrace.
+	 */
+	if (!bp->attr.disabled || bp->callback == perf_bp_event)
+		ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+	return ret;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+	bp->callback = perf_bp_event;
+
+	return __register_perf_hw_breakpoint(bp);
+}
+
+/*
+ * Register a breakpoint bound to a task and a given cpu.
+ * If cpu is -1, the breakpoint is active for the task on every cpu.
+ * If pid is -1, the breakpoint is active for every task on the given
+ * cpu.
+ */
+static struct perf_event *
+register_user_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				pid_t pid,
+				int cpu,
+				bool active)
+{
+	struct perf_event_attr *attr;
+	struct perf_event *bp;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return ERR_PTR(-ENOMEM);
+
+	attr->type = PERF_TYPE_BREAKPOINT;
+	attr->size = sizeof(*attr);
+	attr->bp_addr = addr;
+	attr->bp_len = len;
+	attr->bp_type = type;
+	/*
+	 * Such breakpoints are used by debuggers to trigger signals when
+	 * we hit the expected memory op. We can't miss such events, they
+	 * must be pinned.
+	 */
+	attr->pinned = 1;
+
+	if (!active)
+		attr->disabled = 1;
+
+	bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
+	kfree(attr);
+
+	return bp;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * @active: should we activate it while registering it
+ *
+ */
+struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)
+{
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       tsk->pid, -1, active);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * @active: should we activate it while registering it
+ */
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)
+{
+	/*
+	 * FIXME: do it without unregistering
+	 * - We don't want to lose our slot
+	 * - If the new bp is incorrect, don't lose the older one
+	 */
+	unregister_hw_breakpoint(bp);
+
+	return register_user_hw_breakpoint(addr, len, type, triggered,
+					   tsk, active);
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+	if (!bp)
+		return;
+	perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+static struct perf_event *
+register_kernel_hw_breakpoint_cpu(unsigned long addr,
+				  int len,
+				  int type,
+				  perf_callback_t triggered,
+				  int cpu,
+				  bool active)
+{
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       -1, cpu, active);
+}
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes, etc.)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @active: should we activate it while registering it
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)
+{
+	struct perf_event **cpu_events, **pevent, *bp;
+	long err;
+	int cpu;
+
+	cpu_events = alloc_percpu(typeof(*cpu_events));
+	if (!cpu_events)
+		return ERR_PTR(-ENOMEM);
+
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
+					triggered, cpu, active);
+
+		*pevent = bp;
+
+		if (IS_ERR(bp) || !bp) {
+			err = PTR_ERR(bp);
+			goto fail;
+		}
+	}
+
+	return cpu_events;
+
+fail:
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		if (IS_ERR(*pevent) || !*pevent)
+			break;
+		unregister_hw_breakpoint(*pevent);
+	}
+	free_percpu(cpu_events);
+	/* return the error if any */
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+{
+	int cpu;
+	struct perf_event **pevent;
+
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		unregister_hw_breakpoint(*pevent);
+	}
+	free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+	.notifier_call = hw_breakpoint_exceptions_notify,
+	/* we need to be notified first */
+	.priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+
+struct pmu perf_ops_bp = {
+	.enable		= arch_install_hw_breakpoint,
+	.disable	= arch_uninstall_hw_breakpoint,
+	.read		= hw_breakpoint_pmu_read,
+	.unthrottle	= hw_breakpoint_pmu_unthrottle
+};
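
A rough usage sketch of the per-task API above: the handler signature follows the perf_callback_t convention used by ksym_hbp_handler() later in this series; the watched variable, the function names and the includes are illustrative assumptions only, not part of the patch.

	/* illustrative only -- assumes <linux/perf_event.h> and <linux/hw_breakpoint.h> */
	static int watched_value;	/* hypothetical variable to watch */

	/* runs from the debug exception path when the breakpoint fires */
	static void sample_wp_handler(struct perf_event *bp, void *data)
	{
		struct pt_regs *regs = data;

		pr_info("watched_value touched at %pS\n",
			(void *)instruction_pointer(regs));
	}

	static int watch_value_in_task(struct task_struct *tsk)
	{
		struct perf_event *bp;

		/* 4-byte write breakpoint, active only in the context of tsk */
		bp = register_user_hw_breakpoint((unsigned long)&watched_value,
						 HW_BREAKPOINT_LEN_4,
						 HW_BREAKPOINT_W,
						 sample_wp_handler, tsk, true);
		if (IS_ERR(bp) || !bp)
			return bp ? PTR_ERR(bp) : -ENOMEM;

		/* ... later: unregister_hw_breakpoint(bp); */
		return 0;
	}
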
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b6..8e5288a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@
 	}
 	return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 				      unsigned long),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3256e36..accfd7bf 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -29,6 +29,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -245,6 +246,49 @@
 	put_ctx(ctx);
 }
 
+static inline u64 perf_clock(void)
+{
+	return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for an event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	u64 run_end;
+
+	if (event->state < PERF_EVENT_STATE_INACTIVE ||
+	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+		return;
+
+	if (ctx->is_active)
+		run_end = ctx->time;
+	else
+		run_end = event->tstamp_stopped;
+
+	event->total_time_enabled = run_end - event->tstamp_enabled;
+
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		run_end = event->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	event->total_time_running = run_end - event->tstamp_running;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -293,6 +337,9 @@
 	if (event->group_leader != event)
 		event->group_leader->nr_siblings--;
 
+	update_event_times(event);
+	event->state = PERF_EVENT_STATE_OFF;
+
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
@@ -446,50 +493,11 @@
 	 * can remove the event safely, if the call above did not
 	 * succeed.
 	 */
-	if (!list_empty(&event->group_entry)) {
+	if (!list_empty(&event->group_entry))
 		list_del_event(event, ctx);
-	}
 	spin_unlock_irq(&ctx->lock);
 }
 
-static inline u64 perf_clock(void)
-{
-	return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_event_context *ctx)
-{
-	u64 now = perf_clock();
-
-	ctx->time += now - ctx->timestamp;
-	ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
-
-	if (event->state < PERF_EVENT_STATE_INACTIVE ||
-	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-		return;
-
-	event->total_time_enabled = ctx->time - event->tstamp_enabled;
-
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		run_end = event->tstamp_stopped;
-	else
-		run_end = ctx->time;
-
-	event->total_time_running = run_end - event->tstamp_running;
-}
-
 /*
  * Update total_time_enabled and total_time_running for all events in a group.
  */
@@ -1032,10 +1040,10 @@
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active)
+	if (ctx->nr_active) {
 		list_for_each_entry(event, &ctx->group_list, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-
+	}
 	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
@@ -1060,8 +1068,6 @@
 		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
-static void __perf_event_read(void *event);
-
 static void __perf_event_sync_stat(struct perf_event *event,
 				     struct perf_event *next_event)
 {
@@ -1079,8 +1085,8 @@
 	 */
 	switch (event->state) {
 	case PERF_EVENT_STATE_ACTIVE:
-		__perf_event_read(event);
-		break;
+		event->pmu->read(event);
+		/* fall-through */
 
 	case PERF_EVENT_STATE_INACTIVE:
 		update_event_times(event);
@@ -1119,6 +1125,8 @@
 	if (!ctx->nr_stat)
 		return;
 
+	update_context_time(ctx);
+
 	event = list_first_entry(&ctx->event_list,
 				   struct perf_event, event_entry);
 
@@ -1162,8 +1170,6 @@
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
-	update_context_time(ctx);
-
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp;
@@ -1516,7 +1522,6 @@
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
-	unsigned long flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -1528,12 +1533,12 @@
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	local_irq_save(flags);
-	if (ctx->is_active)
-		update_context_time(ctx);
-	event->pmu->read(event);
+	spin_lock(&ctx->lock);
+	update_context_time(ctx);
 	update_event_times(event);
-	local_irq_restore(flags);
+	spin_unlock(&ctx->lock);
+
+	event->pmu->read(event);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -1546,7 +1551,13 @@
 		smp_call_function_single(event->oncpu,
 					 __perf_event_read, event, 1);
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		struct perf_event_context *ctx = event->ctx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->lock, flags);
+		update_context_time(ctx);
 		update_event_times(event);
+		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
 	return atomic64_read(&event->count);
@@ -1700,16 +1711,10 @@
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
+int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event *event = file->private_data;
 	struct perf_event_context *ctx = event->ctx;
 
-	file->private_data = NULL;
-
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_event_remove_from_context(event);
@@ -1724,6 +1729,19 @@
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_event *event = file->private_data;
+
+	file->private_data = NULL;
+
+	return perf_event_release_kernel(event);
+}
 
 static int perf_event_read_size(struct perf_event *event)
 {
@@ -1750,91 +1768,94 @@
 	return size;
 }
 
-static u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
 	u64 total = 0;
 
+	*enabled = 0;
+	*running = 0;
+
+	mutex_lock(&event->child_mutex);
 	total += perf_event_read(event);
-	list_for_each_entry(child, &event->child_list, child_list)
+	*enabled += event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	*running += event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+
+	list_for_each_entry(child, &event->child_list, child_list) {
 		total += perf_event_read(child);
+		*enabled += child->total_time_enabled;
+		*running += child->total_time_running;
+	}
+	mutex_unlock(&event->child_mutex);
 
 	return total;
 }
-
-static int perf_event_read_entry(struct perf_event *event,
-				   u64 read_format, char __user *buf)
-{
-	int n = 0, count = 0;
-	u64 values[2];
-
-	values[n++] = perf_event_read_value(event);
-	if (read_format & PERF_FORMAT_ID)
-		values[n++] = primary_event_id(event);
-
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
-}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
 
 static int perf_event_read_group(struct perf_event *event,
 				   u64 read_format, char __user *buf)
 {
 	struct perf_event *leader = event->group_leader, *sub;
-	int n = 0, size = 0, err = -EFAULT;
-	u64 values[3];
+	int n = 0, size = 0, ret = -EFAULT;
+	struct perf_event_context *ctx = leader->ctx;
+	u64 values[5];
+	u64 count, enabled, running;
+
+	mutex_lock(&ctx->mutex);
+	count = perf_event_read_value(leader, &enabled, &running);
 
 	values[n++] = 1 + leader->nr_siblings;
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = leader->total_time_enabled +
-			atomic64_read(&leader->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = leader->total_time_running +
-			atomic64_read(&leader->child_total_time_running);
-	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
+	values[n++] = count;
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(leader);
 
 	size = n * sizeof(u64);
 
 	if (copy_to_user(buf, values, size))
-		return -EFAULT;
+		goto unlock;
 
-	err = perf_event_read_entry(leader, read_format, buf + size);
-	if (err < 0)
-		return err;
-
-	size += err;
+	ret = size;
 
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		err = perf_event_read_entry(sub, read_format,
-				buf + size);
-		if (err < 0)
-			return err;
+		n = 0;
 
-		size += err;
+		values[n++] = perf_event_read_value(sub, &enabled, &running);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_event_id(sub);
+
+		size = n * sizeof(u64);
+
+		if (copy_to_user(buf + ret, values, size)) {
+			ret = -EFAULT;
+			goto unlock;
+		}
+
+		ret += size;
 	}
+unlock:
+	mutex_unlock(&ctx->mutex);
 
-	return size;
+	return ret;
 }
 
 static int perf_event_read_one(struct perf_event *event,
 				 u64 read_format, char __user *buf)
 {
+	u64 enabled, running;
 	u64 values[4];
 	int n = 0;
 
-	values[n++] = perf_event_read_value(event);
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = event->total_time_enabled +
-			atomic64_read(&event->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = event->total_time_running +
-			atomic64_read(&event->child_total_time_running);
-	}
+	values[n++] = perf_event_read_value(event, &enabled, &running);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
@@ -1865,12 +1886,10 @@
 		return -ENOSPC;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
-	mutex_lock(&event->child_mutex);
 	if (read_format & PERF_FORMAT_GROUP)
 		ret = perf_event_read_group(event, read_format, buf);
 	else
 		ret = perf_event_read_one(event, read_format, buf);
-	mutex_unlock(&event->child_mutex);
 
 	return ret;
 }
@@ -2315,7 +2334,7 @@
 	}
 
 	if (!data->watermark)
-		data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+		data->watermark = max_size / 2;
 
 
 	rcu_assign_pointer(event->data, data);
@@ -3245,15 +3264,10 @@
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_task_match(event))
 			perf_event_task_output(event, task_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3261,11 +3275,11 @@
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx = task_event->task_ctx;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	if (!ctx)
 		ctx = rcu_dereference(task_event->task->perf_event_ctxp);
 	if (ctx)
@@ -3357,15 +3371,10 @@
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_comm_match(event))
 			perf_event_comm_output(event, comm_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3376,7 +3385,7 @@
 	char comm[TASK_COMM_LEN];
 
 	memset(comm, 0, sizeof(comm));
-	strncpy(comm, comm_event->task->comm, sizeof(comm));
+	strlcpy(comm, comm_event->task->comm, sizeof(comm));
 	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
@@ -3384,11 +3393,11 @@
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
@@ -3481,15 +3490,10 @@
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_event_mmap_match(event, mmap_event))
 			perf_event_mmap_output(event, mmap_event);
 	}
-	rcu_read_unlock();
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3545,11 +3549,11 @@
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
+	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
@@ -3688,7 +3692,11 @@
 			perf_event_disable(event);
 	}
 
-	perf_event_output(event, nmi, data, regs);
+	if (event->overflow_handler)
+		event->overflow_handler(event, nmi, data, regs);
+	else
+		perf_event_output(event, nmi, data, regs);
+
 	return ret;
 }
 
@@ -3733,16 +3741,16 @@
 	return nr;
 }
 
-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 				    int nmi, struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int throttle = 0;
-	u64 overflow;
 
 	data->period = event->hw.last_period;
-	overflow = perf_swevent_set_period(event);
+	if (!overflow)
+		overflow = perf_swevent_set_period(event);
 
 	if (hwc->interrupts == MAX_INTERRUPTS)
 		return;
@@ -3775,14 +3783,19 @@
 
 	atomic64_add(nr, &event->count);
 
-	if (!hwc->sample_period)
-		return;
-
 	if (!regs)
 		return;
 
-	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swevent_overflow(event, nmi, data, regs);
+	if (!hwc->sample_period)
+		return;
+
+	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+		return perf_swevent_overflow(event, 1, nmi, data, regs);
+
+	if (atomic64_add_negative(nr, &hwc->period_left))
+		return;
+
+	perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
 static int perf_swevent_is_counting(struct perf_event *event)
@@ -3818,6 +3831,20 @@
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
+static int perf_exclude_event(struct perf_event *event,
+			      struct pt_regs *regs)
+{
+	if (regs) {
+		if (event->attr.exclude_user && user_mode(regs))
+			return 1;
+
+		if (event->attr.exclude_kernel && !user_mode(regs))
+			return 1;
+	}
+
+	return 0;
+}
+
 static int perf_swevent_match(struct perf_event *event,
 				enum perf_type_id type,
 				u32 event_id,
@@ -3829,16 +3856,12 @@
 
 	if (event->attr.type != type)
 		return 0;
+
 	if (event->attr.config != event_id)
 		return 0;
 
-	if (regs) {
-		if (event->attr.exclude_user && user_mode(regs))
-			return 0;
-
-		if (event->attr.exclude_kernel && !user_mode(regs))
-			return 0;
-	}
+	if (perf_exclude_event(event, regs))
+		return 0;
 
 	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
 	    !perf_tp_event_match(event, data))
@@ -3855,49 +3878,59 @@
 {
 	struct perf_event *event;
 
-	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-		return;
-
-	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
-	rcu_read_unlock();
 }
 
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+int perf_swevent_get_recursion_context(void)
 {
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int rctx;
+
 	if (in_nmi())
-		return &cpuctx->recursion[3];
+		rctx = 3;
+	else if (in_irq())
+		rctx = 2;
+	else if (in_softirq())
+		rctx = 1;
+	else
+		rctx = 0;
 
-	if (in_irq())
-		return &cpuctx->recursion[2];
+	if (cpuctx->recursion[rctx]) {
+		put_cpu_var(perf_cpu_context);
+		return -1;
+	}
 
-	if (in_softirq())
-		return &cpuctx->recursion[1];
+	cpuctx->recursion[rctx]++;
+	barrier();
 
-	return &cpuctx->recursion[0];
+	return rctx;
 }
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+void perf_swevent_put_recursion_context(int rctx)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	barrier();
+	cpuctx->recursion[rctx]--;
+	put_cpu_var(perf_cpu_context);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 				    u64 nr, int nmi,
 				    struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-	int *recursion = perf_swevent_recursion_context(cpuctx);
+	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 
-	if (*recursion)
-		goto out;
-
-	(*recursion)++;
-	barrier();
-
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
 				 nr, nmi, data, regs);
-	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
 	 * events ends up in.
@@ -3906,23 +3939,24 @@
 	if (ctx)
 		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
 	rcu_read_unlock();
-
-	barrier();
-	(*recursion)--;
-
-out:
-	put_cpu_var(perf_cpu_context);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
-	struct perf_sample_data data = {
-		.addr = addr,
-	};
+	struct perf_sample_data data;
+	int rctx;
 
-	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
-				&data, regs);
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		return;
+
+	data.addr = addr;
+	data.raw  = NULL;
+
+	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+	perf_swevent_put_recursion_context(rctx);
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4145,6 +4179,7 @@
 	if (!regs)
 		regs = task_pt_regs(current);
 
+	/* Trace events already protected against recursion */
 	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 				&data, regs);
 }
@@ -4231,6 +4266,57 @@
 
 #endif /* CONFIG_EVENT_PROFILE */
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	int err;
+	/*
+	 * The breakpoint is already filled if we haven't created the counter
+	 * through the perf syscall.
+	 * FIXME: manage to get the callback set to NULL if it comes from syscalls
+	 */
+	if (!bp->callback)
+		err = register_perf_hw_breakpoint(bp);
+	else
+		err = __register_perf_hw_breakpoint(bp);
+	if (err)
+		return ERR_PTR(err);
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return &perf_ops_bp;
+}
+
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+	struct perf_sample_data sample;
+	struct pt_regs *regs = data;
+
+	sample.addr = bp->attr.bp_addr;
+
+	if (!perf_exclude_event(bp, regs))
+		perf_swevent_add(bp, 1, 1, &sample, regs);
+}
+#else
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	return NULL;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
+#endif
+
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
@@ -4297,6 +4383,7 @@
 		   struct perf_event_context *ctx,
 		   struct perf_event *group_leader,
 		   struct perf_event *parent_event,
+		   perf_callback_t callback,
 		   gfp_t gfpflags)
 {
 	const struct pmu *pmu;
@@ -4339,6 +4426,11 @@
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (!callback && parent_event)
+		callback = parent_event->callback;
+
+	event->callback	= callback;
+
 	if (attr->disabled)
 		event->state = PERF_EVENT_STATE_OFF;
 
@@ -4373,6 +4465,11 @@
 		pmu = tp_perf_event_init(event);
 		break;
 
+	case PERF_TYPE_BREAKPOINT:
+		pmu = bp_perf_event_init(event);
+		break;
+
 	default:
 		break;
 	}
@@ -4615,7 +4712,7 @@
 	}
 
 	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, GFP_KERNEL);
+				     NULL, NULL, GFP_KERNEL);
 	err = PTR_ERR(event);
 	if (IS_ERR(event))
 		goto err_put_context;
@@ -4663,6 +4760,58 @@
 	return err;
 }
 
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+				 pid_t pid, perf_callback_t callback)
+{
+	struct perf_event *event;
+	struct perf_event_context *ctx;
+	int err;
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return NULL;
+
+	event = perf_event_alloc(attr, cpu, ctx, NULL,
+				     NULL, callback, GFP_KERNEL);
+	err = PTR_ERR(event);
+	if (IS_ERR(event))
+		goto err_put_context;
+
+	event->filp = NULL;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, event, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	event->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_event_mutex);
+	list_add_tail(&event->owner_entry, &current->perf_event_list);
+	mutex_unlock(&current->perf_event_mutex);
+
+	return event;
+
+err_put_context:
+	if (err < 0)
+		put_ctx(ctx);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
 /*
  * inherit a event from parent task to child task:
  */
@@ -4688,7 +4837,7 @@
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu, child_ctx,
 					   group_leader, parent_event,
-					   GFP_KERNEL);
+					   NULL, GFP_KERNEL);
 	if (IS_ERR(child_event))
 		return child_event;
 	get_ctx(child_ctx);
@@ -4706,6 +4855,8 @@
 	if (parent_event->attr.freq)
 		child_event->hw.sample_period = parent_event->hw.sample_period;
 
+	child_event->overflow_handler = parent_event->overflow_handler;
+
 	/*
 	 * Link it up in the child's context:
 	 */
@@ -4795,7 +4946,6 @@
 {
 	struct perf_event *parent_event;
 
-	update_event_times(child_event);
 	perf_event_remove_from_context(child_event);
 
 	parent_event = child_event->parent;
@@ -4847,6 +4997,7 @@
 	 * the events from it.
 	 */
 	unclone_ctx(child_ctx);
+	update_context_time(child_ctx);
 	spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
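
A minimal sketch of the in-kernel counter API exported above; the attribute values are only an example, and the caller-side NULL check reflects that perf_event_create_kernel_counter() returns NULL on failure in this version.

	/* illustrative only -- assumes <linux/perf_event.h> */
	static struct perf_event *sample_cycle_counter(int cpu,
						       perf_callback_t callback)
	{
		struct perf_event_attr attr = {
			.type	= PERF_TYPE_HARDWARE,
			.config	= PERF_COUNT_HW_CPU_CYCLES,
			.size	= sizeof(attr),
			.pinned	= 1,
		};

		/* cpu >= 0 binds to that cpu; pid == -1 counts all tasks there */
		return perf_event_create_kernel_counter(&attr, cpu, -1, callback);
	}

A user of this sketch would check the returned pointer for NULL, fetch the count with perf_event_read_value(event, &enabled, &running), and tear the counter down with perf_event_release_kernel(event), all exported in the hunks above.
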
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f056716..d006554 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@
 	  power management decisions, specifically the C-state and P-state
 	  behavior.
 
+config KSYM_TRACER
+	bool "Trace read and write access on kernel memory locations"
+	depends on HAVE_HW_BREAKPOINT
+	select TRACING
+	help
+	  This tracer helps find read and write operations on any given kernel
+	  symbol, i.e. one listed in /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+	bool "Profile all kernel memory accesses on 'watched' variables"
+	depends on KSYM_TRACER
+	help
+	  This tracer profiles kernel accesses on variables watched through the
+	  ksym tracer ftrace plugin. Depending upon the hardware, all read
+	  and write operations on kernel variables can be monitored.
+
+	  The results will be displayed in:
+	  /debugfs/tracing/profile_ksym
+
+	  Say N if unsure.
 
 config STACK_TRACER
 	bool "Trace max stack"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index edc3a3c..cd9ecd8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b4e4212..4da6ede 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_BLK,
+	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
@@ -232,6 +234,7 @@
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
+		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -387,6 +390,8 @@
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
 
+extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -461,6 +466,8 @@
 					 struct trace_array *tr);
 extern int trace_selftest_startup_hw_branches(struct tracer *trace,
 					      struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d72..c16a08f 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@
 	F_printk("type:%u call_site:%lx ptr:%p",
 		 __entry->type_id, __entry->call_site, __entry->ptr)
 );
+
+FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
+
+	TRACE_KSYM,
+
+	F_STRUCT(
+		__field(	unsigned long,	ip			  )
+		__field(	unsigned char,	type			  )
+		__array(	char	     ,	cmd,	   TASK_COMM_LEN  )
+		__field(	unsigned long,  addr			  )
+	),
+
+	F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
+		(void *)__entry->ip, (unsigned int)__entry->type,
+		(void *)__entry->addr,  __entry->cmd)
+);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e0d351b..d9c60f8 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,31 +9,33 @@
 #include "trace.h"
 
 
-struct perf_trace_buf *perf_trace_buf;
+char *perf_trace_buf;
 EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-struct perf_trace_buf *perf_trace_buf_nmi;
+char *perf_trace_buf_nmi;
 EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
+typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
+
 /* Count the events in use (per event id, not per instance) */
 static int	total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf;
+	char *buf;
 	int ret = -ENOMEM;
 
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
 	if (!total_profile_count) {
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf;
 
 		rcu_assign_pointer(perf_trace_buf, buf);
 
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf_nmi;
 
@@ -79,7 +81,7 @@
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf, *nmi_buf;
+	char *buf, *nmi_buf;
 
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3696476..79ce6a2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,11 +1208,12 @@
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
 	char *raw_data;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1227,6 +1228,11 @@
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1237,18 +1243,7 @@
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1263,9 +1258,9 @@
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(rctx);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
@@ -1278,11 +1273,12 @@
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
 	char *raw_data;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1297,6 +1293,11 @@
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1307,18 +1308,7 @@
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1334,9 +1324,9 @@
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(rctx);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 0000000..11935b5
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,554 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+
+/*
+ * For now, let us restrict the no. of symbols traced simultaneously to the
+ * number of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HBP_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+
+struct trace_ksym {
+	struct perf_event	**ksym_hbp;
+	unsigned long		ksym_addr;
+	int			type;
+	int			len;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	unsigned long		counter;
+#endif
+	struct hlist_node	ksym_hlist;
+};
+
+static struct trace_array *ksym_trace_array;
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+static DEFINE_MUTEX(ksym_tracer_mutex);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+
+void ksym_collect_stats(unsigned long hbp_hit_addr)
+{
+	struct hlist_node *node;
+	struct trace_ksym *entry;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
+		if ((entry->ksym_addr == hbp_hit_addr) &&
+		    (entry->counter <= MAX_UL_INT)) {
+			entry->counter++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbp_handler(struct perf_event *hbp, void *data)
+{
+	struct ring_buffer_event *event;
+	struct ksym_trace_entry *entry;
+	struct pt_regs *regs = data;
+	struct ring_buffer *buffer;
+	int pc;
+
+	if (!ksym_tracing_enabled)
+		return;
+
+	buffer = ksym_trace_array->buffer;
+
+	pc = preempt_count();
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
+							sizeof(*entry), 0, pc);
+	if (!event)
+		return;
+
+	entry		= ring_buffer_event_data(event);
+	entry->ip	= instruction_pointer(regs);
+	entry->type	= hw_breakpoint_type(hbp);
+	entry->addr	= hw_breakpoint_addr(hbp);
+	strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+	ksym_collect_stats(hw_breakpoint_addr(hbp));
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+	trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Breakpoint (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *str)
+{
+	int access = 0;
+
+	if (str[0] == 'r')
+		access |= HW_BREAKPOINT_R;
+
+	if (str[1] == 'w')
+		access |= HW_BREAKPOINT_W;
+
+	if (str[2] == 'x')
+		access |= HW_BREAKPOINT_X;
+
+	switch (access) {
+	case HW_BREAKPOINT_R:
+	case HW_BREAKPOINT_W:
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		return access;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules:
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter,
+ *    i.e. multiple ':' symbols are disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+							unsigned long *addr)
+{
+	int ret;
+
+	*ksymname = strsep(&input_string, ":");
+	*addr = kallsyms_lookup_name(*ksymname);
+
+	/* Check for malformed request: (2), (1) and (5) */
+	if ((!input_string) ||
+	    (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
+	    (*addr == 0))
+		return -EINVAL;
+
+	ret = ksym_trace_get_access_type(input_string);
+
+	return ret;
+}
+
+int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+	struct trace_ksym *entry;
+	int ret = -ENOMEM;
+
+	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+		printk(KERN_ERR "ksym_tracer: Maximum limit (%d) reached. No"
+		" new requests for tracing can be accepted now.\n",
+			KSYM_TRACER_MAX);
+		return -ENOSPC;
+	}
+
+	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->type = op;
+	entry->ksym_addr = addr;
+	entry->len = HW_BREAKPOINT_LEN_4;
+
+	ret = -EAGAIN;
+	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+	if (IS_ERR(entry->ksym_hbp)) {
+		entry->ksym_hbp = NULL;
+		ret = PTR_ERR(entry->ksym_hbp);
+	}
+
+	if (!entry->ksym_hbp) {
+		printk(KERN_INFO "ksym_tracer request failed. Try again"
+					" later!!\n");
+		goto err;
+	}
+
+	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
+	ksym_filter_entry_count++;
+
+	return 0;
+
+err:
+	kfree(entry);
+
+	return ret;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	struct trace_seq *s;
+	ssize_t cnt = 0;
+	int ret;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	trace_seq_init(s);
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
+		if (entry->type == HW_BREAKPOINT_R)
+			ret = trace_seq_puts(s, "r--\n");
+		else if (entry->type == HW_BREAKPOINT_W)
+			ret = trace_seq_puts(s, "-w-\n");
+		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
+			ret = trace_seq_puts(s, "rw-\n");
+		WARN_ON_ONCE(!ret);
+	}
+
+	cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+	mutex_unlock(&ksym_tracer_mutex);
+
+	kfree(s);
+
+	return cnt;
+}
+
+static void __ksym_trace_reset(void)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node, *node1;
+
+	mutex_lock(&ksym_tracer_mutex);
+	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+								ksym_hlist) {
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
+		ksym_filter_entry_count--;
+		hlist_del_rcu(&(entry->ksym_hlist));
+		synchronize_rcu();
+		kfree(entry);
+	}
+	mutex_unlock(&ksym_tracer_mutex);
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+					const char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	struct trace_ksym *entry;
+	struct hlist_node *node;
+	char *input_string, *ksymname = NULL;
+	unsigned long ksym_addr = 0;
+	int ret, op, changed = 0;
+
+	input_string = kzalloc(count + 1, GFP_KERNEL);
+	if (!input_string)
+		return -ENOMEM;
+
+	if (copy_from_user(input_string, buffer, count)) {
+		kfree(input_string);
+		return -EFAULT;
+	}
+	input_string[count] = '\0';
+
+	strstrip(input_string);
+
+	/*
+	 * Clear all breakpoints if:
+	 * 1: echo > ksym_trace_filter
+	 * 2: echo 0 > ksym_trace_filter
+	 * 3: echo "*:---" > ksym_trace_filter
+	 */
+	if (!input_string[0] || !strcmp(input_string, "0") ||
+	    !strcmp(input_string, "*:---")) {
+		__ksym_trace_reset();
+		kfree(input_string);
+		return count;
+	}
+
+	ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+	if (ret < 0) {
+		kfree(input_string);
+		return ret;
+	}
+
+	mutex_lock(&ksym_tracer_mutex);
+
+	ret = -EINVAL;
+	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+		if (entry->ksym_addr == ksym_addr) {
+			/* Check for malformed request: (6) */
+			if (entry->type != op)
+				changed = 1;
+			else
+				goto out;
+			break;
+		}
+	}
+	if (changed) {
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
+		entry->type = op;
+		if (op > 0) {
+			entry->ksym_hbp =
+				register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+			if (IS_ERR(entry->ksym_hbp))
+				entry->ksym_hbp = NULL;
+			if (!entry->ksym_hbp)
+				goto out;
+		}
+		ksym_filter_entry_count--;
+		hlist_del_rcu(&(entry->ksym_hlist));
+		synchronize_rcu();
+		kfree(entry);
+		ret = 0;
+		goto out;
+	} else {
+		/* Check for malformed request: (4) */
+		if (op == 0)
+			goto out;
+		ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+	}
+out:
+	mutex_unlock(&ksym_tracer_mutex);
+
+	kfree(input_string);
+
+	if (!ret)
+		ret = count;
+	return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+	.open		= tracing_open_generic,
+	.read		= ksym_trace_filter_read,
+	.write		= ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+	ksym_tracing_enabled = 0;
+	__ksym_trace_reset();
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+	int cpu, ret = 0;
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+	ksym_tracing_enabled = 1;
+	ksym_trace_array = tr;
+
+	return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+	seq_puts(m,
+		 "#       TASK-PID   CPU#      Symbol                    "
+		 "Type    Function\n");
+	seq_puts(m,
+		 "#          |        |          |                       "
+		 " |         |\n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct ksym_trace_entry *field;
+	char str[KSYM_SYMBOL_LEN];
+	int ret;
+
+	if (entry->type != TRACE_KSYM)
+		return TRACE_TYPE_UNHANDLED;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
+				entry->pid, iter->cpu, (char *)field->addr);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	switch (field->type) {
+	case HW_BREAKPOINT_R:
+		ret = trace_seq_printf(s, " R  ");
+		break;
+	case HW_BREAKPOINT_W:
+		ret = trace_seq_printf(s, " W  ");
+		break;
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+		ret = trace_seq_printf(s, " RW ");
+		break;
+	default:
+		return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	sprint_symbol(str, field->ip);
+	ret = trace_seq_printf(s, "%s\n", str);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+	.name		= "ksym_tracer",
+	.init		= ksym_trace_init,
+	.reset		= ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_ksym,
+#endif
+	.print_header   = ksym_trace_print_header,
+	.print_line	= ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	ksym_filter_entry_count = 0;
+
+	entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
+				    NULL, &ksym_tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ksym_trace_filter' file\n");
+
+	return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+	seq_puts(m, "  Access Type ");
+	seq_puts(m, "  Symbol                                       Counter\n");
+	seq_puts(m, "  ----------- ");
+	seq_puts(m, "  ------                                       -------\n");
+	return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *stat = v;
+	struct trace_ksym *entry;
+	int access_type = 0;
+	char fn_name[KSYM_NAME_LEN];
+
+	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+	access_type = entry->type;
+
+	switch (access_type) {
+	case HW_BREAKPOINT_R:
+		seq_puts(m, "  R           ");
+		break;
+	case HW_BREAKPOINT_W:
+		seq_puts(m, "  W           ");
+		break;
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+		seq_puts(m, "  RW          ");
+		break;
+	default:
+		seq_puts(m, "  NA          ");
+	}
+
+	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+		seq_printf(m, "  %-36s", fn_name);
+	else
+		seq_printf(m, "  %-36s", "<NA>");
+	seq_printf(m, " %15lu\n", entry->counter);
+
+	return 0;
+}
+
+static void *ksym_tracer_stat_start(struct tracer_stat *trace)
+{
+	return ksym_filter_head.first;
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+	struct hlist_node *stat = v;
+
+	return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+	.name = "ksym_tracer",
+	.stat_start = ksym_tracer_stat_start,
+	.stat_next = ksym_tracer_stat_next,
+	.stat_headers = ksym_tracer_stat_headers,
+	.stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&ksym_tracer_stats);
+	if (ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "ksym tracer stats\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbab..dc98309 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
 	case TRACE_HW_BRANCHES:
+	case TRACE_KSYM:
 		return 1;
 	}
 	return 0;
@@ -808,3 +809,57 @@
 	return ret;
 }
 #endif /* CONFIG_HW_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+static int ksym_selftest_dummy;
+
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	ksym_selftest_dummy = 0;
+	/* Register the read-write tracing request */
+
+	ret = process_new_ksym_entry("ksym_selftest_dummy",
+				     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+					(unsigned long)(&ksym_selftest_dummy));
+
+	if (ret < 0) {
+		printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+		goto ret_path;
+	}
+	/* Perform a read and a write operation over the dummy variable to
+	 * trigger the tracer
+	 */
+	if (ksym_selftest_dummy == 0)
+		ksym_selftest_dummy++;
+
+	/* stop the tracing. */
+	tracing_stop();
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	/* read & write operations - one each is performed on the dummy variable
+	 * triggering two entries in the trace buffer
+	 */
+	if (!ret && count != 2) {
+		printk(KERN_CONT "Ksym tracer startup test failed");
+		ret = -1;
+	}
+
+ret_path:
+	return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */
+
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 51213b0..9189cbe 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,11 +477,12 @@
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct perf_trace_buf *trace_buf;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
+	char *trace_buf;
 	char *raw_data;
 	int syscall_nr;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -505,28 +506,18 @@
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
+	trace_buf = rcu_dereference(perf_trace_buf);
 
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -539,9 +530,9 @@
 			       (unsigned long *)&rec->args);
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(rctx);
+end_recursion:
 	local_irq_restore(flags);
 }
 
@@ -588,10 +579,11 @@
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct perf_trace_buf *trace_buf;
 	unsigned long flags;
 	int syscall_nr;
+	char *trace_buf;
 	char *raw_data;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -617,28 +609,19 @@
 
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
+
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
-	if (in_nmi())
-		trace_buf = rcu_dereference(perf_trace_buf_nmi);
-	else
-		trace_buf = rcu_dereference(perf_trace_buf);
+	trace_buf = rcu_dereference(perf_trace_buf);
 
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -652,9 +635,9 @@
 
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(rctx);
+end_recursion:
 	local_irq_restore(flags);
 }
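A minimal sketch of the recursion guard both syscall profiling handlers now share, using only the perf_swevent_{get,put}_recursion_context() helpers visible above (irq save/restore and buffer handling elided):

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		goto end_recursion;	/* a software event is already being processed in this context */

	/* ... pick the per-cpu buffer, fill in the record, call perf_tp_event() ... */

	perf_swevent_put_recursion_context(rctx);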
 
diff --git a/samples/Kconfig b/samples/Kconfig
index b92bde3..e4be84a 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -40,5 +40,11 @@
 	default m
 	depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+	tristate "Build kernel hardware breakpoint examples -- loadable module only"
+	depends on HAVE_HW_BREAKPOINT && m
+	help
+	  This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
diff --git a/samples/Makefile b/samples/Makefile
index 43343a0..0f15e6d 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,4 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ tracepoints/ trace_events/
+obj-$(CONFIG_SAMPLES)	+= kobject/ kprobes/ tracepoints/ trace_events/ \
+			   hw_breakpoint/
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
new file mode 100644
index 0000000..0f5c31c
--- /dev/null
+++ b/samples/hw_breakpoint/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
new file mode 100644
index 0000000..9506381
--- /dev/null
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,90 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * usage: insmod data_breakpoint.ko ksym=<ksym_name>
+ *
+ * This file is a kernel module that places a breakpoint on the ksym_name kernel
+ * variable using a hardware breakpoint register. The corresponding handler,
+ * which prints a backtrace, is invoked every time a write operation is
+ * performed on that variable.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ *
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ */
+#include <linux/module.h>	/* Needed by all modules */
+#include <linux/kernel.h>	/* Needed for KERN_INFO */
+#include <linux/init.h>		/* Needed for the macros */
+#include <linux/kallsyms.h>
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+
+struct perf_event **sample_hbp;
+
+static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
+MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
+			" write operations on the kernel symbol");
+
+static void sample_hbp_handler(struct perf_event *temp, void *data)
+{
+	printk(KERN_INFO "%s value is changed\n", ksym_name);
+	dump_stack();
+	printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+	int ret;
+	unsigned long addr;
+
+	addr = kallsyms_lookup_name(ksym_name);
+
+	sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
+						 HW_BREAKPOINT_W | HW_BREAKPOINT_R,
+						 sample_hbp_handler, true);
+	if (IS_ERR(sample_hbp)) {
+		ret = PTR_ERR(sample_hbp);
+		goto fail;
+	} else if (!sample_hbp) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
+
+	return 0;
+
+fail:
+	printk(KERN_INFO "Breakpoint registration failed\n");
+
+	return ret;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+	unregister_wide_hw_breakpoint(sample_hbp);
+	printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("ksym breakpoint");
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
new file mode 100644
index 0000000..44b0ce3
--- /dev/null
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -0,0 +1,44 @@
+perf-kmem(1)
+==============
+
+NAME
+----
+perf-kmem - Tool to trace/measure kernel memory (slab) properties
+
+SYNOPSIS
+--------
+[verse]
+'perf kmem' {record} [<options>]
+
+DESCRIPTION
+-----------
+There are two variants of perf kmem:
+
+  'perf kmem record <command>' to record the kmem events
+  of an arbitrary workload.
+
+  'perf kmem' to report kernel memory statistics.
+
+OPTIONS
+-------
+-i <file>::
+--input=<file>::
+	Select the input file (default: perf.data)
+
+--stat=<caller|alloc>::
+	Select per callsite or per allocation statistics
+
+-s <key[,key2...]>::
+--sort=<key[,key2...]>::
+	Sort the output (default: frag,hit,bytes)
+
+-l <num>::
+--line=<num>::
+	Print n lines only
+
+--raw-ip::
+	Print raw ip instead of symbol
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
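An illustrative session with the options documented above: 'perf kmem record <workload>' collects the kmem tracepoints into perf.data, and 'perf kmem --stat=caller -l 20' then prints the top 20 call sites using the default frag,hit,bytes sort order.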
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 0ff23de..fc46c0b 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -26,11 +26,19 @@
 
 -e::
 --event=::
-	Select the PMU event. Selection can be a symbolic event name
-	(use 'perf list' to list all events) or a raw PMU
-	event (eventsel+umask) in the form of rNNN where NNN is a
-	hexadecimal event descriptor.
+	Select the PMU event. Selection can be:
 
+        - a symbolic event name	(use 'perf list' to list all events)
+
+        - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
+	  hexadecimal event descriptor.
+
+        - a hardware breakpoint event in the form of '\mem:addr[:access]'
+          where addr is the address in memory you want to break on.
+          Access is the memory access type (read, write, execute); it can
+          be passed as follows: '\mem:addr[:[r][w][x]]'.
+          If you want to profile read-write accesses at 0x1000, just set
+          'mem:0x1000:rw'.
 -a::
         System-wide collection.
 
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 53e663a..f1537a9 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -2,6 +2,7 @@
 all::
 
 # Define V=1 to have a more verbose compile.
+# Define V=2 to have an even more verbose compile.
 #
 # Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
 # or vsnprintf() return -1 instead of number of characters which would
@@ -147,6 +148,8 @@
 # broken, or spawning external process is slower than built-in grep perf has).
 #
 # Define LDFLAGS=-static to build a static binary.
+#
+# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
 
 PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
 	@$(SHELL_PATH) util/PERF-VERSION-GEN
@@ -159,22 +162,6 @@
 uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
 uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
-#
-# Add -m32 for cross-builds:
-#
-ifdef NO_64BIT
-  MBITS := -m32
-else
-  #
-  # If we're on a 64-bit kernel (except ia64), use -m64:
-  #
-  ifneq ($(uname_M),ia64)
-    ifneq ($(patsubst %64,%,$(uname_M)),$(uname_M))
-      MBITS := -m64
-    endif
-  endif
-endif
-
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 #
@@ -211,7 +198,7 @@
   CFLAGS_OPTIMIZE = -O6
 endif
 
-CFLAGS = $(MBITS) -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS)
+CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 EXTLIBS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
@@ -263,7 +250,7 @@
 # explicitly what architecture to check for. Fix this up for yours..
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
 
-ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null >/dev/null 2>&1 && echo y"), y)
+ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y)
   CFLAGS := $(CFLAGS) -fstack-protector-all
 endif
 
@@ -445,9 +432,15 @@
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
 BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
 
 PERFLIBS = $(LIB_FILE)
 
+ifeq ($(V), 2)
+	QUIET_STDERR = ">/dev/null"
+else
+	QUIET_STDERR = ">/dev/null 2>&1"
+endif
 #
 # Platform specific tweaks
 #
@@ -475,19 +468,19 @@
 	PTHREAD_LIBS =
 endif
 
-ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y)
-ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y)
-	msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]);
+ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+	msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
 endif
 
-	ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y)
+	ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 		BASIC_CFLAGS += -DLIBELF_NO_MMAP
 	endif
 else
 	msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
 endif
 
-ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231);
 	BASIC_CFLAGS += -DNO_LIBDWARF
 else
@@ -499,25 +492,25 @@
 ifdef NO_DEMANGLE
 	BASIC_CFLAGS += -DNO_DEMANGLE
 else
-	has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd > /dev/null 2>&1 && echo y")
+	has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y")
 
 	ifeq ($(has_bfd),y)
 		EXTLIBS += -lbfd
 	else
-		has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty > /dev/null 2>&1 && echo y")
+		has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty "$(QUIET_STDERR)" && echo y")
 		ifeq ($(has_bfd_iberty),y)
 			EXTLIBS += -lbfd -liberty
 		else
-			has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
+			has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz "$(QUIET_STDERR)" && echo y")
 			ifeq ($(has_bfd_iberty_z),y)
 				EXTLIBS += -lbfd -liberty -lz
 			else
-				has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty > /dev/null 2>&1 && echo y")
+				has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty "$(QUIET_STDERR)" && echo y")
 				ifeq ($(has_cplus_demangle),y)
 					EXTLIBS += -liberty
 					BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
 				else
-					msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
+					msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
 					BASIC_CFLAGS += -DNO_DEMANGLE
 				endif
 			endif
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 77d50a6..6b13a1e 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -33,9 +33,11 @@
 static int		full_paths;
 
 static int		print_line;
+static bool		use_modules;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
+const char		*vmlinux_name;
 
 struct sym_hist {
 	u64		sum;
@@ -156,7 +158,7 @@
 
 	if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
 		level = 'k';
-		sym = kernel_maps__find_symbol(ip, &map);
+		sym = kernel_maps__find_symbol(ip, &map, symbol_filter);
 		dump_printf(" ...... dso: %s\n",
 			    map ? map->dso->long_name : "<not found>");
 	} else if (event->header.misc & PERF_RECORD_MISC_USER) {
@@ -636,9 +638,9 @@
 		exit(0);
 	}
 
-	if (load_kernel(symbol_filter) < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
+	if (kernel_maps__init(vmlinux_name, true, use_modules) < 0) {
+		pr_err("failed to create kernel maps for symbol resolution\b");
+		return -1;
 	}
 
 remap:
@@ -742,7 +744,7 @@
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
-	OPT_BOOLEAN('m', "modules", &modules,
+	OPT_BOOLEAN('m', "modules", &use_modules,
 		    "load module symbols - WARNING: use only with -k and LIVE kernel"),
 	OPT_BOOLEAN('l', "print-line", &print_line,
 		    "print matching source lines (may be slow)"),
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 0000000..173d6db
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,833 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
+struct alloc_stat;
+typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+
+static char const		*input_name = "perf.data";
+
+static struct perf_header	*header;
+static u64			sample_type;
+
+static int			alloc_flag;
+static int			caller_flag;
+
+static int			alloc_lines = -1;
+static int			caller_lines = -1;
+
+static bool			raw_ip;
+
+static char			default_sort_order[] = "frag,hit,bytes";
+
+static char			*cwd;
+static int			cwdlen;
+
+static int			*cpunode_map;
+static int			max_cpu_num;
+
+struct alloc_stat {
+	u64	call_site;
+	u64	ptr;
+	u64	bytes_req;
+	u64	bytes_alloc;
+	u32	hit;
+	u32	pingpong;
+
+	short	alloc_cpu;
+
+	struct rb_node node;
+};
+
+static struct rb_root root_alloc_stat;
+static struct rb_root root_alloc_sorted;
+static struct rb_root root_caller_stat;
+static struct rb_root root_caller_sorted;
+
+static unsigned long total_requested, total_allocated;
+static unsigned long nr_allocs, nr_cross_allocs;
+
+struct raw_event_sample {
+	u32 size;
+	char data[0];
+};
+
+#define PATH_SYS_NODE	"/sys/devices/system/node"
+
+static void init_cpunode_map(void)
+{
+	FILE *fp;
+	int i;
+
+	fp = fopen("/sys/devices/system/cpu/kernel_max", "r");
+	if (!fp) {
+		max_cpu_num = 4096;
+		return;
+	}
+
+	if (fscanf(fp, "%d", &max_cpu_num) < 1)
+		die("Failed to read 'kernel_max' from sysfs");
+	max_cpu_num++;
+
+	cpunode_map = calloc(max_cpu_num, sizeof(int));
+	if (!cpunode_map)
+		die("calloc");
+	for (i = 0; i < max_cpu_num; i++)
+		cpunode_map[i] = -1;
+	fclose(fp);
+}
+
+static void setup_cpunode_map(void)
+{
+	struct dirent *dent1, *dent2;
+	DIR *dir1, *dir2;
+	unsigned int cpu, mem;
+	char buf[PATH_MAX];
+
+	init_cpunode_map();
+
+	dir1 = opendir(PATH_SYS_NODE);
+	if (!dir1)
+		return;
+
+	while (true) {
+		dent1 = readdir(dir1);
+		if (!dent1)
+			break;
+
+		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
+			continue;
+
+		snprintf(buf, PATH_MAX, "%s/%s", PATH_SYS_NODE, dent1->d_name);
+		dir2 = opendir(buf);
+		if (!dir2)
+			continue;
+		while (true) {
+			dent2 = readdir(dir2);
+			if (!dent2)
+				break;
+			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
+				continue;
+			cpunode_map[cpu] = mem;
+		}
+	}
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
+			      int bytes_req, int bytes_alloc, int cpu)
+{
+	struct rb_node **node = &root_alloc_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (ptr > data->ptr)
+			node = &(*node)->rb_right;
+		else if (ptr < data->ptr)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->ptr == ptr) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_alloc;
+	} else {
+		data = malloc(sizeof(*data));
+		if (!data)
+			die("malloc");
+		data->ptr = ptr;
+		data->pingpong = 0;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_alloc_stat);
+	}
+	data->call_site = call_site;
+	data->alloc_cpu = cpu;
+}
+
+static void insert_caller_stat(unsigned long call_site,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_caller_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (call_site > data->call_site)
+			node = &(*node)->rb_right;
+		else if (call_site < data->call_site)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->call_site == call_site) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_alloc;
+	} else {
+		data = malloc(sizeof(*data));
+		if (!data)
+			die("malloc");
+		data->call_site = call_site;
+		data->pingpong = 0;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_caller_stat);
+	}
+}
+
+static void process_alloc_event(struct raw_event_sample *raw,
+				struct event *event,
+				int cpu,
+				u64 timestamp __used,
+				struct thread *thread __used,
+				int node)
+{
+	unsigned long call_site;
+	unsigned long ptr;
+	int bytes_req;
+	int bytes_alloc;
+	int node1, node2;
+
+	ptr = raw_field_value(event, "ptr", raw->data);
+	call_site = raw_field_value(event, "call_site", raw->data);
+	bytes_req = raw_field_value(event, "bytes_req", raw->data);
+	bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+	insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu);
+	insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+	total_requested += bytes_req;
+	total_allocated += bytes_alloc;
+
+	if (node) {
+		node1 = cpunode_map[cpu];
+		node2 = raw_field_value(event, "node", raw->data);
+		if (node1 != node2)
+			nr_cross_allocs++;
+	}
+	nr_allocs++;
+}
+
+static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
+static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
+
+static struct alloc_stat *search_alloc_stat(unsigned long ptr,
+					    unsigned long call_site,
+					    struct rb_root *root,
+					    sort_fn_t sort_fn)
+{
+	struct rb_node *node = root->rb_node;
+	struct alloc_stat key = { .ptr = ptr, .call_site = call_site };
+
+	while (node) {
+		struct alloc_stat *data;
+		int cmp;
+
+		data = rb_entry(node, struct alloc_stat, node);
+
+		cmp = sort_fn(&key, data);
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else
+			return data;
+	}
+	return NULL;
+}
+
+static void process_free_event(struct raw_event_sample *raw,
+			       struct event *event,
+			       int cpu,
+			       u64 timestamp __used,
+			       struct thread *thread __used)
+{
+	unsigned long ptr;
+	struct alloc_stat *s_alloc, *s_caller;
+
+	ptr = raw_field_value(event, "ptr", raw->data);
+
+	s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
+	if (!s_alloc)
+		return;
+
+	if (cpu != s_alloc->alloc_cpu) {
+		s_alloc->pingpong++;
+
+		s_caller = search_alloc_stat(0, s_alloc->call_site,
+					     &root_caller_stat, callsite_cmp);
+		assert(s_caller);
+		s_caller->pingpong++;
+	}
+	s_alloc->alloc_cpu = -1;
+}
+
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct raw_event_sample *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	if (!strcmp(event->name, "kmalloc") ||
+	    !strcmp(event->name, "kmem_cache_alloc")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+		return;
+	}
+
+	if (!strcmp(event->name, "kmalloc_node") ||
+	    !strcmp(event->name, "kmem_cache_alloc_node")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+		return;
+	}
+
+	if (!strcmp(event->name, "kfree") ||
+	    !strcmp(event->name, "kmem_cache_free")) {
+		process_free_event(raw, event, cpu, timestamp, thread);
+		return;
+	}
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	struct thread *thread = threads__findnew(event->ip.pid);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			 event->header.type);
+		return -1;
+	}
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	process_raw_event(event, more_data, cpu, timestamp, thread);
+
+	return 0;
+}
+
+static int sample_type_check(u64 type)
+{
+	sample_type = type;
+
+	if (!(sample_type & PERF_SAMPLE_RAW)) {
+		fprintf(stderr,
+			"No trace sample to read. Did you call perf record "
+			"without -R?");
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct perf_file_handler file_handler = {
+	.process_sample_event	= process_sample_event,
+	.process_comm_event	= process_comm_event,
+	.sample_type_check	= sample_type_check,
+};
+
+static int read_events(void)
+{
+	register_idle_thread();
+	register_perf_file_handler(&file_handler);
+
+	return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0,
+				       &cwdlen, &cwd);
+}
+
+static double fragmentation(unsigned long n_req, unsigned long n_alloc)
+{
+	if (n_alloc == 0)
+		return 0.0;
+	else
+		return 100.0 - (100.0 * n_req / n_alloc);
+}
+
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+	struct rb_node *next;
+
+	printf("%.102s\n", graph_dotted_line);
+	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
+	printf(" Total_alloc/Per | Total_req/Per   | Hit   | Ping-pong | Frag\n");
+	printf("%.102s\n", graph_dotted_line);
+
+	next = rb_first(root);
+
+	while (next && n_lines--) {
+		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
+						   node);
+		struct symbol *sym = NULL;
+		char buf[BUFSIZ];
+		u64 addr;
+
+		if (is_caller) {
+			addr = data->call_site;
+			if (!raw_ip)
+				sym = kernel_maps__find_symbol(addr,
+							       NULL, NULL);
+		} else
+			addr = data->ptr;
+
+		if (sym != NULL)
+			snprintf(buf, sizeof(buf), "%s+%Lx", sym->name,
+				 addr - sym->start);
+		else
+			snprintf(buf, sizeof(buf), "%#Lx", addr);
+		printf(" %-34s |", buf);
+
+		printf(" %9llu/%-5lu | %9llu/%-5lu | %6lu | %8lu | %6.3f%%\n",
+		       (unsigned long long)data->bytes_alloc,
+		       (unsigned long)data->bytes_alloc / data->hit,
+		       (unsigned long long)data->bytes_req,
+		       (unsigned long)data->bytes_req / data->hit,
+		       (unsigned long)data->hit,
+		       (unsigned long)data->pingpong,
+		       fragmentation(data->bytes_req, data->bytes_alloc));
+
+		next = rb_next(next);
+	}
+
+	if (n_lines == -1)
+		printf(" ...                                | ...             | ...             | ...    | ...      | ...   \n");
+
+	printf("%.102s\n", graph_dotted_line);
+}
+
+static void print_summary(void)
+{
+	printf("\nSUMMARY\n=======\n");
+	printf("Total bytes requested: %lu\n", total_requested);
+	printf("Total bytes allocated: %lu\n", total_allocated);
+	printf("Total bytes wasted on internal fragmentation: %lu\n",
+	       total_allocated - total_requested);
+	printf("Internal fragmentation: %f%%\n",
+	       fragmentation(total_requested, total_allocated));
+	printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs);
+}
+
+static void print_result(void)
+{
+	if (caller_flag)
+		__print_result(&root_caller_sorted, caller_lines, 1);
+	if (alloc_flag)
+		__print_result(&root_alloc_sorted, alloc_lines, 0);
+	print_summary();
+}
+
+struct sort_dimension {
+	const char		name[20];
+	sort_fn_t		cmp;
+	struct list_head	list;
+};
+
+static LIST_HEAD(caller_sort);
+static LIST_HEAD(alloc_sort);
+
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+			struct list_head *sort_list)
+{
+	struct rb_node **new = &(root->rb_node);
+	struct rb_node *parent = NULL;
+	struct sort_dimension *sort;
+
+	while (*new) {
+		struct alloc_stat *this;
+		int cmp = 0;
+
+		this = rb_entry(*new, struct alloc_stat, node);
+		parent = *new;
+
+		list_for_each_entry(sort, sort_list, list) {
+			cmp = sort->cmp(data, this);
+			if (cmp)
+				break;
+		}
+
+		if (cmp > 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+			  struct list_head *sort_list)
+{
+	struct rb_node *node;
+	struct alloc_stat *data;
+
+	for (;;) {
+		node = rb_first(root);
+		if (!node)
+			break;
+
+		rb_erase(node, root);
+		data = rb_entry(node, struct alloc_stat, node);
+		sort_insert(root_sorted, data, sort_list);
+	}
+}
+
+static void sort_result(void)
+{
+	__sort_result(&root_alloc_stat, &root_alloc_sorted, &alloc_sort);
+	__sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort);
+}
+
+static int __cmd_kmem(void)
+{
+	setup_pager();
+	read_events();
+	sort_result();
+	print_result();
+
+	return 0;
+}
+
+static const char * const kmem_usage[] = {
+	"perf kmem [<options>] {record}",
+	NULL
+};
+
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->ptr < r->ptr)
+		return -1;
+	else if (l->ptr > r->ptr)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension ptr_sort_dimension = {
+	.name	= "ptr",
+	.cmp	= ptr_cmp,
+};
+
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->call_site < r->call_site)
+		return -1;
+	else if (l->call_site > r->call_site)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension callsite_sort_dimension = {
+	.name	= "callsite",
+	.cmp	= callsite_cmp,
+};
+
+static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->hit < r->hit)
+		return -1;
+	else if (l->hit > r->hit)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension hit_sort_dimension = {
+	.name	= "hit",
+	.cmp	= hit_cmp,
+};
+
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->bytes_alloc < r->bytes_alloc)
+		return -1;
+	else if (l->bytes_alloc > r->bytes_alloc)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension bytes_sort_dimension = {
+	.name	= "bytes",
+	.cmp	= bytes_cmp,
+};
+
+static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	double x, y;
+
+	x = fragmentation(l->bytes_req, l->bytes_alloc);
+	y = fragmentation(r->bytes_req, r->bytes_alloc);
+
+	if (x < y)
+		return -1;
+	else if (x > y)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension frag_sort_dimension = {
+	.name	= "frag",
+	.cmp	= frag_cmp,
+};
+
+static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->pingpong < r->pingpong)
+		return -1;
+	else if (l->pingpong > r->pingpong)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension pingpong_sort_dimension = {
+	.name	= "pingpong",
+	.cmp	= pingpong_cmp,
+};
+
+static struct sort_dimension *avail_sorts[] = {
+	&ptr_sort_dimension,
+	&callsite_sort_dimension,
+	&hit_sort_dimension,
+	&bytes_sort_dimension,
+	&frag_sort_dimension,
+	&pingpong_sort_dimension,
+};
+
+#define NUM_AVAIL_SORTS	\
+	(int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *))
+
+static int sort_dimension__add(const char *tok, struct list_head *list)
+{
+	struct sort_dimension *sort;
+	int i;
+
+	for (i = 0; i < NUM_AVAIL_SORTS; i++) {
+		if (!strcmp(avail_sorts[i]->name, tok)) {
+			sort = malloc(sizeof(*sort));
+			if (!sort)
+				die("malloc");
+			memcpy(sort, avail_sorts[i], sizeof(*sort));
+			list_add_tail(&sort->list, list);
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+static int setup_sorting(struct list_head *sort_list, const char *arg)
+{
+	char *tok;
+	char *str = strdup(arg);
+
+	if (!str)
+		die("strdup");
+
+	while (true) {
+		tok = strsep(&str, ",");
+		if (!tok)
+			break;
+		if (sort_dimension__add(tok, sort_list) < 0) {
+			error("Unknown --sort key: '%s'", tok);
+			return -1;
+		}
+	}
+
+	free(str);
+	return 0;
+}
+
+static int parse_sort_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	if (!arg)
+		return -1;
+
+	if (caller_flag > alloc_flag)
+		return setup_sorting(&caller_sort, arg);
+	else
+		return setup_sorting(&alloc_sort, arg);
+
+	return 0;
+}
+
+static int parse_stat_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "alloc") == 0)
+		alloc_flag = (caller_flag + 1);
+	else if (strcmp(arg, "caller") == 0)
+		caller_flag = (alloc_flag + 1);
+	else
+		return -1;
+	return 0;
+}
+
+static int parse_line_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	int lines;
+
+	if (!arg)
+		return -1;
+
+	lines = strtoul(arg, NULL, 10);
+
+	if (caller_flag > alloc_flag)
+		caller_lines = lines;
+	else
+		alloc_lines = lines;
+
+	return 0;
+}
+
+static const struct option kmem_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "input file name"),
+	OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+		     "stat selector, Pass 'alloc' or 'caller'.",
+		     parse_stat_opt),
+	OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
+		     "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
+		     parse_sort_opt),
+	OPT_CALLBACK('l', "line", NULL, "num",
+		     "show n lins",
+		     parse_line_opt),
+	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
+	OPT_END()
+};
+
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-c", "1",
+	"-e", "kmem:kmalloc",
+	"-e", "kmem:kmalloc_node",
+	"-e", "kmem:kfree",
+	"-e", "kmem:kmem_cache_alloc",
+	"-e", "kmem:kmem_cache_alloc_node",
+	"-e", "kmem:kmem_cache_free",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init(0);
+
+	argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+	if (argc && !strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (argc)
+		usage_with_options(kmem_usage, kmem_options);
+
+	if (list_empty(&caller_sort))
+		setup_sorting(&caller_sort, default_sort_order);
+	if (list_empty(&alloc_sort))
+		setup_sorting(&alloc_sort, default_sort_order);
+
+	setup_cpunode_map();
+
+	return __cmd_kmem();
+}
+
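For reference, the Frag column printed by __print_result() comes from fragmentation(): with illustrative numbers bytes_req = 100 and bytes_alloc = 128, fragmentation is 100 - (100 * 100 / 128) = 21.875%, i.e. roughly a fifth of the allocated bytes were never requested.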
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 82260c5..0e519c6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -307,6 +307,12 @@
 		printf("\n");
 		error("perfcounter syscall returned with %d (%s)\n",
 			fd[nr_cpu][counter], strerror(err));
+
+#if defined(__i386__) || defined(__x86_64__)
+		if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
+			die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
+#endif
+
 		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 		exit(-1);
 	}
@@ -400,7 +406,7 @@
 	struct stat st;
 	pid_t pid = 0;
 	int flags;
-	int ret;
+	int err;
 	unsigned long waking = 0;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
@@ -434,16 +440,18 @@
 		exit(-1);
 	}
 
-	if (!file_new)
-		header = perf_header__read(output);
-	else
-		header = perf_header__new();
-
+	header = perf_header__new();
 	if (header == NULL) {
 		pr_err("Not enough memory for reading perf file header\n");
 		return -1;
 	}
 
+	if (!file_new) {
+		err = perf_header__read(header, output);
+		if (err < 0)
+			return err;
+	}
+
 	if (raw_samples) {
 		perf_header__set_feat(header, HEADER_TRACE_INFO);
 	} else {
@@ -472,8 +480,11 @@
 		}
 	}
 
-	if (file_new)
-		perf_header__write(header, output, false);
+	if (file_new) {
+		err = perf_header__write(header, output, false);
+		if (err < 0)
+			return err;
+	}
 
 	if (!system_wide)
 		event__synthesize_thread(pid, process_synthesized_event);
@@ -527,7 +538,7 @@
 		if (hits == samples) {
 			if (done)
 				break;
-			ret = poll(event_array, nr_poll, -1);
+			err = poll(event_array, nr_poll, -1);
 			waking++;
 		}
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1a806d5..fe474b7 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -38,6 +38,7 @@
 static struct strlist	*dso_list, *comm_list, *sym_list;
 
 static int		force;
+static bool		use_modules;
 
 static int		full_paths;
 static int		show_nr_samples;
@@ -51,6 +52,7 @@
 static int		exclude_other = 1;
 
 static char		callchain_default_opt[] = "fractal,0.5";
+const char		*vmlinux_name;
 
 static char		*cwd;
 static int		cwdlen;
@@ -448,7 +450,7 @@
 		 * trick of looking in the whole kernel symbol list.
 		 */
 		if ((long long)ip < 0)
-			return kernel_maps__find_symbol(ip, mapp);
+			return kernel_maps__find_symbol(ip, mapp, NULL);
 	}
 	dump_printf(" ...... dso: %s\n",
 		    map ? map->dso->long_name : "<not found>");
@@ -466,7 +468,7 @@
 	return 0;
 }
 
-static struct symbol **resolve_callchain(struct thread *thread, struct map *map,
+static struct symbol **resolve_callchain(struct thread *thread,
 					 struct ip_callchain *chain,
 					 struct symbol **parent)
 {
@@ -495,10 +497,10 @@
 		case PERF_CONTEXT_HV:
 			break;
 		case PERF_CONTEXT_KERNEL:
-			sym = kernel_maps__find_symbol(ip, &map);
+			sym = kernel_maps__find_symbol(ip, NULL, NULL);
 			break;
 		default:
-			sym = resolve_symbol(thread, &map, &ip);
+			sym = resolve_symbol(thread, NULL, &ip);
 			break;
 		}
 
@@ -528,7 +530,7 @@
 	struct hist_entry *he;
 
 	if ((sort__has_parent || callchain) && chain)
-		syms = resolve_callchain(thread, map, chain, &parent);
+		syms = resolve_callchain(thread, chain, &parent);
 
 	he = __hist_entry__add(thread, map, sym, parent,
 			       ip, count, level, &hit);
@@ -715,7 +717,7 @@
 
 	if (cpumode == PERF_RECORD_MISC_KERNEL) {
 		level = 'k';
-		sym = kernel_maps__find_symbol(ip, &map);
+		sym = kernel_maps__find_symbol(ip, &map, NULL);
 		dump_printf(" ...... dso: %s\n",
 			    map ? map->dso->long_name : "<not found>");
 	} else if (cpumode == PERF_RECORD_MISC_USER) {
@@ -924,8 +926,9 @@
 
 	register_perf_file_handler(&file_handler);
 
-	ret = mmap_dispatch_perf_file(&header, input_name, force, full_paths,
-				      &cwdlen, &cwd);
+	ret = mmap_dispatch_perf_file(&header, input_name, vmlinux_name,
+				      !vmlinux_name, force,
+				      full_paths, &cwdlen, &cwd);
 	if (ret)
 		return ret;
 
@@ -1023,7 +1026,7 @@
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-	OPT_BOOLEAN('m', "modules", &modules,
+	OPT_BOOLEAN('m', "modules", &use_modules,
 		    "load module symbols - WARNING: use only with -k and LIVE kernel"),
 	OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
 		    "Show a column with the number of samples"),
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index df44b75..260f57a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1718,7 +1718,8 @@
 	register_idle_thread();
 	register_perf_file_handler(&file_handler);
 
-	return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd);
+	return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0,
+				       &cwdlen, &cwd);
 }
 
 static void print_bad_events(void)
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 665877e..dd4d82a 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1093,7 +1093,7 @@
 
 static int __cmd_timechart(void)
 {
-	int ret, rc = EXIT_FAILURE;
+	int err, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
 	unsigned long head, shift;
 	struct stat statbuf;
@@ -1111,8 +1111,8 @@
 		exit(-1);
 	}
 
-	ret = fstat(input, &statbuf);
-	if (ret < 0) {
+	err = fstat(input, &statbuf);
+	if (err < 0) {
 		perror("failed to stat file");
 		exit(-1);
 	}
@@ -1122,7 +1122,16 @@
 		exit(0);
 	}
 
-	header = perf_header__read(input);
+	header = perf_header__new();
+	if (header == NULL)
+		return -ENOMEM;
+
+	err = perf_header__read(header, input);
+	if (err < 0) {
+		perf_header__delete(header);
+		return err;
+	}
+
 	head = header->data_offset;
 
 	sample_type = perf_header__sample_type(header);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 07b92c3..6a5de90 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -79,13 +79,7 @@
 static bool			hide_kernel_symbols		=  false;
 static bool			hide_user_symbols		=  false;
 static struct winsize		winsize;
-static const char		*graph_line			=
-	"_____________________________________________________________________"
-	"_____________________________________________________________________";
-static const char		*graph_dotted_line			=
-	"---------------------------------------------------------------------"
-	"---------------------------------------------------------------------"
-	"---------------------------------------------------------------------";
+const char 			*vmlinux_name;
 
 /*
  * Source
@@ -830,6 +824,8 @@
 		case 'q':
 		case 'Q':
 			printf("exiting.\n");
+			if (dump_symtab)
+				dsos__fprintf(stderr);
 			exit(0);
 		case 's':
 			prompt_symbol(&sym_filter_entry, "Enter details symbol");
@@ -946,17 +942,6 @@
 	return 0;
 }
 
-static int parse_symbols(void)
-{
-	if (dsos__load_kernel(vmlinux_name, symbol_filter, 1) <= 0)
-		return -1;
-
-	if (dump_symtab)
-		dsos__fprintf(stderr);
-
-	return 0;
-}
-
 static void event__process_sample(const event_t *self, int counter)
 {
 	u64 ip = self->ip.ip;
@@ -999,7 +984,7 @@
 		if (hide_kernel_symbols)
 			return;
 
-		sym = kernel_maps__find_symbol(ip, &map);
+		sym = kernel_maps__find_symbol(ip, &map, symbol_filter);
 		if (sym == NULL)
 			return;
 		break;
@@ -1326,7 +1311,7 @@
 
 int cmd_top(int argc, const char **argv, const char *prefix __used)
 {
-	int counter;
+	int counter, err;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
@@ -1350,10 +1335,11 @@
 	if (delay_secs < 1)
 		delay_secs = 1;
 
-	parse_symbols();
+	err = kernel_maps__init(vmlinux_name, !vmlinux_name, true);
+	if (err < 0)
+		return err;
 	parse_source(sym_filter_entry);
 
-
 	/*
 	 * User specified count overrides default frequency.
 	 */
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d042d65..b71198e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -131,7 +131,8 @@
 	register_idle_thread();
 	register_perf_file_handler(&file_handler);
 
-	return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd);
+	return mmap_dispatch_perf_file(&header, input_name, NULL, false,
+				       0, 0, &cwdlen, &cwd);
 }
 
 static const char * const annotate_usage[] = {
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85..a3d8bf6 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
 extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index d3a6e18..02b09ea 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -14,3 +14,4 @@
 perf-top			mainporcelain common
 perf-trace			mainporcelain common
 perf-probe			mainporcelain common
+perf-kmem			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82ac..cf64049 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
-		{ "help", cmd_help, 0 },
-		{ "list", cmd_list, 0 },
 		{ "buildid-list", cmd_buildid_list, 0 },
-		{ "record", cmd_record, 0 },
-		{ "report", cmd_report, 0 },
-		{ "bench", cmd_bench, 0 },
-		{ "stat", cmd_stat, 0 },
-		{ "timechart", cmd_timechart, 0 },
-		{ "top", cmd_top, 0 },
-		{ "annotate", cmd_annotate, 0 },
-		{ "version", cmd_version, 0 },
-		{ "trace", cmd_trace, 0 },
-		{ "sched", cmd_sched, 0 },
-		{ "probe", cmd_probe, 0 },
+		{ "help",	cmd_help,	0 },
+		{ "list",	cmd_list,	0 },
+		{ "record",	cmd_record,	0 },
+		{ "report",	cmd_report,	0 },
+		{ "bench",	cmd_bench,	0 },
+		{ "stat",	cmd_stat,	0 },
+		{ "timechart",	cmd_timechart,	0 },
+		{ "top",	cmd_top,	0 },
+		{ "annotate",	cmd_annotate,	0 },
+		{ "version",	cmd_version,	0 },
+		{ "trace",	cmd_trace,	0 },
+		{ "sched",	cmd_sched,	0 },
+		{ "probe",	cmd_probe,	0 },
+		{ "kmem",	cmd_kmem,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
index 0b791bd..3507362 100644
--- a/tools/perf/util/ctype.c
+++ b/tools/perf/util/ctype.c
@@ -29,3 +29,11 @@
 	A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0,		/* 112..127 */
 	/* Nothing in the 128.. range */
 };
+
+const char *graph_line =
+	"_____________________________________________________________________"
+	"_____________________________________________________________________";
+const char *graph_dotted_line =
+	"---------------------------------------------------------------------"
+	"---------------------------------------------------------------------"
+	"---------------------------------------------------------------------";
diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c
index 14cb846..f318d19 100644
--- a/tools/perf/util/data_map.c
+++ b/tools/perf/util/data_map.c
@@ -101,12 +101,14 @@
 
 int mmap_dispatch_perf_file(struct perf_header **pheader,
 			    const char *input_name,
+			    const char *vmlinux_name,
+			    bool try_vmlinux_path,
 			    int force,
 			    int full_paths,
 			    int *cwdlen,
 			    char **cwd)
 {
-	int ret, rc = EXIT_FAILURE;
+	int err;
 	struct perf_header *header;
 	unsigned long head, shift;
 	unsigned long offset = 0;
@@ -118,56 +120,69 @@
 	int input;
 	char *buf;
 
-	if (!curr_handler)
-		die("Forgot to register perf file handler");
+	if (curr_handler == NULL) {
+		pr_debug("Forgot to register perf file handler\n");
+		return -EINVAL;
+	}
 
 	page_size = getpagesize();
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
-		fprintf(stderr, " failed to open file: %s", input_name);
+		pr_err("Failed to open file: %s", input_name);
 		if (!strcmp(input_name, "perf.data"))
-			fprintf(stderr, "  (try 'perf record' first)");
-		fprintf(stderr, "\n");
-		exit(-1);
+			pr_err("  (try 'perf record' first)");
+		pr_err("\n");
+		return -errno;
 	}
 
-	ret = fstat(input, &input_stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
+	if (fstat(input, &input_stat) < 0) {
+		pr_err("failed to stat file");
+		err = -errno;
+		goto out_close;
 	}
 
+	err = -EACCES;
 	if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
-		fprintf(stderr, "file: %s not owned by current user or root\n",
+		pr_err("file: %s not owned by current user or root\n",
 			input_name);
-		exit(-1);
+		goto out_close;
 	}
 
-	if (!input_stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
+	if (input_stat.st_size == 0) {
+		pr_info("zero-sized file, nothing to do!\n");
+		goto done;
 	}
 
-	*pheader = perf_header__read(input);
-	header = *pheader;
+	err = -ENOMEM;
+	header = perf_header__new();
+	if (header == NULL)
+		goto out_close;
+
+	err = perf_header__read(header, input);
+	if (err < 0)
+		goto out_delete;
+	*pheader = header;
 	head = header->data_offset;
 
 	sample_type = perf_header__sample_type(header);
 
-	if (curr_handler->sample_type_check)
-		if (curr_handler->sample_type_check(sample_type) < 0)
-			exit(-1);
+	err = -EINVAL;
+	if (curr_handler->sample_type_check &&
+	    curr_handler->sample_type_check(sample_type) < 0)
+		goto out_delete;
 
-	if (load_kernel(NULL) < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
+	err = -ENOMEM;
+	if (kernel_maps__init(vmlinux_name, try_vmlinux_path, true) < 0) {
+		pr_err("failed to setup the kernel maps to resolve symbols\n");
+		goto out_delete;
 	}
 
 	if (!full_paths) {
 		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
+			pr_err("failed to get the current directory\n");
+			err = -errno;
+			goto out_delete;
 		}
 		*cwd = __cwd;
 		*cwdlen = strlen(*cwd);
@@ -181,11 +196,12 @@
 	head -= shift;
 
 remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
+	buf = mmap(NULL, page_size * mmap_window, PROT_READ,
+		   MAP_SHARED, input, offset);
 	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
+		pr_err("failed to mmap file\n");
+		err = -errno;
+		goto out_delete;
 	}
 
 more:
@@ -242,10 +258,12 @@
 		goto more;
 
 done:
-	rc = EXIT_SUCCESS;
+	err = 0;
+out_close:
 	close(input);
 
-	return rc;
+	return err;
+out_delete:
+	perf_header__delete(header);
+	goto out_close;
 }
-
-
diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h
index ae036ec..3f0d21b 100644
--- a/tools/perf/util/data_map.h
+++ b/tools/perf/util/data_map.h
@@ -23,6 +23,8 @@
 void register_perf_file_handler(struct perf_file_handler *handler);
 int mmap_dispatch_perf_file(struct perf_header **pheader,
 			    const char *input_name,
+			    const char *vmlinux_name,
+			    bool try_vmlinux_path,
 			    int force,
 			    int full_paths,
 			    int *cwdlen,
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 1f771ce..f1e3926 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -69,13 +69,6 @@
 	char			 filename[];
 };
 
-struct build_id_list {
-	struct build_id_event	event;
-	struct list_head	list;
-	const char		*dso_name;
-	int			len;
-};
-
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
@@ -122,10 +115,13 @@
 void map__init(struct map *self, u64 start, u64 end, u64 pgoff,
 	       struct dso *dso);
 struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen);
+void map__delete(struct map *self);
 struct map *map__clone(struct map *self);
 int map__overlap(struct map *l, struct map *r);
 size_t map__fprintf(struct map *self, FILE *fp);
 struct symbol *map__find_symbol(struct map *self, u64 ip, symbol_filter_t filter);
+void map__fixup_start(struct map *self);
+void map__fixup_end(struct map *self);
 
 int event__synthesize_thread(pid_t pid, int (*process)(event_t *event));
 void event__synthesize_threads(int (*process)(event_t *event));
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b01a953..1332f8e 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -78,16 +78,24 @@
 	return self;
 }
 
+void perf_header__delete(struct perf_header *self)
+{
+	int i;
+
+	for (i = 0; i < self->attrs; ++i)
+		perf_header_attr__delete(self->attr[i]);
+
+	free(self->attr);
+	free(self);
+}
+
 int perf_header__add_attr(struct perf_header *self,
 			  struct perf_header_attr *attr)
 {
-	int pos = self->attrs;
-
 	if (self->frozen)
 		return -1;
 
-	self->attrs++;
-	if (self->attrs > self->size) {
+	if (self->attrs == self->size) {
 		int nsize = self->size * 2;
 		struct perf_header_attr **nattr;
 
@@ -98,7 +106,8 @@
 		self->size = nsize;
 		self->attr = nattr;
 	}
-	self->attr[pos] = attr;
+
+	self->attr[self->attrs++] = attr;
 	return 0;
 }
 
@@ -167,7 +176,7 @@
 		int ret = write(fd, buf, size);
 
 		if (ret < 0)
-			return -1;
+			return -errno;
 
 		size -= ret;
 		buf += ret;
@@ -176,43 +185,51 @@
 	return 0;
 }
 
-static int write_buildid_table(int fd, struct list_head *id_head)
+static int dsos__write_buildid_table(int fd)
 {
-	struct build_id_list *iter, *next;
+	struct dso *pos;
 
-	list_for_each_entry_safe(iter, next, id_head, list) {
-		struct build_id_event *b = &iter->event;
+	list_for_each_entry(pos, &dsos, node) {
+		int err;
+		struct build_id_event b;
+		size_t len;
 
-		if (do_write(fd, b, sizeof(*b)) < 0 ||
-		    do_write(fd, iter->dso_name, iter->len) < 0)
-			return -1;
-		list_del(&iter->list);
-		free(iter);
+		if (!pos->has_build_id)
+			continue;
+		len = pos->long_name_len + 1;
+		len = ALIGN(len, 64);
+		memset(&b, 0, sizeof(b));
+		memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
+		b.header.size = sizeof(b) + len;
+		err = do_write(fd, &b, sizeof(b));
+		if (err < 0)
+			return err;
+		err = do_write(fd, pos->long_name, len);
+		if (err < 0)
+			return err;
 	}
 
 	return 0;
 }
 
-static void
-perf_header__adds_write(struct perf_header *self, int fd)
+static int perf_header__adds_write(struct perf_header *self, int fd)
 {
-	LIST_HEAD(id_list);
 	int nr_sections;
 	struct perf_file_section *feat_sec;
 	int sec_size;
 	u64 sec_start;
-	int idx = 0;
+	int idx = 0, err;
 
-	if (fetch_build_id_table(&id_list))
+	if (dsos__read_build_ids())
 		perf_header__set_feat(self, HEADER_BUILD_ID);
 
 	nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
 	if (!nr_sections)
-		return;
+		return 0;
 
 	feat_sec = calloc(sizeof(*feat_sec), nr_sections);
-	if (!feat_sec)
-		die("No memory");
+	if (feat_sec == NULL)
+		return -ENOMEM;
 
 	sec_size = sizeof(*feat_sec) * nr_sections;
 
@@ -236,25 +253,37 @@
 
 		buildid_sec = &feat_sec[idx++];
 
+		/*
+		 * Read the kernel buildid and the list of loaded modules with
+		 * their build_ids:
+		 */
+		kernel_maps__init(NULL, false, true);
+
 		/* Write build-ids */
 		buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
-		if (write_buildid_table(fd, &id_list) < 0)
-			die("failed to write buildid table");
+		err = dsos__write_buildid_table(fd);
+		if (err < 0) {
+			pr_debug("failed to write buildid table\n");
+			goto out_free;
+		}
 		buildid_sec->size = lseek(fd, 0, SEEK_CUR) - buildid_sec->offset;
 	}
 
 	lseek(fd, sec_start, SEEK_SET);
-	if (do_write(fd, feat_sec, sec_size) < 0)
-		die("failed to write feature section");
+	err = do_write(fd, feat_sec, sec_size);
+	if (err < 0)
+		pr_debug("failed to write feature section\n");
+out_free:
 	free(feat_sec);
+	return err;
 }
 
-void perf_header__write(struct perf_header *self, int fd, bool at_exit)
+int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 {
 	struct perf_file_header f_header;
 	struct perf_file_attr   f_attr;
 	struct perf_header_attr	*attr;
-	int i;
+	int i, err;
 
 	lseek(fd, sizeof(f_header), SEEK_SET);
 
@@ -263,8 +292,11 @@
 		attr = self->attr[i];
 
 		attr->id_offset = lseek(fd, 0, SEEK_CUR);
-		if (do_write(fd, attr->id, attr->ids * sizeof(u64)) < 0)
-			die("failed to write perf header");
+		err = do_write(fd, attr->id, attr->ids * sizeof(u64));
+		if (err < 0) {
+			pr_debug("failed to write perf header\n");
+			return err;
+		}
 	}
 
 
@@ -280,20 +312,30 @@
 				.size   = attr->ids * sizeof(u64),
 			}
 		};
-		if (do_write(fd, &f_attr, sizeof(f_attr)) < 0)
-			die("failed to write perf header attribute");
+		err = do_write(fd, &f_attr, sizeof(f_attr));
+		if (err < 0) {
+			pr_debug("failed to write perf header attribute\n");
+			return err;
+		}
 	}
 
 	self->event_offset = lseek(fd, 0, SEEK_CUR);
 	self->event_size = event_count * sizeof(struct perf_trace_event_type);
-	if (events)
-		if (do_write(fd, events, self->event_size) < 0)
-			die("failed to write perf header events");
+	if (events) {
+		err = do_write(fd, events, self->event_size);
+		if (err < 0) {
+			pr_debug("failed to write perf header events\n");
+			return err;
+		}
+	}
 
 	self->data_offset = lseek(fd, 0, SEEK_CUR);
 
-	if (at_exit)
-		perf_header__adds_write(self, fd);
+	if (at_exit) {
+		err = perf_header__adds_write(self, fd);
+		if (err < 0)
+			return err;
+	}
 
 	f_header = (struct perf_file_header){
 		.magic	   = PERF_MAGIC,
@@ -316,11 +358,15 @@
 	memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features));
 
 	lseek(fd, 0, SEEK_SET);
-	if (do_write(fd, &f_header, sizeof(f_header)) < 0)
-		die("failed to write perf header");
+	err = do_write(fd, &f_header, sizeof(f_header));
+	if (err < 0) {
+		pr_debug("failed to write perf header\n");
+		return err;
+	}
 	lseek(fd, self->data_offset + self->data_size, SEEK_SET);
 
 	self->frozen = 1;
+	return 0;
 }
 
 static void do_read(int fd, void *buf, size_t size)
@@ -430,19 +476,17 @@
 	return 0;
 }
 
-struct perf_header *perf_header__read(int fd)
+int perf_header__read(struct perf_header *self, int fd)
 {
-	struct perf_header	*self = perf_header__new();
 	struct perf_file_header f_header;
 	struct perf_file_attr	f_attr;
 	u64			f_id;
 	int nr_attrs, nr_ids, i, j;
 
-	if (self == NULL)
-		die("nomem");
-
-	if (perf_file_header__read(&f_header, self, fd) < 0)
-		die("incompatible file format");
+	if (perf_file_header__read(&f_header, self, fd) < 0) {
+		pr_debug("incompatible file format\n");
+		return -EINVAL;
+	}
 
 	nr_attrs = f_header.attrs.size / sizeof(f_attr);
 	lseek(fd, f_header.attrs.offset, SEEK_SET);
@@ -456,7 +500,7 @@
 
 		attr = perf_header_attr__new(&f_attr.attr);
 		if (attr == NULL)
-			 die("nomem");
+			 return -ENOMEM;
 
 		nr_ids = f_attr.ids.size / sizeof(u64);
 		lseek(fd, f_attr.ids.offset, SEEK_SET);
@@ -464,11 +508,15 @@
 		for (j = 0; j < nr_ids; j++) {
 			do_read(fd, &f_id, sizeof(f_id));
 
-			if (perf_header_attr__add_id(attr, f_id) < 0)
-				die("nomem");
+			if (perf_header_attr__add_id(attr, f_id) < 0) {
+				perf_header_attr__delete(attr);
+				return -ENOMEM;
+			}
 		}
-		if (perf_header__add_attr(self, attr) < 0)
-			 die("nomem");
+		if (perf_header__add_attr(self, attr) < 0) {
+			perf_header_attr__delete(attr);
+			return -ENOMEM;
+		}
 
 		lseek(fd, tmp, SEEK_SET);
 	}
@@ -476,8 +524,8 @@
 	if (f_header.event_types.size) {
 		lseek(fd, f_header.event_types.offset, SEEK_SET);
 		events = malloc(f_header.event_types.size);
-		if (!events)
-			die("nomem");
+		if (events == NULL)
+			return -ENOMEM;
 		do_read(fd, events, f_header.event_types.size);
 		event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
 	}
@@ -487,8 +535,7 @@
 	lseek(fd, self->data_offset, SEEK_SET);
 
 	self->frozen = 1;
-
-	return self;
+	return 0;
 }
 
 u64 perf_header__sample_type(struct perf_header *header)
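The build-id feature section written by dsos__write_buildid_table() is just a concatenation of records: a fixed-size build_id_event followed by the DSO's long name, padded to a 64-byte multiple. A sketch of the per-record size computation (dso_long_name stands for pos->long_name above, and ALIGN() is assumed to round up as in the kernel headers):

	/* sketch: on-disk size of one build-id record */
	size_t len = strlen(dso_long_name) + 1;			/* keep the NUL */
	len = ALIGN(len, 64);					/* pad the name field */
	size_t record_size = sizeof(struct build_id_event) + len;
	/* b.header.size is set to record_size before the two do_write() calls */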
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index f46a94e..d1dbe2b 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -55,8 +55,11 @@
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
-struct perf_header *perf_header__read(int fd);
-void perf_header__write(struct perf_header *self, int fd, bool at_exit);
+struct perf_header *perf_header__new(void);
+void perf_header__delete(struct perf_header *self);
+
+int perf_header__read(struct perf_header *self, int fd);
+int perf_header__write(struct perf_header *self, int fd, bool at_exit);
 
 int perf_header__add_attr(struct perf_header *self,
 			  struct perf_header_attr *attr);
@@ -75,8 +78,6 @@
 void perf_header__set_feat(struct perf_header *self, int feat);
 bool perf_header__has_feat(const struct perf_header *self, int feat);
 
-struct perf_header *perf_header__new(void);
-
 int perf_header__process_sections(struct perf_header *self, int fd,
 				  int (*process)(struct perf_file_section *self,
 						 int feat, int fd));
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index ace57c3..8d63116 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -7,6 +7,8 @@
 #define CONFIG_GENERIC_FIND_FIRST_BIT
 #include "../../../../include/linux/bitops.h"
 
+#undef __KERNEL__
+
 static inline void set_bit(int nr, unsigned long *addr)
 {
 	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 94ca950..0941232 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -75,6 +75,29 @@
 	return NULL;
 }
 
+void map__delete(struct map *self)
+{
+	free(self);
+}
+
+void map__fixup_start(struct map *self)
+{
+	struct rb_node *nd = rb_first(&self->dso->syms);
+	if (nd != NULL) {
+		struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+		self->start = sym->start;
+	}
+}
+
+void map__fixup_end(struct map *self)
+{
+	struct rb_node *nd = rb_last(&self->dso->syms);
+	if (nd != NULL) {
+		struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+		self->end = sym->end;
+	}
+}
+
 #define DSO__DELETED "(deleted)"
 
 struct symbol *
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 0faf4f2..0700274 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,4 +1,4 @@
-
+#include "../../../include/linux/hw_breakpoint.h"
 #include "util.h"
 #include "../perf.h"
 #include "parse-options.h"
@@ -540,6 +540,81 @@
 						     attr, strp);
 }
 
+static enum event_result
+parse_breakpoint_type(const char *type, const char **strp,
+		      struct perf_event_attr *attr)
+{
+	int i;
+
+	for (i = 0; i < 3; i++) {
+		if (!type[i])
+			break;
+
+		switch (type[i]) {
+		case 'r':
+			attr->bp_type |= HW_BREAKPOINT_R;
+			break;
+		case 'w':
+			attr->bp_type |= HW_BREAKPOINT_W;
+			break;
+		case 'x':
+			attr->bp_type |= HW_BREAKPOINT_X;
+			break;
+		default:
+			return EVT_FAILED;
+		}
+	}
+	if (!attr->bp_type) /* Default */
+		attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
+
+	*strp = type + i;
+
+	return EVT_HANDLED;
+}
+
+static enum event_result
+parse_breakpoint_event(const char **strp, struct perf_event_attr *attr)
+{
+	const char *target;
+	const char *type;
+	char *endaddr;
+	u64 addr;
+	enum event_result err;
+
+	target = strchr(*strp, ':');
+	if (!target)
+		return EVT_FAILED;
+
+	if (strncmp(*strp, "mem", target - *strp) != 0)
+		return EVT_FAILED;
+
+	target++;
+
+	addr = strtoull(target, &endaddr, 0);
+	if (target == endaddr)
+		return EVT_FAILED;
+
+	attr->bp_addr = addr;
+	*strp = endaddr;
+
+	type = strchr(target, ':');
+
+	/* If no type is defined, default to rw */
+	if (!type) {
+		attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
+	} else {
+		err = parse_breakpoint_type(++type, strp, attr);
+		if (err == EVT_FAILED)
+			return EVT_FAILED;
+	}
+
+	/* We should find a nice way to override the access length */
+	attr->bp_len = HW_BREAKPOINT_LEN_4;
+	attr->type = PERF_TYPE_BREAKPOINT;
+
+	return EVT_HANDLED;
+}
+
 static int check_events(const char *str, unsigned int i)
 {
 	int n;
@@ -673,6 +748,10 @@
 	if (ret != EVT_FAILED)
 		goto modifier;
 
+	ret = parse_breakpoint_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
+
 	fprintf(stderr, "invalid or unsupported event: '%s'\n", *str);
 	fprintf(stderr, "Run 'perf list' for a list of valid events\n");
 	return EVT_FAILED;
@@ -859,6 +938,9 @@
 		"rNNN");
 	printf("\n");
 
+	printf("  %-42s [hardware breakpoint]\n", "mem:<addr>[:access]");
+	printf("\n");
+
 	print_tracepoint_events();
 
 	exit(129);
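With parse_breakpoint_event() wired into parse_events(), an event spec of the form mem:<addr>[:access] is translated into a PERF_TYPE_BREAKPOINT attribute. A sketch of the attribute produced for a hypothetical address, per the fields set above:

	/* e.g. "perf record -e mem:0xffffffff8103c340:rw" would roughly yield: */
	struct perf_event_attr attr = {
		.type	 = PERF_TYPE_BREAKPOINT,
		.bp_addr = 0xffffffff8103c340,			/* hypothetical address */
		.bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W,	/* also the default when no type is given */
		.bp_len	 = HW_BREAKPOINT_LEN_4,			/* hardcoded for now */
	};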
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 5cc96c8..44d81d5 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -9,8 +9,13 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
+#include <limits.h>
 #include <sys/utsname.h>
 
+#ifndef NT_GNU_BUILD_ID
+#define NT_GNU_BUILD_ID 3
+#endif
+
 enum dso_origin {
 	DSO__ORIG_KERNEL = 0,
 	DSO__ORIG_JAVA_JIT,
@@ -26,7 +31,11 @@
 static struct dso *dsos__find(const char *name);
 static struct map *map__new2(u64 start, struct dso *dso);
 static void kernel_maps__insert(struct map *map);
+static int dso__load_kernel_sym(struct dso *self, struct map *map,
+				symbol_filter_t filter);
 unsigned int symbol__priv_size;
+static int vmlinux_path__nr_entries;
+static char **vmlinux_path;
 
 static struct rb_root kernel_maps;
 
@@ -69,11 +78,11 @@
 		prev->end = curr->start - 1;
 	}
 
-	nd = rb_last(&curr->dso->syms);
-	if (nd) {
-		struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
-		curr->end = sym->end;
-	}
+	/*
+	 * We still don't have the actual symbols, so guess the
+	 * last map's final address.
+	 */
+	curr->end = ~0UL;
 }
 
 static struct symbol *symbol__new(u64 start, u64 len, const char *name)
@@ -111,6 +120,8 @@
 
 static void dso__set_long_name(struct dso *self, char *name)
 {
+	if (name == NULL)
+		return;
 	self->long_name = name;
 	self->long_name_len = strlen(name);
 }
@@ -323,7 +334,7 @@
  * kernel range is broken in several maps, named [kernel].N, as we don't have
  * the original ELF section names vmlinux have.
  */
-static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules)
+static int kernel_maps__split_kallsyms(symbol_filter_t filter)
 {
 	struct map *map = kernel_map;
 	struct symbol *pos;
@@ -339,9 +350,6 @@
 
 		module = strchr(pos->name, '\t');
 		if (module) {
-			if (!use_modules)
-				goto delete_symbol;
-
 			*module++ = '\0';
 
 			if (strcmp(map->dso->name, module)) {
@@ -381,7 +389,6 @@
 		}
 
 		if (filter && filter(map, pos)) {
-delete_symbol:
 			rb_erase(&pos->rb_node, &kernel_map->dso->syms);
 			symbol__delete(pos);
 		} else {
@@ -397,17 +404,18 @@
 }
 
 
-static int kernel_maps__load_kallsyms(symbol_filter_t filter, int use_modules)
+static int kernel_maps__load_kallsyms(symbol_filter_t filter)
 {
 	if (kernel_maps__load_all_kallsyms())
 		return -1;
 
 	dso__fixup_sym_end(kernel_map->dso);
+	kernel_map->dso->origin = DSO__ORIG_KERNEL;
 
-	return kernel_maps__split_kallsyms(filter, use_modules);
+	return kernel_maps__split_kallsyms(filter);
 }
 
-static size_t kernel_maps__fprintf(FILE *fp)
+size_t kernel_maps__fprintf(FILE *fp)
 {
 	size_t printed = fprintf(fp, "Kernel maps:\n");
 	struct rb_node *nd;
@@ -883,47 +891,40 @@
 	return err;
 }
 
-bool fetch_build_id_table(struct list_head *head)
+static bool dso__build_id_equal(const struct dso *self, u8 *build_id)
 {
-	bool have_buildid = false;
+	return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0;
+}
+
+bool dsos__read_build_ids(void)
+{
+	bool have_build_id = false;
 	struct dso *pos;
 
-	list_for_each_entry(pos, &dsos, node) {
-		struct build_id_list *new;
-		struct build_id_event b;
-		size_t len;
+	list_for_each_entry(pos, &dsos, node)
+		if (filename__read_build_id(pos->long_name, pos->build_id,
+					    sizeof(pos->build_id)) > 0) {
+			have_build_id	  = true;
+			pos->has_build_id = true;
+		}
 
-		if (filename__read_build_id(pos->long_name,
-					    &b.build_id,
-					    sizeof(b.build_id)) < 0)
-			continue;
-		have_buildid = true;
-		memset(&b.header, 0, sizeof(b.header));
-		len = pos->long_name_len + 1;
-		len = ALIGN(len, 64);
-		b.header.size = sizeof(b) + len;
-
-		new = malloc(sizeof(*new));
-		if (!new)
-			die("No memory\n");
-
-		memcpy(&new->event, &b, sizeof(b));
-		new->dso_name = pos->long_name;
-		new->len = len;
-
-		list_add_tail(&new->list, head);
-	}
-
-	return have_buildid;
+	return have_build_id;
 }
 
+/*
+ * Align offset to 4 bytes as needed for note name and descriptor data.
+ */
+#define NOTE_ALIGN(n) (((n) + 3) & -4U)
+
 int filename__read_build_id(const char *filename, void *bf, size_t size)
 {
 	int fd, err = -1;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr;
-	Elf_Data *build_id_data;
+	Elf_Data *data;
 	Elf_Scn *sec;
+	Elf_Kind ek;
+	void *ptr;
 	Elf *elf;
 
 	if (size < BUILD_ID_SIZE)
@@ -939,6 +940,10 @@
 		goto out_close;
 	}
 
+	ek = elf_kind(elf);
+	if (ek != ELF_K_ELF)
+		goto out_elf_end;
+
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
 		pr_err("%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
@@ -946,14 +951,37 @@
 
 	sec = elf_section_by_name(elf, &ehdr, &shdr,
 				  ".note.gnu.build-id", NULL);
-	if (sec == NULL)
+	if (sec == NULL) {
+		sec = elf_section_by_name(elf, &ehdr, &shdr,
+					  ".notes", NULL);
+		if (sec == NULL)
+			goto out_elf_end;
+	}
+
+	data = elf_getdata(sec, NULL);
+	if (data == NULL)
 		goto out_elf_end;
 
-	build_id_data = elf_getdata(sec, NULL);
-	if (build_id_data == NULL)
-		goto out_elf_end;
-	memcpy(bf, build_id_data->d_buf + 16, BUILD_ID_SIZE);
-	err = BUILD_ID_SIZE;
+	ptr = data->d_buf;
+	while (ptr < (data->d_buf + data->d_size)) {
+		GElf_Nhdr *nhdr = ptr;
+		int namesz = NOTE_ALIGN(nhdr->n_namesz),
+		    descsz = NOTE_ALIGN(nhdr->n_descsz);
+		const char *name;
+
+		ptr += sizeof(*nhdr);
+		name = ptr;
+		ptr += namesz;
+		if (nhdr->n_type == NT_GNU_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU")) {
+			if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
+				memcpy(bf, ptr, BUILD_ID_SIZE);
+				err = BUILD_ID_SIZE;
+				break;
+			}
+		}
+		ptr += descsz;
+	}
 out_elf_end:
 	elf_end(elf);
 out_close:
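The loop above walks standard ELF notes: each entry is a GElf_Nhdr followed by the name and the descriptor, both padded to 4 bytes by NOTE_ALIGN(), and the build-id is the descriptor of the "GNU"/NT_GNU_BUILD_ID note. A sketch of the entry being matched (the descriptor size is assumed to be the usual 20-byte SHA-1):

/*
 *	n_namesz = 4			("GNU" plus NUL)
 *	n_descsz = 20			(assumed SHA-1 build-id size)
 *	n_type	 = NT_GNU_BUILD_ID	(3)
 *	name	 = "GNU\0"		padded to NOTE_ALIGN(n_namesz)
 *	desc	 = <build-id bytes>	padded to NOTE_ALIGN(n_descsz)
 */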
@@ -962,23 +990,48 @@
 	return err;
 }
 
-static char *dso__read_build_id(struct dso *self)
+int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
 {
-	int len;
-	char *build_id = NULL;
-	unsigned char rawbf[BUILD_ID_SIZE];
+	int fd, err = -1;
 
-	len = filename__read_build_id(self->long_name, rawbf, sizeof(rawbf));
-	if (len < 0)
+	if (size < BUILD_ID_SIZE)
 		goto out;
 
-	build_id = malloc(len * 2 + 1);
-	if (build_id == NULL)
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
 		goto out;
 
-	build_id__sprintf(rawbf, len, build_id);
+	while (1) {
+		char bf[BUFSIZ];
+		GElf_Nhdr nhdr;
+		int namesz, descsz;
+
+		if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr))
+			break;
+
+		namesz = NOTE_ALIGN(nhdr.n_namesz);
+		descsz = NOTE_ALIGN(nhdr.n_descsz);
+		if (nhdr.n_type == NT_GNU_BUILD_ID &&
+		    nhdr.n_namesz == sizeof("GNU")) {
+			if (read(fd, bf, namesz) != namesz)
+				break;
+			if (memcmp(bf, "GNU", sizeof("GNU")) == 0) {
+				if (read(fd, build_id,
+				    BUILD_ID_SIZE) == BUILD_ID_SIZE) {
+					err = 0;
+					break;
+				}
+			} else if (read(fd, bf, descsz) != descsz)
+				break;
+		} else {
+			int n = namesz + descsz;
+			if (read(fd, bf, n) != n)
+				break;
+		}
+	}
+	close(fd);
 out:
-	return build_id;
+	return err;
 }
 
 char dso__symtab_origin(const struct dso *self)
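sysfs__read_build_id() parses the same note format, but straight from sysfs rather than from an ELF image; the kernel exposes its own notes at /sys/kernel/notes and each module's at /sys/module/<name>/notes/.note.gnu.build-id, which is where the callers added further down read from. A minimal standalone sketch:

	/* sketch: fetch the running kernel's build-id from sysfs */
	u8 build_id[BUILD_ID_SIZE];

	if (sysfs__read_build_id("/sys/kernel/notes", build_id,
				 sizeof(build_id)) == 0)
		/* raw id is now in build_id; printable via build_id__sprintf() */;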
@@ -1001,12 +1054,17 @@
 int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 {
 	int size = PATH_MAX;
-	char *name = malloc(size), *build_id = NULL;
+	char *name;
+	u8 build_id[BUILD_ID_SIZE];
 	int ret = -1;
 	int fd;
 
 	self->loaded = 1;
 
+	if (self->kernel)
+		return dso__load_kernel_sym(self, map, filter);
+
+	name = malloc(size);
 	if (!name)
 		return -1;
 
@@ -1023,8 +1081,6 @@
 
 more:
 	do {
-		int berr = 0;
-
 		self->origin++;
 		switch (self->origin) {
 		case DSO__ORIG_FEDORA:
@@ -1036,12 +1092,18 @@
 				 self->long_name);
 			break;
 		case DSO__ORIG_BUILDID:
-			build_id = dso__read_build_id(self);
-			if (build_id != NULL) {
+			if (filename__read_build_id(self->long_name, build_id,
+						    sizeof(build_id)) > 0) {
+				char build_id_hex[BUILD_ID_SIZE * 2 + 1];
+
+				build_id__sprintf(build_id, sizeof(build_id),
+						  build_id_hex);
 				snprintf(name, size,
 					 "/usr/lib/debug/.build-id/%.2s/%s.debug",
-					build_id, build_id + 2);
-				goto compare_build_id;
+					build_id_hex, build_id_hex + 2);
+				if (self->has_build_id)
+					goto compare_build_id;
+				break;
 			}
 			self->origin++;
 			/* Fall thru */
@@ -1054,18 +1116,11 @@
 		}
 
 		if (self->has_build_id) {
-			bool match;
-			build_id = malloc(BUILD_ID_SIZE);
-			if (build_id == NULL)
+			if (filename__read_build_id(name, build_id,
+						    sizeof(build_id)) < 0)
 				goto more;
-			berr = filename__read_build_id(name, build_id,
-						       BUILD_ID_SIZE);
 compare_build_id:
-			match = berr > 0 && memcmp(build_id, self->build_id,
-						   sizeof(self->build_id)) == 0;
-			free(build_id);
-			build_id = NULL;
-			if (!match)
+			if (!dso__build_id_equal(self, build_id))
 				goto more;
 		}
 
@@ -1100,7 +1155,8 @@
 	maps__insert(&kernel_maps, map);
 }
 
-struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp)
+struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp,
+					symbol_filter_t filter)
 {
 	struct map *map = maps__find(&kernel_maps, ip);
 
@@ -1109,7 +1165,7 @@
 
 	if (map) {
 		ip = map->map_ip(map, ip);
-		return map->dso->find_symbol(map->dso, ip);
+		return map__find_symbol(map, ip, filter);
 	}
 
 	return NULL;
@@ -1129,32 +1185,13 @@
 	return NULL;
 }
 
-static int dso__load_module_sym(struct dso *self, struct map *map,
-				symbol_filter_t filter)
-{
-	int err = 0, fd = open(self->long_name, O_RDONLY);
-
-	self->loaded = 1;
-
-	if (fd < 0) {
-		pr_err("%s: cannot open %s\n", __func__, self->long_name);
-		return err;
-	}
-
-	err = dso__load_sym(self, map, self->long_name, fd, filter, 0, 1);
-	close(fd);
-
-	return err;
-}
-
-static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter)
+static int dsos__set_modules_path_dir(char *dirname)
 {
 	struct dirent *dent;
-	int nr_symbols = 0, err;
 	DIR *dir = opendir(dirname);
 
 	if (!dir) {
-		pr_err("%s: cannot open %s dir\n", __func__, dirname);
+		pr_debug("%s: cannot open %s dir\n", __func__, dirname);
 		return -1;
 	}
 
@@ -1168,14 +1205,12 @@
 
 			snprintf(path, sizeof(path), "%s/%s",
 				 dirname, dent->d_name);
-			err = dsos__load_modules_sym_dir(path, filter);
-			if (err < 0)
+			if (dsos__set_modules_path_dir(path) < 0)
 				goto failure;
 		} else {
 			char *dot = strrchr(dent->d_name, '.'),
 			     dso_name[PATH_MAX];
 			struct map *map;
-			struct rb_node *last;
 			char *long_name;
 
 			if (dot == NULL || strcmp(dot, ".ko"))
@@ -1195,36 +1230,16 @@
 			if (long_name == NULL)
 				goto failure;
 			dso__set_long_name(map->dso, long_name);
-			dso__set_basename(map->dso);
-
-			err = dso__load_module_sym(map->dso, map, filter);
-			if (err < 0)
-				goto failure;
-			last = rb_last(&map->dso->syms);
-			if (last) {
-				struct symbol *sym;
-				/*
-				 * We do this here as well, even having the
-				 * symbol size found in the symtab because
-				 * misannotated ASM symbols may have the size
-				 * set to zero.
-				 */
-				dso__fixup_sym_end(map->dso);
-
-				sym = rb_entry(last, struct symbol, rb_node);
-				map->end = map->start + sym->end;
-			}
 		}
-		nr_symbols += err;
 	}
 
-	return nr_symbols;
+	return 0;
 failure:
 	closedir(dir);
 	return -1;
 }
 
-static int dsos__load_modules_sym(symbol_filter_t filter)
+static int dsos__set_modules_path(void)
 {
 	struct utsname uts;
 	char modules_path[PATH_MAX];
@@ -1235,7 +1250,7 @@
 	snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel",
 		 uts.release);
 
-	return dsos__load_modules_sym_dir(modules_path, filter);
+	return dsos__set_modules_path_dir(modules_path);
 }
 
 /*
@@ -1257,7 +1272,7 @@
 	return self;
 }
 
-static int dsos__load_modules(void)
+static int kernel_maps__create_module_maps(void)
 {
 	char *line = NULL;
 	size_t n;
@@ -1307,6 +1322,12 @@
 			goto out_delete_line;
 		}
 
+		snprintf(name, sizeof(name),
+			 "/sys/module/%s/notes/.note.gnu.build-id", line);
+		if (sysfs__read_build_id(name, dso->build_id,
+					 sizeof(dso->build_id)) == 0)
+			dso->has_build_id = true;
+
 		dso->origin = DSO__ORIG_KMODULE;
 		kernel_maps__insert(map);
 		dsos__add(dso);
@@ -1315,7 +1336,7 @@
 	free(line);
 	fclose(file);
 
-	return 0;
+	return dsos__set_modules_path();
 
 out_delete_line:
 	free(line);
@@ -1326,13 +1347,37 @@
 static int dso__load_vmlinux(struct dso *self, struct map *map,
 			     const char *vmlinux, symbol_filter_t filter)
 {
-	int err, fd = open(vmlinux, O_RDONLY);
+	int err = -1, fd;
 
-	self->loaded = 1;
+	if (self->has_build_id) {
+		u8 build_id[BUILD_ID_SIZE];
 
+		if (filename__read_build_id(vmlinux, build_id,
+					    sizeof(build_id)) < 0) {
+			pr_debug("No build_id in %s, ignoring it\n", vmlinux);
+			return -1;
+		}
+		if (!dso__build_id_equal(self, build_id)) {
+			char expected_build_id[BUILD_ID_SIZE * 2 + 1],
+			     vmlinux_build_id[BUILD_ID_SIZE * 2 + 1];
+
+			build_id__sprintf(self->build_id,
+					  sizeof(self->build_id),
+					  expected_build_id);
+			build_id__sprintf(build_id, sizeof(build_id),
+					  vmlinux_build_id);
+			pr_debug("build_id in %s is %s while expected is %s, "
+				 "ignoring it\n", vmlinux, vmlinux_build_id,
+				 expected_build_id);
+			return -1;
+		}
+	}
+
+	fd = open(vmlinux, O_RDONLY);
 	if (fd < 0)
 		return -1;
 
+	self->loaded = 1;
 	err = dso__load_sym(self, map, self->long_name, fd, filter, 1, 0);
 
 	close(fd);
@@ -1340,78 +1385,55 @@
 	return err;
 }
 
-int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter,
-		      int use_modules)
+static int dso__load_kernel_sym(struct dso *self, struct map *map,
+				symbol_filter_t filter)
 {
-	int err = -1;
-	struct dso *dso = dso__new(vmlinux);
+	int err;
+	bool is_kallsyms;
 
-	if (dso == NULL)
-		return -1;
-
-	dso->short_name = "[kernel]";
-	kernel_map = map__new2(0, dso);
-	if (kernel_map == NULL)
-		goto out_delete_dso;
-
-	kernel_map->map_ip = kernel_map->unmap_ip = identity__map_ip;
-
-	if (use_modules && dsos__load_modules() < 0) {
-		pr_warning("Failed to load list of modules in use! "
-			   "Continuing...\n");
-		use_modules = 0;
-	}
-
-	if (vmlinux) {
-		err = dso__load_vmlinux(dso, kernel_map, vmlinux, filter);
-		if (err > 0 && use_modules) {
-			int syms = dsos__load_modules_sym(filter);
-
-			if (syms < 0)
-				pr_warning("Failed to read module symbols!"
-					   " Continuing...\n");
-			else
-				err += syms;
+	if (vmlinux_path != NULL) {
+		int i;
+		pr_debug("Looking at the vmlinux_path (%d entries long)\n",
+			 vmlinux_path__nr_entries);
+		for (i = 0; i < vmlinux_path__nr_entries; ++i) {
+			err = dso__load_vmlinux(self, map, vmlinux_path[i],
+						filter);
+			if (err > 0) {
+				pr_debug("Using %s for symbols\n",
+					 vmlinux_path[i]);
+				dso__set_long_name(self,
+						   strdup(vmlinux_path[i]));
+				goto out_fixup;
+			}
 		}
 	}
 
-	if (err <= 0)
-		err = kernel_maps__load_kallsyms(filter, use_modules);
+	is_kallsyms = self->long_name[0] == '[';
+	if (is_kallsyms)
+		goto do_kallsyms;
+
+	err = dso__load_vmlinux(self, map, self->long_name, filter);
+	if (err <= 0) {
+		pr_info("The file %s cannot be used, "
+			"trying to use /proc/kallsyms...", self->long_name);
+		sleep(2);
+do_kallsyms:
+		err = kernel_maps__load_kallsyms(filter);
+		if (err > 0 && !is_kallsyms)
+			dso__set_long_name(self, strdup("[kernel.kallsyms]"));
+	}
 
 	if (err > 0) {
-		struct rb_node *node = rb_first(&dso->syms);
-		struct symbol *sym = rb_entry(node, struct symbol, rb_node);
-
-		kernel_map->start = sym->start;
-		node = rb_last(&dso->syms);
-		sym = rb_entry(node, struct symbol, rb_node);
-		kernel_map->end = sym->end;
-
-		dso->origin = DSO__ORIG_KERNEL;
-		kernel_maps__insert(kernel_map);
-		/*
-		 * Now that we have all sorted out, just set the ->end of all
-		 * maps:
-		 */
-		kernel_maps__fixup_end();
-		dsos__add(dso);
-
-		if (verbose)
-			kernel_maps__fprintf(stderr);
+out_fixup:
+		map__fixup_start(map);
+		map__fixup_end(map);
 	}
 
 	return err;
-
-out_delete_dso:
-	dso__delete(dso);
-	return -1;
 }
 
 LIST_HEAD(dsos);
-struct dso	*vdso;
-
-const char	*vmlinux_name = "vmlinux";
-int		modules;
+struct dso *vdso;
 
 static void dsos__add(struct dso *dso)
 {
@@ -1463,18 +1485,117 @@
 	return ret;
 }
 
-int load_kernel(symbol_filter_t filter)
+static int kernel_maps__create_kernel_map(const char *vmlinux_name)
 {
-	if (dsos__load_kernel(vmlinux_name, filter, modules) <= 0)
+	struct dso *kernel = dso__new(vmlinux_name ?: "[kernel.kallsyms]");
+
+	if (kernel == NULL)
 		return -1;
 
+	kernel_map = map__new2(0, kernel);
+	if (kernel_map == NULL)
+		goto out_delete_kernel_dso;
+
+	kernel_map->map_ip	 = kernel_map->unmap_ip = identity__map_ip;
+	kernel->short_name	 = "[kernel]";
+	kernel->kernel		 = 1;
+
 	vdso = dso__new("[vdso]");
-	if (!vdso)
-		return -1;
+	if (vdso == NULL)
+		goto out_delete_kernel_map;
 
+	if (sysfs__read_build_id("/sys/kernel/notes", kernel->build_id,
+				 sizeof(kernel->build_id)) == 0)
+		kernel->has_build_id = true;
+
+	kernel_maps__insert(kernel_map);
+	dsos__add(kernel);
 	dsos__add(vdso);
 
 	return 0;
+
+out_delete_kernel_map:
+	map__delete(kernel_map);
+	kernel_map = NULL;
+out_delete_kernel_dso:
+	dso__delete(kernel);
+	return -1;
+}
+
+static void vmlinux_path__exit(void)
+{
+	while (--vmlinux_path__nr_entries >= 0) {
+		free(vmlinux_path[vmlinux_path__nr_entries]);
+		vmlinux_path[vmlinux_path__nr_entries] = NULL;
+	}
+
+	free(vmlinux_path);
+	vmlinux_path = NULL;
+}
+
+static int vmlinux_path__init(void)
+{
+	struct utsname uts;
+	char bf[PATH_MAX];
+
+	if (uname(&uts) < 0)
+		return -1;
+
+	vmlinux_path = malloc(sizeof(char *) * 5);
+	if (vmlinux_path == NULL)
+		return -1;
+
+	vmlinux_path[vmlinux_path__nr_entries] = strdup("vmlinux");
+	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+		goto out_fail;
+	++vmlinux_path__nr_entries;
+	vmlinux_path[vmlinux_path__nr_entries] = strdup("/boot/vmlinux");
+	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+		goto out_fail;
+	++vmlinux_path__nr_entries;
+	snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
+	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+		goto out_fail;
+	++vmlinux_path__nr_entries;
+	snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release);
+	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+		goto out_fail;
+	++vmlinux_path__nr_entries;
+	snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux",
+		 uts.release);
+	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+		goto out_fail;
+	++vmlinux_path__nr_entries;
+
+	return 0;
+
+out_fail:
+	vmlinux_path__exit();
+	return -1;
+}
+
+int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path,
+		      bool use_modules)
+{
+	if (try_vmlinux_path && vmlinux_path__init() < 0)
+		return -1;
+
+	if (kernel_maps__create_kernel_map(vmlinux_name) < 0) {
+		vmlinux_path__exit();
+		return -1;
+	}
+
+	if (use_modules && kernel_maps__create_module_maps() < 0)
+		pr_debug("Failed to load list of modules in use, "
+			 "continuing...\n");
+	/*
+	 * Now that we have all the maps created, just set the ->end of them:
+	 */
+	kernel_maps__fixup_end();
+	return 0;
 }
 
 void symbol__init(unsigned int priv_size)
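Taken together, the symbol.c changes replace the eager dsos__load_kernel()/load_kernel() pair with kernel_maps__init() plus lazy loading through dso__load(). A sketch of the sequence a tool now follows (the priv size passed to symbol__init() is tool specific, 0 is only a placeholder here):

	symbol__init(0);
	if (kernel_maps__init(vmlinux_name, true, true) < 0)
		return -1;
	/*
	 * Kernel symbols are then resolved on demand: dso__load() sees
	 * dso->kernel and calls dso__load_kernel_sym(), which tries the
	 * vmlinux_path[] candidates, then dso->long_name, and finally
	 * falls back to /proc/kallsyms.
	 */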
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 5ad1019..8c4d026 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -64,6 +64,7 @@
 	u8		 slen_calculated:1;
 	u8		 loaded:1;
 	u8		 has_build_id:1;
+	u8		 kernel:1;
 	unsigned char	 origin;
 	u8		 build_id[BUILD_ID_SIZE];
 	u16		 long_name_len;
@@ -77,7 +78,6 @@
 
 struct symbol *dso__find_symbol(struct dso *self, u64 ip);
 
-int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter, int modules);
 struct dso *dsos__findnew(const char *name);
 int dso__load(struct dso *self, struct map *map, symbol_filter_t filter);
 void dsos__fprintf(FILE *fp);
@@ -89,16 +89,17 @@
 void dso__set_build_id(struct dso *self, void *build_id);
 
 int filename__read_build_id(const char *filename, void *bf, size_t size);
-bool fetch_build_id_table(struct list_head *head);
+int sysfs__read_build_id(const char *filename, void *bf, size_t size);
+bool dsos__read_build_ids(void);
 int build_id__sprintf(u8 *self, int len, char *bf);
 
-int load_kernel(symbol_filter_t filter);
+int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path,
+		      bool use_modules);
+size_t kernel_maps__fprintf(FILE *fp);
 
 void symbol__init(unsigned int priv_size);
 
 extern struct list_head dsos;
 extern struct map *kernel_map;
 extern struct dso *vdso;
-extern const char *vmlinux_name;
-extern int   modules;
 #endif /* __PERF_SYMBOL */
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 53addd7..e4b8d43 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -26,7 +26,8 @@
 void maps__insert(struct rb_root *maps, struct map *map);
 struct map *maps__find(struct rb_root *maps, u64 ip);
 
-struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp);
+struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp,
+					symbol_filter_t filter);
 struct map *kernel_maps__find_by_dso_name(const char *name);
 
 static inline struct map *thread__find_map(struct thread *self, u64 ip)
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 831052d..cace355 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -33,11 +33,11 @@
 #include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
+#include <linux/kernel.h>
 
 #include "../perf.h"
 #include "trace-event.h"
 
-
 #define VERSION "0.5"
 
 #define _STR(x) #x
@@ -483,23 +483,31 @@
 get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
 {
 	struct tracepoint_path path, *ppath = &path;
-	int i;
+	int i, nr_tracepoints = 0;
 
 	for (i = 0; i < nb_events; i++) {
 		if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
 			continue;
+		++nr_tracepoints;
 		ppath->next = tracepoint_id_to_path(pattrs[i].config);
 		if (!ppath->next)
 			die("%s\n", "No memory to alloc tracepoints list");
 		ppath = ppath->next;
 	}
 
-	return path.next;
+	return nr_tracepoints > 0 ? path.next : NULL;
 }
-void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
+
+int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
 {
 	char buf[BUFSIZ];
-	struct tracepoint_path *tps;
+	struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events);
+
+	/*
+	 * What? No tracepoints? No sense writing anything here, bail out.
+	 */
+	if (tps == NULL)
+		return -1;
 
 	output_fd = fd;
 
@@ -528,11 +536,11 @@
 	page_size = getpagesize();
 	write_or_die(&page_size, 4);
 
-	tps = get_tracepoints_path(pattrs, nb_events);
-
 	read_header_files();
 	read_ftrace_files(tps);
 	read_event_files(tps);
 	read_proc_kallsyms();
 	read_ftrace_printk();
+
+	return 0;
 }
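Since read_tracing_data() now returns an int and bails out when no tracepoint events were requested, callers are expected to check the result before treating tracing data as present. A hedged caller sketch:

	/* sketch: skip the tracing-data output when there are no tracepoints */
	if (read_tracing_data(fd, attrs, nb_events) < 0)
		pr_debug("no tracepoint events requested, no tracing data written\n");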
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 44292e0..342dfdd 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -471,11 +471,11 @@
 
 	read_or_die(buf, 3);
 	if (memcmp(buf, test, 3) != 0)
-		die("not an trace data file");
+		die("no trace data in the file");
 
 	read_or_die(buf, 7);
 	if (memcmp(buf, "tracing", 7) != 0)
-		die("not a trace file (missing tracing)");
+		die("not a trace file (missing 'tracing' tag)");
 
 	version = read_string();
 	if (show_version)
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index f6637c2..dd51c687 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -248,7 +248,7 @@
 raw_field_value(struct event *event, const char *name, void *data);
 void *raw_field_ptr(struct event *event, const char *name, void *data);
 
-void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
+int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
 
 /* taken from kernel/trace/trace.h */
 enum trace_flag_type {
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index f2203a0..e1c623e 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -84,6 +84,9 @@
 #include <iconv.h>
 #endif
 
+extern const char *graph_line;
+extern const char *graph_dotted_line;
+
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
  */