[PATCH] Kprobes: Use RCU for (un)register synchronization - base changes

Changes to the base kprobes infrastructure to use RCU for synchronization
during kprobe registration and unregistration.  These changes coupled with the
arch kprobe changes (next in series):

a. serialize registration and unregistration of kprobes.
b. enable lockless execution of handlers. Handlers can now run in parallel.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6720305..cff281c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -34,6 +34,8 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 #include <asm/kprobes.h>
 
@@ -146,10 +148,7 @@
 };
 
 #ifdef CONFIG_KPROBES
-/* Locks kprobe: irq must be disabled */
-void lock_kprobes(void);
-void unlock_kprobes(void);
-
+extern spinlock_t kretprobe_lock;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
@@ -160,7 +159,7 @@
 extern kprobe_opcode_t *get_insn_slot(void);
 extern void free_insn_slot(kprobe_opcode_t *slot);
 
-/* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
+/* Get the kprobe at this addr (if any) - called under a rcu_read_lock() */
 struct kprobe *get_kprobe(void *addr);
 struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6da8f9b..cfef426 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -32,7 +32,6 @@
  *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/kprobes.h>
-#include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -49,8 +48,8 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-unsigned int kprobe_cpu = NR_CPUS;
-static DEFINE_SPINLOCK(kprobe_lock);
+static DEFINE_SPINLOCK(kprobe_lock);	/* Protects kprobe_table */
+DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
 /*
@@ -153,41 +152,6 @@
 	}
 }
 
-/* Locks kprobe: irqs must be disabled */
-void __kprobes lock_kprobes(void)
-{
-	unsigned long flags = 0;
-
-	/* Avoiding local interrupts to happen right after we take the kprobe_lock
-	 * and before we get a chance to update kprobe_cpu, this to prevent
-	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
-	 * routine
-	 */
-	local_irq_save(flags);
-
-	spin_lock(&kprobe_lock);
-	kprobe_cpu = smp_processor_id();
-
- 	local_irq_restore(flags);
-}
-
-void __kprobes unlock_kprobes(void)
-{
-	unsigned long flags = 0;
-
-	/* Avoiding local interrupts to happen right after we update
-	 * kprobe_cpu and before we get a a chance to release kprobe_lock,
-	 * this to prevent deadlock when we have a kprobe on ISR routine and
-	 * a kprobe on task routine
-	 */
-	local_irq_save(flags);
-
-	kprobe_cpu = NR_CPUS;
-	spin_unlock(&kprobe_lock);
-
- 	local_irq_restore(flags);
-}
-
 /* We have preemption disabled.. so it is safe to use __ versions */
 static inline void set_kprobe_instance(struct kprobe *kp)
 {
@@ -199,15 +163,20 @@
 	__get_cpu_var(kprobe_instance) = NULL;
 }
 
-/* You have to be holding the kprobe_lock */
+/*
+ * This routine is called either:
+ * 	- under the kprobe_lock spinlock - during kprobe_[un]register()
+ * 				OR
+ * 	- under an rcu_read_lock() - from arch/xxx/kernel/kprobes.c
+ */
 struct kprobe __kprobes *get_kprobe(void *addr)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
+	struct kprobe *p;
 
 	head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
-	hlist_for_each(node, head) {
-		struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
+	hlist_for_each_entry_rcu(p, node, head, hlist) {
 		if (p->addr == addr)
 			return p;
 	}
@@ -222,7 +191,7 @@
 {
 	struct kprobe *kp;
 
-	list_for_each_entry(kp, &p->list, list) {
+	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->pre_handler) {
 			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
@@ -238,7 +207,7 @@
 {
 	struct kprobe *kp;
 
-	list_for_each_entry(kp, &p->list, list) {
+	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->post_handler) {
 			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
@@ -277,6 +246,7 @@
 	return ret;
 }
 
+/* Called with kretprobe_lock held */
 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
@@ -286,6 +256,7 @@
 	return NULL;
 }
 
+/* Called with kretprobe_lock held */
 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
 							      *rp)
 {
@@ -296,6 +267,7 @@
 	return NULL;
 }
 
+/* Called with kretprobe_lock held */
 void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
 	/*
@@ -314,6 +286,7 @@
 	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
 
+/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
 	/* remove rp inst off the rprobe_inst_table */
@@ -347,13 +320,13 @@
 	struct hlist_node *node, *tmp;
 	unsigned long flags = 0;
 
-	spin_lock_irqsave(&kprobe_lock, flags);
+	spin_lock_irqsave(&kretprobe_lock, flags);
         head = kretprobe_inst_table_head(current);
         hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                 if (ri->task == tk)
                         recycle_rp_inst(ri);
         }
-	spin_unlock_irqrestore(&kprobe_lock, flags);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
 /*
@@ -364,9 +337,12 @@
 					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	unsigned long flags = 0;
 
 	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	spin_lock_irqsave(&kretprobe_lock, flags);
 	arch_prepare_kretprobe(rp, regs);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	return 0;
 }
 
@@ -397,13 +373,13 @@
         struct kprobe *kp;
 
 	if (p->break_handler) {
-		list_for_each_entry(kp, &old_p->list, list) {
+		list_for_each_entry_rcu(kp, &old_p->list, list) {
 			if (kp->break_handler)
 				return -EEXIST;
 		}
-		list_add_tail(&p->list, &old_p->list);
+		list_add_tail_rcu(&p->list, &old_p->list);
 	} else
-		list_add(&p->list, &old_p->list);
+		list_add_rcu(&p->list, &old_p->list);
 	return 0;
 }
 
@@ -421,18 +397,18 @@
 	ap->break_handler = aggr_break_handler;
 
 	INIT_LIST_HEAD(&ap->list);
-	list_add(&p->list, &ap->list);
+	list_add_rcu(&p->list, &ap->list);
 
 	INIT_HLIST_NODE(&ap->hlist);
-	hlist_del(&p->hlist);
-	hlist_add_head(&ap->hlist,
+	hlist_del_rcu(&p->hlist);
+	hlist_add_head_rcu(&ap->hlist,
 		&kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
 }
 
 /*
  * This is the second or subsequent kprobe at the address - handle
  * the intricacies
- * TODO: Move kcalloc outside the spinlock
+ * TODO: Move kcalloc outside the spin_lock
  */
 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 					  struct kprobe *p)
@@ -458,7 +434,7 @@
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
 	arch_disarm_kprobe(p);
-	hlist_del(&p->hlist);
+	hlist_del_rcu(&p->hlist);
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 	arch_remove_kprobe(p);
 }
@@ -466,11 +442,10 @@
 static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
 		struct kprobe *p, unsigned long flags)
 {
-	list_del(&p->list);
-	if (list_empty(&old_p->list)) {
+	list_del_rcu(&p->list);
+	if (list_empty(&old_p->list))
 		cleanup_kprobe(old_p, flags);
-		kfree(old_p);
-	} else
+	else
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
@@ -493,9 +468,9 @@
 	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
 
+	p->nmissed = 0;
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
-	p->nmissed = 0;
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
@@ -503,7 +478,7 @@
 
 	arch_copy_kprobe(p);
 	INIT_HLIST_NODE(&p->hlist);
-	hlist_add_head(&p->hlist,
+	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
   	arch_arm_kprobe(p);
@@ -524,10 +499,16 @@
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
+		/* cleanup_*_kprobe() does the spin_unlock_irqrestore */
 		if (old_p->pre_handler == aggr_pre_handler)
 			cleanup_aggr_kprobe(old_p, p, flags);
 		else
 			cleanup_kprobe(p, flags);
+
+		synchronize_sched();
+		if (old_p->pre_handler == aggr_pre_handler &&
+				list_empty(&old_p->list))
+			kfree(old_p);
 	} else
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
@@ -604,13 +585,13 @@
 
 	unregister_kprobe(&rp->kp);
 	/* No race here */
-	spin_lock_irqsave(&kprobe_lock, flags);
+	spin_lock_irqsave(&kretprobe_lock, flags);
 	free_rp_inst(rp);
 	while ((ri = get_used_rp_inst(rp)) != NULL) {
 		ri->rp = NULL;
 		hlist_del(&ri->uflist);
 	}
-	spin_unlock_irqrestore(&kprobe_lock, flags);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
 static int __init init_kprobes(void)