pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.
Move the mount of proc into alloc_pid when we allocate the pid for
init.
Remove the surprising calls of pid_ns_release proc in fork and
proc_flush_task. Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.
Because of the call path detach pid is alwasy called with the
rtnl_lock held free_pid is not allowed to sleep, so the work to
unmounting proc is moved to a work queue. This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.
In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.
Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a5f238..e957f8b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -270,8 +271,12 @@
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ hlist_del_rcu(&upid->pid_chain);
+ if (--upid->ns->nr_hashed == 0)
+ schedule_work(&upid->ns->proc_work);
+ }
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
@@ -293,6 +298,7 @@
goto out;
tmp = ns;
+ pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
@@ -303,17 +309,23 @@
tmp = tmp->parent;
}
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
get_pid_ns(ns);
- pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- for ( ; upid >= pid->numbers; --upid)
+ for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
spin_unlock_irq(&pidmap_lock);
out:
@@ -570,6 +582,7 @@
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ init_pid_ns.nr_hashed = 1;
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);