[PATCH] shared mount handling: bind and rbind

Implement handling of MS_BIND in the presence of shared mounts (see
Documentation/sharedsubtree.txt at the end of the patch series for a
detailed description).

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/fs/namespace.c b/fs/namespace.c
index f6861a5..9f5a084 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,8 +28,6 @@
 
 extern int __init init_rootfs(void);
 
-#define CL_EXPIRE 	0x01
-
 #ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
 #else
@@ -145,13 +143,43 @@
 	old_nd->dentry->d_mounted--;
 }
 
+void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
+			struct vfsmount *child_mnt)
+{
+	child_mnt->mnt_parent = mntget(mnt);
+	child_mnt->mnt_mountpoint = dget(dentry);
+	dentry->d_mounted++;
+}
+
 static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
 {
-	mnt->mnt_parent = mntget(nd->mnt);
-	mnt->mnt_mountpoint = dget(nd->dentry);
-	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
+	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+			hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
-	nd->dentry->d_mounted++;
+}
+
+/*
+ * the caller must hold vfsmount_lock
+ */
+static void commit_tree(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	struct vfsmount *m;
+	LIST_HEAD(head);
+	struct namespace *n = parent->mnt_namespace;
+
+	BUG_ON(parent == mnt);
+
+	list_add_tail(&head, &mnt->mnt_list);
+	list_for_each_entry(m, &head, mnt_list)
+		m->mnt_namespace = n;
+	list_splice(&head, n->list.prev);
+
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+				hash(parent, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	touch_namespace(n);
 }
 
 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
@@ -183,7 +211,11 @@
 		mnt->mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt_root;
 		mnt->mnt_parent = mnt;
-		mnt->mnt_namespace = current->namespace;
+
+		if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+			list_add(&mnt->mnt_share, &old->mnt_share);
+		if (flag & CL_MAKE_SHARED)
+			set_mnt_shared(mnt);
 
 		/* stick the duplicate mount on the same expiry list
 		 * as the original if that was on one */
@@ -379,7 +411,7 @@
 
 EXPORT_SYMBOL(may_umount);
 
-static void release_mounts(struct list_head *head)
+void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
 	while(!list_empty(head)) {
@@ -401,7 +433,7 @@
 	}
 }
 
-static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 {
 	struct vfsmount *p;
 
@@ -581,7 +613,7 @@
 	}
 }
 
-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 					int flag)
 {
 	struct vfsmount *res, *p, *q, *r, *s;
@@ -626,6 +658,67 @@
 	return NULL;
 }
 
+/*
+ *  @source_mnt : mount tree to be attached
+ *  @nd        : place where the mount tree @source_mnt is attached
+ *
+ *  NOTE: the table below explains the semantics when a source mount
+ *  of a given type is attached to a destination mount of a given type.
+ * 	---------------------------------------------
+ * 	|         BIND MOUNT OPERATION              |
+ * 	|********************************************
+ * 	| source-->| shared        |       private  |
+ * 	| dest     |               |                |
+ * 	|   |      |               |                |
+ * 	|   v      |               |                |
+ * 	|********************************************
+ * 	|  shared  | shared (++)   |     shared (+) |
+ * 	|          |               |                |
+ * 	|non-shared| shared (+)    |      private   |
+ * 	*********************************************
+ * A bind operation clones the source mount and mounts the clone on the
+ * destination mount.
+ *
+ * (++)  the cloned mount is propagated to all the mounts in the propagation
+ * 	 tree of the destination mount and the cloned mount is added to
+ * 	 the peer group of the source mount.
+ * (+)   the cloned mount is created under the destination mount and is marked
+ *       as shared. The cloned mount is added to the peer group of the source
+ *       mount.
+ *
+ * If the source mount is a tree, the operations explained above are
+ * applied to each mount in the tree.
+ * Must be called without spinlocks held, since this function can sleep
+ * in allocations.
+ */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+				struct nameidata *nd)
+{
+	LIST_HEAD(tree_list);
+	struct vfsmount *dest_mnt = nd->mnt;
+	struct dentry *dest_dentry = nd->dentry;
+	struct vfsmount *child, *p;
+
+	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
+		return -EINVAL;
+
+	if (IS_MNT_SHARED(dest_mnt)) {
+		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+			set_mnt_shared(p);
+	}
+
+	spin_lock(&vfsmount_lock);
+	mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+	commit_tree(source_mnt);
+
+	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
+		list_del_init(&child->mnt_hash);
+		commit_tree(child);
+	}
+	spin_unlock(&vfsmount_lock);
+	return 0;
+}
+
 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 {
 	int err;
@@ -646,17 +739,8 @@
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&vfsmount_lock);
-	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
-		struct list_head head;
-
-		attach_mnt(mnt, nd);
-		list_add_tail(&head, &mnt->mnt_list);
-		list_splice(&head, current->namespace->list.prev);
-		err = 0;
-		touch_namespace(current->namespace);
-	}
-	spin_unlock(&vfsmount_lock);
+	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
+		err = attach_recursive_mnt(mnt, nd);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
diff --git a/fs/pnode.c b/fs/pnode.c
index 1e22165..2d572b8 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -20,9 +20,88 @@
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
-		mnt->mnt_flags |= MNT_SHARED;
+		set_mnt_shared(mnt);
 	} else {
 		list_del_init(&mnt->mnt_share);
 		mnt->mnt_flags &= ~MNT_PNODE_MASK;
 	}
 }
+
+/*
+ * get the next mount in the propagation tree.
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ */
+static struct vfsmount *propagation_next(struct vfsmount *m,
+					 struct vfsmount *origin)
+{
+	m = next_peer(m);
+	if (m == origin)
+		return NULL;
+	return m;
+}
+
+/*
+ * mount 'source_mnt' under the destination 'dest_mnt' at
+ * dentry 'dest_dentry'. And propagate that mount to
+ * all the peer and slave mounts of 'dest_mnt'.
+ * Link all the new mounts into a propagation tree headed at
+ * source_mnt. Also link all the new mounts using ->mnt_list
+ * headed at source_mnt's ->mnt_list
+ *
+ * @dest_mnt: destination mount.
+ * @dest_dentry: destination dentry.
+ * @source_mnt: source mount.
+ * @tree_list : list of heads of trees to be attached.
+ */
+int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
+		    struct vfsmount *source_mnt, struct list_head *tree_list)
+{
+	struct vfsmount *m, *child;
+	int ret = 0;
+	struct vfsmount *prev_dest_mnt = dest_mnt;
+	struct vfsmount *prev_src_mnt  = source_mnt;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(umount_list);
+
+	for (m = propagation_next(dest_mnt, dest_mnt); m;
+			m = propagation_next(m, dest_mnt)) {
+		int type = CL_PROPAGATION;
+
+		if (IS_MNT_NEW(m))
+			continue;
+
+		if (IS_MNT_SHARED(m))
+			type |= CL_MAKE_SHARED;
+
+		if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
+						type))) {
+			ret = -ENOMEM;
+			list_splice(tree_list, tmp_list.prev);
+			goto out;
+		}
+
+		if (is_subdir(dest_dentry, m->mnt_root)) {
+			mnt_set_mountpoint(m, dest_dentry, child);
+			list_add_tail(&child->mnt_hash, tree_list);
+		} else {
+			/*
+			 * This can happen if the parent mount was bind mounted
+			 * on some subdirectory of a shared/slave mount.
+			 */
+			list_add_tail(&child->mnt_hash, &tmp_list);
+		}
+		prev_dest_mnt = m;
+		prev_src_mnt  = child;
+	}
+out:
+	spin_lock(&vfsmount_lock);
+	while (!list_empty(&tmp_list)) {
+		child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
+		list_del_init(&child->mnt_hash);
+		umount_tree(child, &umount_list);
+	}
+	spin_unlock(&vfsmount_lock);
+	release_mounts(&umount_list);
+	return ret;
+}
diff --git a/fs/pnode.h b/fs/pnode.h
index ab1bdae..c62c72f 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,21 @@
 #include <linux/mount.h>
 
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
+#define IS_MNT_NEW(mnt)  (!mnt->mnt_namespace)
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED)
 
+#define CL_EXPIRE    		0x01
+#define CL_COPY_ALL 		0x04
+#define CL_MAKE_SHARED 		0x08
+#define CL_PROPAGATION 		0x10
+
+static inline void set_mnt_shared(struct vfsmount *mnt)
+{
+	mnt->mnt_flags &= ~MNT_PNODE_MASK;
+	mnt->mnt_flags |= MNT_SHARED;
+}
+
 void change_mnt_propagation(struct vfsmount *, int);
+int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
+		struct list_head *);
 #endif /* _LINUX_PNODE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 551fba3..5e188b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1251,7 +1251,12 @@
 extern struct vfsmount *kern_mount(struct file_system_type *);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
+extern void umount_tree(struct vfsmount *, struct list_head *);
+extern void release_mounts(struct list_head *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
+				  struct vfsmount *);
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);