[PATCH] shared mounts handling: umount

An unmount of a mount creates a umount event on the parent.  If the
parent is a shared mount, it gets propagated to all mounts in the peer
group.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/fs/namespace.c b/fs/namespace.c
index 1487982d..4b1af01 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,31 +86,44 @@
 }
 
 /*
- * Now, lookup_mnt increments the ref count before returning
- * the vfsmount struct.
+ * find the first or last mount at @dentry on vfsmount @mnt depending on
+ * @dir. If @dir is set return the first mount else return the last mount.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+			      int dir)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct list_head *tmp = head;
 	struct vfsmount *p, *found = NULL;
 
-	spin_lock(&vfsmount_lock);
 	for (;;) {
-		tmp = tmp->next;
+		tmp = dir ? tmp->next : tmp->prev;
 		p = NULL;
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
 		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
-			found = mntget(p);
+			found = p;
 			break;
 		}
 	}
-	spin_unlock(&vfsmount_lock);
 	return found;
 }
 
+/*
+ * lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct vfsmount *child_mnt;
+	spin_lock(&vfsmount_lock);
+	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+		mntget(child_mnt);
+	spin_unlock(&vfsmount_lock);
+	return child_mnt;
+}
+
 static inline int check_mnt(struct vfsmount *mnt)
 {
 	return mnt->mnt_namespace == current->namespace;
@@ -404,9 +417,12 @@
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (atomic_read(&mnt->mnt_count) > 2)
-		return -EBUSY;
-	return 0;
+	int ret = 0;
+	spin_lock(&vfsmount_lock);
+	if (propagate_mount_busy(mnt, 2))
+		ret = -EBUSY;
+	spin_unlock(&vfsmount_lock);
+	return ret;
 }
 
 EXPORT_SYMBOL(may_umount);
@@ -433,7 +449,7 @@
 	}
 }
 
-void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
 
@@ -442,6 +458,9 @@
 		list_add(&p->mnt_hash, kill);
 	}
 
+	if (propagate)
+		propagate_umount(kill);
+
 	list_for_each_entry(p, kill, mnt_hash) {
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
@@ -450,6 +469,7 @@
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p)
 			mnt->mnt_mountpoint->d_mounted--;
+		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
 
@@ -526,9 +546,9 @@
 	event++;
 
 	retval = -EBUSY;
-	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
+	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, &umount_list);
+			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
@@ -651,7 +671,7 @@
 	if (res) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(res, &umount_list);
+		umount_tree(res, 0, &umount_list);
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
@@ -827,7 +847,7 @@
 	if (err) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
-		umount_tree(mnt, &umount_list);
+		umount_tree(mnt, 0, &umount_list);
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
@@ -1023,12 +1043,12 @@
 	 * Check that it is still dead: the count should now be 2 - as
 	 * contributed by the vfsmount parent and the mntget above
 	 */
-	if (atomic_read(&mnt->mnt_count) == 2) {
+	if (!propagate_mount_busy(mnt, 2)) {
 		/* delete from the namespace */
 		touch_namespace(mnt->mnt_namespace);
 		list_del_init(&mnt->mnt_list);
 		mnt->mnt_namespace = NULL;
-		umount_tree(mnt, umounts);
+		umount_tree(mnt, 1, umounts);
 		spin_unlock(&vfsmount_lock);
 	} else {
 		/*
@@ -1647,7 +1667,7 @@
 	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, &umount_list);
+	umount_tree(root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
diff --git a/fs/pnode.c b/fs/pnode.c
index 2d572b8..7bc942d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -99,9 +99,94 @@
 	while (!list_empty(&tmp_list)) {
 		child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
 		list_del_init(&child->mnt_hash);
-		umount_tree(child, &umount_list);
+		umount_tree(child, 0, &umount_list);
 	}
 	spin_unlock(&vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
+
+/*
+ * return true if the refcount is greater than count
+ */
+static inline int do_refcount_check(struct vfsmount *mnt, int count)
+{
+	int mycount = atomic_read(&mnt->mnt_count);
+	return (mycount > count);
+}
+
+/*
+ * check if the mount 'mnt' can be unmounted successfully.
+ * @mnt: the mount to be checked for unmount
+ * NOTE: unmounting 'mnt' would naturally propagate to all
+ * other mounts its parent propagates to.
+ * Check if any of these mounts that **do not have submounts**
+ * have more references than 'refcnt'. If so return busy.
+ */
+int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
+{
+	struct vfsmount *m, *child;
+	struct vfsmount *parent = mnt->mnt_parent;
+	int ret = 0;
+
+	if (mnt == parent)
+		return do_refcount_check(mnt, refcnt);
+
+	/*
+	 * quickly check if the current mount can be unmounted.
+	 * If not, we don't have to go checking for all other
+	 * mounts
+	 */
+	if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
+		return 1;
+
+	for (m = propagation_next(parent, parent); m;
+	     		m = propagation_next(m, parent)) {
+		child = __lookup_mnt(m, mnt->mnt_mountpoint, 0);
+		if (child && list_empty(&child->mnt_mounts) &&
+		    (ret = do_refcount_check(child, 1)))
+			break;
+	}
+	return ret;
+}
+
+/*
+ * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
+ * parent propagates to.
+ */
+static void __propagate_umount(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	struct vfsmount *m;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+
+		struct vfsmount *child = __lookup_mnt(m,
+					mnt->mnt_mountpoint, 0);
+		/*
+		 * umount the child only if the child has no
+		 * other children
+		 */
+		if (child && list_empty(&child->mnt_mounts)) {
+			list_del(&child->mnt_hash);
+			list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
+		}
+	}
+}
+
+/*
+ * collect all mounts that receive propagation from the mount in @list,
+ * and return these additional mounts in the same list.
+ * @list: the list of mounts to be unmounted.
+ */
+int propagate_umount(struct list_head *list)
+{
+	struct vfsmount *mnt;
+
+	list_for_each_entry(mnt, list, mnt_hash)
+		__propagate_umount(mnt);
+	return 0;
+}
diff --git a/fs/pnode.h b/fs/pnode.h
index c62c72f..9b88ba0 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -29,4 +29,6 @@
 void change_mnt_propagation(struct vfsmount *, int);
 int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
 		struct list_head *);
+int propagate_umount(struct list_head *);
+int propagate_mount_busy(struct vfsmount *, int);
 #endif /* _LINUX_PNODE_H */