cgroup: split cgroup destruction into two steps

Split cgroup_destroy_locked() into two steps and put the latter half
into cgroup_offline_fn() which is executed from a work item.  The
latter half is responsible for offlining the css's, removing the
cgroup from internal lists, and propagating release notification to
the parent.  The separation is to allow using percpu refcnt for css.

Note that this allows for other cgroup operations to happen between
the first and second halves of destruction, including creating a new
cgroup with the same name.  As the target cgroup is marked DEAD in the
first half and cgroup internals don't care about the names of cgroups,
this should be fine.  A comment explaining this will be added by the
next patch which implements the actual percpu refcnting.

As RCU freeing is guaranteed to happen after the second step of
destruction, we can use the same work item for both.  This patch
renames cgroup->free_work to ->destroy_work and uses it for both
purposes.  INIT_WORK() is now performed right before queueing the work
item.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>

commit: ea15f8ccdb430af1e8bc9b4e19a230eb4c356777 [log] [tgz]
author: Tejun Heo <tj@kernel.org> Thu Jun 13 19:27:42 2013 -0700
committer: Tejun Heo <tj@kernel.org> Thu Jun 13 19:27:42 2013 -0700
tree: e75aa3f5ddac2a7092c0f16361d72f9b25232832
parent: 455050d23e1bfc47ca98e943ad5b2f3a9bbe45fb [diff] [blame]
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5a1ddec..df68147 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c

@@ -208,6 +208,7 @@
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
@@ -830,7 +831,7 @@
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
@@ -875,7 +876,8 @@
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1407,7 +1409,6 @@
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -2991,12 +2992,13 @@
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
 	 * its ->sibling.next is no longer updated when its next sibling
-	 * changes.  As CGRP_DEAD is set on removal which is fully
-	 * serialized, if we see it unasserted, it's guaranteed that the
-	 * next sibling hasn't finished its grace period even if it's
-	 * already removed, and thus safe to dereference from this RCU
-	 * critical section.  If ->sibling.next is inaccessible,
-	 * cgroup_is_dead() is guaranteed to be visible as %true here.
+	 * changes.  As CGRP_DEAD assertion is serialized and happens
+	 * before the cgroup is taken off the ->sibling list, if we see it
+	 * unasserted, it's guaranteed that the next sibling hasn't
+	 * finished its grace period even if it's already removed, and thus
+	 * safe to dereference from this RCU critical section.  If
+	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
+	 * to be visible as %true here.
 	 */
 	if (likely(!cgroup_is_dead(pos))) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
@@ -4359,7 +4361,6 @@
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup *parent = cgrp->parent;
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
 	bool empty;
@@ -4423,6 +4424,21 @@
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	schedule_work(&cgrp->destroy_work);
+
+	return 0;
+};
+
+static void cgroup_offline_fn(struct work_struct *work)
+{
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+	struct cgroup *parent = cgrp->parent;
+	struct dentry *d = cgrp->dentry;
+	struct cgroup_subsys *ss;
+
+	mutex_lock(&cgroup_mutex);
+
 	/* tell subsystems to initate destruction */
 	for_each_subsys(cgrp->root, ss)
 		offline_css(ss, cgrp);
@@ -4446,7 +4462,7 @@
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 
-	return 0;
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
commit	ea15f8ccdb430af1e8bc9b4e19a230eb4c356777	[log] [tgz]
author	Tejun Heo <tj@kernel.org>	Thu Jun 13 19:27:42 2013 -0700
committer	Tejun Heo <tj@kernel.org>	Thu Jun 13 19:27:42 2013 -0700
tree	e75aa3f5ddac2a7092c0f16361d72f9b25232832
parent	455050d23e1bfc47ca98e943ad5b2f3a9bbe45fb [diff] [blame]