30 files changed, 3264 insertions, 1898 deletions
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644
index 000000000000..af618171e0eb
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
+				RDMA Controller
+				----------------
+
+Contents
+--------
+
+1. Overview
+  1-1. What is RDMA controller?
+  1-2. Why RDMA controller needed?
+  1-3. How is RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can leads to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allow progress migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exist for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intents to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When application
+query about maximum resource limits of IB device, it returns minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+  hca_handle	Maximum number of HCA Handles
+  hca_object 	Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 4cc07ce3b8dd..3b8449f8ac7e 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -47,6 +47,12 @@ CONTENTS
   5-3. IO
     5-3-1. IO Interface Files
     5-3-2. Writeback
+  5-4. PID
+    5-4-1. PID Interface Files
+  5-5. RDMA
+    5-5-1. RDMA Interface Files
+  5-6. Misc
+    5-6-1. perf_event
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -328,14 +334,12 @@ a process with a non-root euid to migrate a target process into a
 cgroup by writing its PID to the "cgroup.procs" file, the following
 conditions must be met.
 
-- The writer's euid must match either uid or suid of the target process.
-
 - The writer must have write access to the "cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file of the
   common ancestor of the source and destination cgroups.
 
-The above three constraints ensure that while a delegatee may migrate
+The above two constraints ensure that while a delegatee may migrate
 processes around freely in the delegated sub-hierarchy it can't pull
 in from or push out to outside the sub-hierarchy.
 
@@ -350,10 +354,10 @@ all processes under C0 and C1 belong to U0.
 
 Let's also say U0 wants to write the PID of a process which is
 currently in C10 into "C00/cgroup.procs".  U0 has write access to the
-file and uid match on the process; however, the common ancestor of the
-source cgroup C10 and the destination cgroup C00 is above the points
-of delegation and U0 would not have write access to its "cgroup.procs"
-files and thus the write will be denied with -EACCES.
+file; however, the common ancestor of the source cgroup C10 and the
+destination cgroup C00 is above the points of delegation and U0 would
+not have write access to its "cgroup.procs" files and thus the write
+will be denied with -EACCES.
 
 
 2-6. Guidelines
@@ -1119,6 +1123,91 @@ writeback as follows.
 	vm.dirty[_background]_ratio.
 
 
+5-4. PID
+
+The process number controller is used to allow a cgroup to stop any
+new tasks from being fork()'d or clone()'d after a specified limit is
+reached.
+
+The number of tasks in a cgroup can be exhausted in ways which other
+controllers cannot prevent, thus warranting its own controller.  For
+example, a fork bomb is likely to exhaust the number of tasks before
+hitting memory restrictions.
+
+Note that PIDs used in this controller refer to TIDs, process IDs as
+used by the kernel.
+
+
+5-4-1. PID Interface Files
+
+  pids.max
+
+ A read-write single value file which exists on non-root cgroups.  The
+ default is "max".
+
+ Hard limit of number of processes.
+
+  pids.current
+
+ A read-only single value file which exists on all cgroups.
+
+ The number of processes currently in the cgroup and its descendants.
+
+Organisational operations are not blocked by cgroup policies, so it is
+possible to have pids.current > pids.max.  This can be done by either
+setting the limit to be smaller than pids.current, or attaching enough
+processes to the cgroup such that pids.current is larger than
+pids.max.  However, it is not possible to violate a cgroup PID policy
+through fork() or clone(). These will return -EAGAIN if the creation
+of a new process would cause a cgroup policy to be violated.
+
+
+5-5. RDMA
+
+The "rdma" controller regulates the distribution and accounting of
+of RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+  rdma.max
+	A readwrite nested-keyed file that exists for all the cgroups
+	except root that describes current configured resource limit
+	for a RDMA/IB device.
+
+	Lines are keyed by device name and are not ordered.
+	Each line contains space separated resource name and its configured
+	limit that can be distributed.
+
+	The following nested keys are defined.
+
+	  hca_handle	Maximum number of HCA Handles
+	  hca_object 	Maximum number of HCA Objects
+
+	An example for mlx4 and ocrdma device follows.
+
+	  mlx4_0 hca_handle=2 hca_object=2000
+	  ocrdma1 hca_handle=3 hca_object=max
+
+  rdma.current
+	A read-only file that describes current resource usage.
+	It exists for all the cgroup except root.
+
+	An example for mlx4 and ocrdma device follows.
+
+	  mlx4_0 hca_handle=1 hca_object=20
+	  ocrdma1 hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
+
+perf_event controller, if not mounted on a legacy hierarchy, is
+automatically enabled on the v2 hierarchy so that perf events can
+always be filtered by cgroup v2 path.  The controller can still be
+moved to a legacy hierarchy after v2 hierarchy is populated.
+
+
 6. Namespace
 
 6-1. Basics
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index edaae9f9853c..e426ac877d19 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y :=			cm.o
 
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000000..126ac5f99db7
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ *          accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing rdma device to user space applications to avoid
+ * resource accounting leak.
+ * Returns 0 on success or otherwise failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+	device->cg_device.name = device->name;
+	return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user application cannot be done
+ * for this device to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+	rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+			 struct ib_device *device,
+			 enum rdmacg_resource_type resource_index)
+{
+	return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+				 resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+			struct ib_device *device,
+			enum rdmacg_resource_type resource_index)
+{
+	rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+			resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 912ab4cd6eae..cb7d372e4bdf 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+			 struct ib_device *device,
+			 enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+			struct ib_device *device,
+			enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+				       struct ib_device *device,
+				       enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+				      struct ib_device *device,
+				      enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
 					 struct net_device *upper)
 {
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a63e8400ea3b..593d2ce6ec7c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = ib_device_register_rdmacg(device);
+	if (ret) {
+		pr_warn("Couldn't register device with rdma cgroup\n");
+		ib_cache_cleanup_one(device);
+		goto out;
+	}
+
 	memset(&device->attrs, 0, sizeof(device->attrs));
 	ret = device->query_device(device, &device->attrs, &uhw);
 	if (ret) {
 		pr_warn("Couldn't query the device attributes\n");
+		ib_device_unregister_rdmacg(device);
 		ib_cache_cleanup_one(device);
 		goto out;
 	}
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
 	if (ret) {
 		pr_warn("Couldn't register device %s with driver model\n",
 			device->name);
+		ib_device_unregister_rdmacg(device);
 		ib_cache_cleanup_one(device);
 		goto out;
 	}
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
 
 	mutex_unlock(&device_mutex);
 
+	ib_device_unregister_rdmacg(device);
 	ib_device_unregister_sysfs(device);
 	ib_cache_cleanup_one(device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b4b395a054ac..7b7a76e1279a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	struct ib_udata                   udata;
 	struct ib_ucontext		 *ucontext;
 	struct file			 *filp;
+	struct ib_rdmacg_object		 cg_obj;
 	int ret;
 
 	if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
+	ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+	if (ret)
+		goto err;
+
 	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
 	if (IS_ERR(ucontext)) {
 		ret = PTR_ERR(ucontext);
-		goto err;
+		goto err_alloc;
 	}
 
 	ucontext->device = ib_dev;
+	ucontext->cg_obj = cg_obj;
 	INIT_LIST_HEAD(&ucontext->pd_list);
 	INIT_LIST_HEAD(&ucontext->mr_list);
 	INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
 	put_pid(ucontext->tgid);
 	ib_dev->dealloc_ucontext(ucontext);
 
+err_alloc:
+	ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
 err:
 	mutex_unlock(&file->mutex);
 	return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 		return -ENOMEM;
 
 	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret) {
+		kfree(uobj);
+		return ret;
+	}
+
 	down_write(&uobj->mutex);
 
 	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
 	ib_dealloc_pd(pd);
 
 err:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 	put_uobj_write(uobj);
 	return ret;
 }
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	if (ret)
 		goto err_put;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	uobj->live = 0;
 	put_uobj_write(uobj);
 
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
 			goto err_put;
 		}
 	}
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
 
 	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
 				     cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
 	ib_dereg_mr(mr);
 
 err_put:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
 		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
 		   out_len - sizeof(resp));
 
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
 	if (IS_ERR(mw)) {
 		ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
 	uverbs_dealloc_mw(mw);
 
 err_put:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
 		attr.flags = cmd->flags;
 
+	ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	cq = ib_dev->create_cq(ib_dev, &attr,
 					     file->ucontext, uhw);
 	if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
 	ib_destroy_cq(cq);
 
 err_file:
+	ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	if (ev_file)
 		ib_uverbs_release_ucq(file, ev_file, obj);
 
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
 			goto err_put;
 		}
 
+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_put;
+
 	if (cmd->qp_type == IB_QPT_XRC_TGT)
 		qp = ib_create_qp(pd, &attr);
 	else
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
 	if (IS_ERR(qp)) {
 		ret = PTR_ERR(qp);
-		goto err_put;
+		goto err_create;
 	}
 
 	if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1993,6 +2047,10 @@ err_cb:
 err_destroy:
 	ib_destroy_qp(qp);
 
+err_create:
+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+
 err_put:
 	if (xrcd)
 		put_xrcd_read(xrcd_uobj);
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	if (obj->uxrcd)
 		atomic_dec(&obj->uxrcd->refcnt);
 
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	memset(&attr.dmac, 0, sizeof(attr.dmac));
 	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	ah = pd->device->create_ah(pd, &attr, &udata);
 
 	if (IS_ERR(ah)) {
 		ret = PTR_ERR(ah);
-		goto err_put;
+		goto err_create;
 	}
 
 	ah->device  = pd->device;
@@ -3013,7 +3078,10 @@ err_copy:
 err_destroy:
 	ib_destroy_ah(ah);
 
-err_put:
+err_create:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err:
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 		err = -EINVAL;
 		goto err_free;
 	}
+
+	err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (err)
+		goto err_free;
+
 	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
 	if (IS_ERR(flow_id)) {
 		err = PTR_ERR(flow_id);
-		goto err_free;
+		goto err_create;
 	}
 	flow_id->uobject = uobj;
 	uobj->object = flow_id;
@@ -3897,6 +3973,8 @@ err_copy:
 	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
 	ib_destroy_flow(flow_id);
+err_create:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 err_free:
 	kfree(flow_attr);
 err_put:
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 	flow_id = uobj->object;
 
 	ret = ib_destroy_flow(flow_id);
-	if (!ret)
+	if (!ret) {
+		ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		uobj->live = 0;
+	}
 
 	put_uobj_write(uobj);
 
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 	obj->uevent.events_reported = 0;
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 
+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_put_cq;
+
 	srq = pd->device->create_srq(pd, &attr, udata);
 	if (IS_ERR(srq)) {
 		ret = PTR_ERR(srq);
@@ -4069,6 +4155,8 @@ err_destroy:
 	ib_destroy_srq(srq);
 
 err_put:
+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+			   RDMACG_RESOURCE_HCA_OBJECT);
 	put_pd_read(pd);
 
 err_put_cq:
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	if (srq_type == IB_SRQT_XRC) {
 		us = container_of(obj, struct ib_usrq_object, uevent);
 		atomic_dec(&us->uxrcd->refcnt);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3fb4b1af1ad..35c788a32e26 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
 #include <rdma/ib.h>
 
 #include "uverbs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 		ib_destroy_ah(ah);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 		uverbs_dealloc_mw(mw);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 		ib_destroy_flow(flow_id);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		if (qp == qp->real_qp)
 			ib_uverbs_detach_umcast(qp, uqp);
 		ib_destroy_qp(qp);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_uevent(file, &uqp->uevent);
 		kfree(uqp);
 	}
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
 		ib_destroy_srq(srq);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_uevent(file, uevent);
 		kfree(uevent);
 	}
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 		ib_destroy_cq(cq);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_ucq(file, ev_file, ucq);
 		kfree(ucq);
 	}
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 		ib_dereg_mr(mr);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 		ib_dealloc_pd(pd);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
 	put_pid(context->tgid);
 
+	ib_rdmacg_uncharge(&context->cg_obj, context->device,
+			   RDMACG_RESOURCE_HCA_HANDLE);
+
 	return context->device->dealloc_ucontext(context);
 }
 
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 439b946c4808..db5900aaa55a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 		rwsem_release(&kn->dep_map, 1, _RET_IP_);
 	}
 
-	kernfs_unmap_bin_file(kn);
+	kernfs_drain_open_files(kn);
 
 	mutex_lock(&kernfs_mutex);
 }
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 4f0535890b30..35043a8c4529 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_put;
 
 	rc = 0;
-	of->mmapped = 1;
+	of->mmapped = true;
 	of->vm_ops = vma->vm_ops;
 	vma->vm_ops = &kernfs_vm_ops;
 out_put:
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	if (error)
 		goto err_free;
 
-	((struct seq_file *)file->private_data)->private = of;
+	of->seq_file = file->private_data;
+	of->seq_file->private = of;
 
 	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
 	if (file->f_mode & FMODE_WRITE)
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	/* make sure we have open node struct */
 	error = kernfs_get_open_node(kn, of);
 	if (error)
-		goto err_close;
+		goto err_seq_release;
+
+	if (ops->open) {
+		/* nobody has access to @of yet, skip @of->mutex */
+		error = ops->open(of);
+		if (error)
+			goto err_put_node;
+	}
 
 	/* open succeeded, put active references */
 	kernfs_put_active(kn);
 	return 0;
 
-err_close:
+err_put_node:
+	kernfs_put_open_node(kn, of);
+err_seq_release:
 	seq_release(inode, file);
 err_free:
 	kfree(of->prealloc_buf);
@@ -732,11 +742,41 @@ err_out:
 	return error;
 }
 
+/* used from release/drain to ensure that ->release() is called exactly once */
+static void kernfs_release_file(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	/*
+	 * @of is guaranteed to have no other file operations in flight and
+	 * we just want to synchronize release and drain paths.
+	 * @kernfs_open_file_mutex is enough.  @of->mutex can't be used
+	 * here because drain path may be called from places which can
+	 * cause circular dependency.
+	 */
+	lockdep_assert_held(&kernfs_open_file_mutex);
+
+	if (!of->released) {
+		/*
+		 * A file is never detached without being released and we
+		 * need to be able to release files which are deactivated
+		 * and being drained.  Don't use kernfs_ops().
+		 */
+		kn->attr.ops->release(of);
+		of->released = true;
+	}
+}
+
 static int kernfs_fop_release(struct inode *inode, struct file *filp)
 {
 	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
 	struct kernfs_open_file *of = kernfs_of(filp);
 
+	if (kn->flags & KERNFS_HAS_RELEASE) {
+		mutex_lock(&kernfs_open_file_mutex);
+		kernfs_release_file(kn, of);
+		mutex_unlock(&kernfs_open_file_mutex);
+	}
+
 	kernfs_put_open_node(kn, of);
 	seq_release(inode, filp);
 	kfree(of->prealloc_buf);
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn)
+void kernfs_drain_open_files(struct kernfs_node *kn)
 {
 	struct kernfs_open_node *on;
 	struct kernfs_open_file *of;
 
-	if (!(kn->flags & KERNFS_HAS_MMAP))
+	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
 		return;
 
 	spin_lock_irq(&kernfs_open_node_lock);
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
 		return;
 
 	mutex_lock(&kernfs_open_file_mutex);
+
 	list_for_each_entry(of, &on->files, list) {
 		struct inode *inode = file_inode(of->file);
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		if (kn->flags & KERNFS_HAS_MMAP)
+			unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		kernfs_release_file(kn, of);
 	}
+
 	mutex_unlock(&kernfs_open_file_mutex);
 
 	kernfs_put_open_node(kn, NULL);
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 		kn->flags |= KERNFS_HAS_SEQ_SHOW;
 	if (ops->mmap)
 		kn->flags |= KERNFS_HAS_MMAP;
+	if (ops->release)
+		kn->flags |= KERNFS_HAS_RELEASE;
 
 	rc = kernfs_add_one(kn);
 	if (rc) {
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index bfd551bbf231..3100987cf8ba 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
  */
 extern const struct file_operations kernfs_file_fops;
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn);
+void kernfs_drain_open_files(struct kernfs_node *kn);
 
 /*
  * symlink.c
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b4677fc5b..3c02404cfce9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-	/* Reference count */
-	atomic_t refcount;
-
 	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
-	struct hlist_node hlist;
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* reference count */
+	atomic_t refcount;
+
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
 	struct list_head tasks;
 	struct list_head mg_tasks;
 
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
 	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set.  Protected by css_set_lock.
+	 * On the default hierarhcy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
 	 */
-	struct list_head cgrp_links;
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
 
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
 
 	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+	struct list_head cgrp_links;
 
 	/*
 	 * List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
 	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
-	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* all css_task_iters currently walking this cset */
-	struct list_head task_iters;
-
 	/* dead and being drained, ignore for migration */
 	bool dead;
 
@@ -388,6 +388,9 @@ struct cftype {
 	struct list_head node;		/* anchored at ss->cfts */
 	struct kernfs_ops *kf_ops;
 
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c83c23f0577b..f6b43fbb141c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
  * @leader: the loop cursor
  * @dst_css: the destination css
- * @tset: takset to iterate
+ * @tset: taskset to iterate
  *
  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
  * may not contain any.
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+	RDMACG_RESOURCE_HCA_HANDLE,
+	RDMACG_RESOURCE_HCA_OBJECT,
+	RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * head to keep track of all resource pools
+	 * that belongs to this cgroup.
+	 */
+	struct list_head		rpools;
+};
+
+struct rdmacg_device {
+	struct list_head	dev_node;
+	struct list_head	rpools;
+	char			*name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index);
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 7056238fd9f5..a9b11b8d06f2 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
 	KERNFS_EMPTY_DIR	= 0x1000,
+	KERNFS_HAS_RELEASE	= 0x2000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
 	/* published fields */
 	struct kernfs_node	*kn;
 	struct file		*file;
+	struct seq_file		*seq_file;
 	void			*priv;
 
 	/* private fields, do not use outside kernfs proper */
@@ -185,12 +187,20 @@ struct kernfs_open_file {
 	char			*prealloc_buf;
 
 	size_t			atomic_write_len;
-	bool			mmapped;
+	bool			mmapped:1;
+	bool			released:1;
 	const struct vm_operations_struct *vm_ops;
 };
 
 struct kernfs_ops {
 	/*
+	 * Optional open/release methods.  Both are called with
+	 * @of->seq_file populated.
+	 */
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
+	/*
 	 * Read is handled by either seq_file or raw_read().
 	 *
 	 * If seq_show() is present, seq_file path is active.  Other seq
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d84849c5dc05..0f1813c13687 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
 
 struct ib_umem;
 
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+	struct rdma_cgroup	*cg;		/* owner rdma cgroup */
+#endif
+};
+
 struct ib_ucontext {
 	struct ib_device       *device;
 	struct list_head	pd_list;
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
 	struct list_head	no_private_counters;
 	int                     odp_mrs_count;
 #endif
+
+	struct ib_rdmacg_object	cg_obj;
 };
 
 struct ib_uobject {
@@ -1395,6 +1404,7 @@ struct ib_uobject {
 	struct ib_ucontext     *context;	/* associated user context */
 	void		       *object;		/* containing object */
 	struct list_head	list;		/* link to context's list */
+	struct ib_rdmacg_object	cg_obj;		/* rdmacg object */
 	int			id;		/* index into kernel idr */
 	struct kref		ref;
 	struct rw_semaphore	mutex;		/* protects .live */
@@ -2128,6 +2138,10 @@ struct ib_device {
 	struct attribute_group	     *hw_stats_ag;
 	struct rdma_hw_stats         *hw_stats;
 
+#ifdef CONFIG_CGROUP_RDMA
+	struct rdmacg_device         cg_device;
+#endif
+
 	/**
 	 * The following mandatory functions are used only at device
 	 * registration.  Keep functions such as these at the end of this
diff --git a/init/Kconfig b/init/Kconfig
index 8c39615165b7..a92f27da4a27 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1078,6 +1078,16 @@ config CGROUP_PIDS
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
+config CGROUP_RDMA
+	bool "RDMA controller"
+	help
+	  Provides enforcement of RDMA resources defined by IB stack.
+	  It is fairly easy for consumers to exhaust RDMA resources, which
+	  can result into resource unavailability to other consumers.
+	  RDMA controller is designed to stop this from happening.
+	  Attaching processes with active RDMA resources to the cgroup
+	  hierarchy is allowed even if can cross the hierarchy's limit.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..387348a40c64
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..9203bfb05603
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+	/* the src and dst cset list running through cset->mg_node */
+	struct list_head	src_csets;
+	struct list_head	dst_csets;
+
+	/* the subsys currently being processed */
+	int			ssid;
+
+	/*
+	 * Fields for cgroup_taskset_*() iteration.
+	 *
+	 * Before migration is committed, the target migration tasks are on
+	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
+	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
+	 * or ->dst_csets depending on whether migration is committed.
+	 *
+	 * ->cur_csets and ->cur_task point to the current task position
+	 * during iteration.
+	 */
+	struct list_head	*csets;
+	struct css_set		*cur_cset;
+	struct task_struct	*cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+	/*
+	 * Preloaded source and destination csets.  Used to guarantee
+	 * atomic success or failure on actual migration.
+	 */
+	struct list_head	preloaded_src_csets;
+	struct list_head	preloaded_dst_csets;
+
+	/* tasks and csets to migrate */
+	struct cgroup_taskset	tset;
+
+	/* subsystems affected by migration */
+	u16			ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset)						\
+{										\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),		\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),		\
+	.csets			= &tset.src_csets,				\
+}
+
+#define CGROUP_MGCTX_INIT(name)							\
+{										\
+	LIST_HEAD_INIT(name.preloaded_src_csets),				\
+	LIST_HEAD_INIT(name.preloaded_dst_csets),				\
+	CGROUP_TASKSET_INIT(name.tset),						\
+}
+
+#define DEFINE_CGROUP_MGCTX(name)						\
+	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+struct cgroup_sb_opts {
+	u16 subsys_mask;
+	unsigned int flags;
+	char *release_agent;
+	bool cpuset_clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
+	list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+	return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+	unsigned long flags;
+
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cset->refcount, -1, 1))
+		return;
+
+	spin_lock_irqsave(&css_set_lock, flags);
+	put_css_set_locked(cset);
+	spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+	atomic_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+				     struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+			  struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+			       struct cgroup_root *root, unsigned long magic,
+			       struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+			    struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+		   struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+		       bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+			   loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+		     struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+			     void *data, unsigned long magic,
+			     struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..fc34bcf2329f
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1395 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+	return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+	struct cgroup_root *root;
+	int retval = 0;
+
+	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+	for_each_root(root) {
+		struct cgroup *from_cgrp;
+
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		spin_lock_irq(&css_set_lock);
+		from_cgrp = task_cgroup_from_root(from, root);
+		spin_unlock_irq(&css_set_lock);
+
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
+		if (retval)
+			break;
+	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup.  No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+	DEFINE_CGROUP_MGCTX(mgctx);
+	struct cgrp_cset_link *link;
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret;
+
+	if (cgroup_on_dfl(to))
+		return -EINVAL;
+
+	if (!cgroup_may_migrate_to(to))
+		return -EBUSY;
+
+	mutex_lock(&cgroup_mutex);
+
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	/* all tasks in @from are being moved, all csets are source */
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &from->cset_links, cset_link)
+		cgroup_migrate_add_src(link->cset, to, &mgctx);
+	spin_unlock_irq(&css_set_lock);
+
+	ret = cgroup_migrate_prepare_dst(&mgctx);
+	if (ret)
+		goto out_err;
+
+	/*
+	 * Migrate tasks one-by-one until @from is empty.  This fails iff
+	 * ->can_attach() fails.
+	 */
+	do {
+		css_task_iter_start(&from->self, &it);
+		task = css_task_iter_next(&it);
+		if (task)
+			get_task_struct(task);
+		css_task_iter_end(&it);
+
+		if (task) {
+			ret = cgroup_migrate(task, false, &mgctx);
+			if (!ret)
+				trace_cgroup_transfer_tasks(to, task, false);
+			put_task_struct(task);
+		}
+	} while (task && !ret);
+out_err:
+	cgroup_migrate_finish(&mgctx);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	*/
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+	kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+
+	/*
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
+	 */
+	if (!delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+	int src, dest = 1;
+
+	/*
+	 * we presume the 0th element is unique, so i starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
+	 */
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+						enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
+
+	/* entry not found; create a new one */
+	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l)
+		return l;
+
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+	l->key.type = type;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+	int count = 0;
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
+		count += atomic_read(&link->cset->refcount);
+	spin_unlock_irq(&css_set_lock);
+	return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	length = cgroup_task_count(cgrp);
+	array = pidlist_allocate(length);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		if (unlikely(n == length))
+			break;
+		/* get tgid or pid for procs or tasks file respectively */
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
+	}
+	css_task_iter_end(&it);
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (type == CGROUP_FILE_PROCS)
+		length = pidlist_uniq(array, length);
+
+	l = cgroup_pidlist_find_create(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
+	}
+
+	/* store array, freeing old if necessary */
+	pidlist_free(l->list);
+	l->list = array;
+	l->length = length;
+	*lp = l;
+	return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct kernfs_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct cgroup_pidlist *l;
+	enum cgroup_filetype type = seq_cft(s)->private;
+	int index = 0, pid = *pos;
+	int *iter, ret;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+
+	/*
+	 * !NULL @of->priv indicates that this isn't the first start()
+	 * after open.  If the matching pidlist is around, we can use that.
+	 * Look for it.  Note that @of->priv can't be used directly.  It
+	 * could already have been destroyed.
+	 */
+	if (of->priv)
+		of->priv = cgroup_pidlist_find(cgrp, type);
+
+	/*
+	 * Either this is the first start() after open or the matching
+	 * pidlist has been destroyed inbetween.  Create a new one.
+	 */
+	if (!of->priv) {
+		ret = pidlist_array_load(cgrp, type,
+					 (struct cgroup_pidlist **)&of->priv);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	l = of->priv;
+
+	if (pid) {
+		int end = l->length;
+
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (l->list[mid] == pid) {
+				index = mid;
+				break;
+			} else if (l->list[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= l->length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = l->list + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+
+	if (l)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+	seq_printf(s, "%d\n", *(int *)v);
+
+	return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes, loff_t off)
+{
+	struct cgroup *cgrp;
+
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+	spin_lock(&release_agent_path_lock);
+	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+		sizeof(cgrp->root->release_agent_path));
+	spin_unlock(&release_agent_path_lock);
+	cgroup_kn_unlock(of->kn);
+	return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+	spin_lock(&release_agent_path_lock);
+	seq_puts(seq, cgrp->root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+	seq_puts(seq, "0\n");
+	return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+					  struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+				       struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+	{
+		.name = "cgroup.procs",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_PROCS,
+		.write = cgroup_procs_write,
+	},
+	{
+		.name = "cgroup.clone_children",
+		.read_u64 = cgroup_clone_children_read,
+		.write_u64 = cgroup_clone_children_write,
+	},
+	{
+		.name = "cgroup.sane_behavior",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_sane_behavior_show,
+	},
+	{
+		.name = "tasks",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_TASKS,
+		.write = cgroup_tasks_write,
+	},
+	{
+		.name = "notify_on_release",
+		.read_u64 = cgroup_read_notify_on_release,
+		.write_u64 = cgroup_write_notify_on_release,
+	},
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_release_agent_show,
+		.write = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX - 1,
+	},
+	{ }	/* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	/*
+	 * ideally we don't want subsystems moving around while we do this.
+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+	 * subsys/hierarchy state.
+	 */
+	mutex_lock(&cgroup_mutex);
+
+	for_each_subsys(ss, i)
+		seq_printf(m, "%s\t%d\t%d\t%d\n",
+			   ss->legacy_name, ss->root->hierarchy_id,
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
+
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+	.open = cgroupstats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup *cgrp;
+	struct css_task_iter it;
+	struct task_struct *tsk;
+
+	/* it should be kernfs_node belonging to cgroupfs and is a directory */
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
+		return -EINVAL;
+
+	mutex_lock(&cgroup_mutex);
+
+	/*
+	 * We aren't being called from kernfs and there's no guarantee on
+	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
+	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
+	 */
+	rcu_read_lock();
+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+	if (!cgrp || cgroup_is_dead(cgrp)) {
+		rcu_read_unlock();
+		mutex_unlock(&cgroup_mutex);
+		return -ENOENT;
+	}
+	rcu_read_unlock();
+
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		switch (tsk->state) {
+		case TASK_RUNNING:
+			stats->nr_running++;
+			break;
+		case TASK_INTERRUPTIBLE:
+			stats->nr_sleeping++;
+			break;
+		case TASK_UNINTERRUPTIBLE:
+			stats->nr_uninterruptible++;
+			break;
+		case TASK_STOPPED:
+			stats->nr_stopped++;
+			break;
+		default:
+			if (delayacct_is_task_waiting_on_io(tsk))
+				stats->nr_io_wait++;
+			break;
+		}
+	}
+	css_task_iter_end(&it);
+
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+		schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+	struct cgroup *cgrp =
+		container_of(work, struct cgroup, release_agent_work);
+	char *pathbuf = NULL, *agentbuf = NULL;
+	char *argv[3], *envp[3];
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+
+	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out;
+
+	spin_lock_irq(&css_set_lock);
+	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	if (ret < 0 || ret >= PATH_MAX)
+		goto out;
+
+	argv[0] = agentbuf;
+	argv[1] = pathbuf;
+	argv[2] = NULL;
+
+	/* minimal command environment */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	mutex_unlock(&cgroup_mutex);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	goto out_free;
+out:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(agentbuf);
+	kfree(pathbuf);
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+			  const char *new_name_str)
+{
+	struct cgroup *cgrp = kn->priv;
+	int ret;
+
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return -ENOTDIR;
+	if (kn->parent != new_parent)
+		return -EIO;
+
+	/*
+	 * We're gonna grab cgroup_mutex which nests outside kernfs
+	 * active_ref.  kernfs_rename() doesn't require active_ref
+	 * protection.  Break them before grabbing cgroup_mutex.
+	 */
+	kernfs_break_active_protection(new_parent);
+	kernfs_break_active_protection(kn);
+
+	mutex_lock(&cgroup_mutex);
+
+	ret = kernfs_rename(kn, new_parent, new_name_str);
+	if (!ret)
+		trace_cgroup_rename(cgrp);
+
+	mutex_unlock(&cgroup_mutex);
+
+	kernfs_unbreak_active_protection(kn);
+	kernfs_unbreak_active_protection(new_parent);
+	return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	for_each_subsys(ss, ssid)
+		if (root->subsys_mask & (1 << ssid))
+			seq_show_option(seq, ss->legacy_name, NULL);
+	if (root->flags & CGRP_ROOT_NOPREFIX)
+		seq_puts(seq, ",noprefix");
+	if (root->flags & CGRP_ROOT_XATTR)
+		seq_puts(seq, ",xattr");
+
+	spin_lock(&release_agent_path_lock);
+	if (strlen(root->release_agent_path))
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+		seq_puts(seq, ",clone_children");
+	if (strlen(root->name))
+		seq_show_option(seq, "name", root->name);
+	return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+	char *token, *o = data;
+	bool all_ss = false, one_ss = false;
+	u16 mask = U16_MAX;
+	struct cgroup_subsys *ss;
+	int nr_opts = 0;
+	int i;
+
+#ifdef CONFIG_CPUSETS
+	mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+	memset(opts, 0, sizeof(*opts));
+
+	while ((token = strsep(&o, ",")) != NULL) {
+		nr_opts++;
+
+		if (!*token)
+			return -EINVAL;
+		if (!strcmp(token, "none")) {
+			/* Explicitly have no subsystems */
+			opts->none = true;
+			continue;
+		}
+		if (!strcmp(token, "all")) {
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (one_ss)
+				return -EINVAL;
+			all_ss = true;
+			continue;
+		}
+		if (!strcmp(token, "noprefix")) {
+			opts->flags |= CGRP_ROOT_NOPREFIX;
+			continue;
+		}
+		if (!strcmp(token, "clone_children")) {
+			opts->cpuset_clone_children = true;
+			continue;
+		}
+		if (!strcmp(token, "xattr")) {
+			opts->flags |= CGRP_ROOT_XATTR;
+			continue;
+		}
+		if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent =
+				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
+			continue;
+		}
+		if (!strncmp(token, "name=", 5)) {
+			const char *name = token + 5;
+			/* Can't specify an empty name */
+			if (!strlen(name))
+				return -EINVAL;
+			/* Must match [\w.-]+ */
+			for (i = 0; i < strlen(name); i++) {
+				char c = name[i];
+				if (isalnum(c))
+					continue;
+				if ((c == '.') || (c == '-') || (c == '_'))
+					continue;
+				return -EINVAL;
+			}
+			/* Specifying two names is forbidden */
+			if (opts->name)
+				return -EINVAL;
+			opts->name = kstrndup(name,
+					      MAX_CGROUP_ROOT_NAMELEN - 1,
+					      GFP_KERNEL);
+			if (!opts->name)
+				return -ENOMEM;
+
+			continue;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->legacy_name))
+				continue;
+			if (!cgroup_ssid_enabled(i))
+				continue;
+			if (cgroup1_ssid_disabled(i))
+				continue;
+
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (all_ss)
+				return -EINVAL;
+			opts->subsys_mask |= (1 << i);
+			one_ss = true;
+
+			break;
+		}
+		if (i == CGROUP_SUBSYS_COUNT)
+			return -ENOENT;
+	}
+
+	/*
+	 * If the 'all' option was specified select all the subsystems,
+	 * otherwise if 'none', 'name=' and a subsystem name options were
+	 * not specified, let's default to 'all'
+	 */
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+				opts->subsys_mask |= (1 << i);
+
+	/*
+	 * We either have to specify by name or by subsystems. (So all
+	 * empty hierarchies must have a name).
+	 */
+	if (!opts->subsys_mask && !opts->name)
+		return -EINVAL;
+
+	/*
+	 * Option noprefix was introduced just for backward compatibility
+	 * with the old cpuset, so we allow noprefix only if mounting just
+	 * the cpuset subsystem.
+	 */
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+		return -EINVAL;
+
+	/* Can't specify "none" and some subsystems */
+	if (opts->subsys_mask && opts->none)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+	int ret = 0;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_sb_opts opts;
+	u16 added_mask, removed_mask;
+
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* See what subsystems are wanted */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+			task_tgid_nr(current), current->comm);
+
+	added_mask = opts.subsys_mask & ~root->subsys_mask;
+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+	/* Don't allow flags or name to change at remount */
+	if ((opts.flags ^ root->flags) ||
+	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+		       opts.flags, opts.name ?: "", root->flags, root->name);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* remounting is not allowed for populated hierarchies */
+	if (!list_empty(&root->cgrp.self.children)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	ret = rebind_subsystems(root, added_mask);
+	if (ret)
+		goto out_unlock;
+
+	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+	if (opts.release_agent) {
+		spin_lock(&release_agent_path_lock);
+		strcpy(root->release_agent_path, opts.release_agent);
+		spin_unlock(&release_agent_path_lock);
+	}
+
+	trace_cgroup_remount(root);
+
+ out_unlock:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+	.rename			= cgroup1_rename,
+	.show_options		= cgroup1_show_options,
+	.remount_fs		= cgroup1_remount,
+	.mkdir			= cgroup_mkdir,
+	.rmdir			= cgroup_rmdir,
+	.show_path		= cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+			     void *data, unsigned long magic,
+			     struct cgroup_namespace *ns)
+{
+	struct super_block *pinned_sb = NULL;
+	struct cgroup_sb_opts opts;
+	struct cgroup_root *root;
+	struct cgroup_subsys *ss;
+	struct dentry *dentry;
+	int i, ret;
+
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* First find the desired set of subsystems */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Destruction of cgroup root is asynchronous, so subsystems may
+	 * still be dying after the previous unmount.  Let's drain the
+	 * dying subsystems.  We just need to ensure that the ones
+	 * unmounted previously finish dying and don't care about new ones
+	 * starting.  Testing ref liveliness is good enough.
+	 */
+	for_each_subsys(ss, i) {
+		if (!(opts.subsys_mask & (1 << i)) ||
+		    ss->root == &cgrp_dfl_root)
+			continue;
+
+		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+		cgroup_put(&ss->root->cgrp);
+	}
+
+	for_each_root(root) {
+		bool name_match = false;
+
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		/*
+		 * If we asked for a name then it must match.  Also, if
+		 * name matches but sybsys_mask doesn't, we should fail.
+		 * Remember whether name matched.
+		 */
+		if (opts.name) {
+			if (strcmp(opts.name, root->name))
+				continue;
+			name_match = true;
+		}
+
+		/*
+		 * If we asked for subsystems (or explicitly for no
+		 * subsystems) then they must match.
+		 */
+		if ((opts.subsys_mask || opts.none) &&
+		    (opts.subsys_mask != root->subsys_mask)) {
+			if (!name_match)
+				continue;
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+
+		if (root->flags ^ opts.flags)
+			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+		/*
+		 * We want to reuse @root whose lifetime is governed by its
+		 * ->cgrp.  Let's check whether @root is alive and keep it
+		 * that way.  As cgroup_kill_sb() can happen anytime, we
+		 * want to block it by pinning the sb so that @root doesn't
+		 * get killed before mount is complete.
+		 *
+		 * With the sb pinned, tryget_live can reliably indicate
+		 * whether @root can be reused.  If it's being killed,
+		 * drain it.  We can use wait_queue for the wait but this
+		 * path is super cold.  Let's just sleep a bit and retry.
+		 */
+		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+		if (IS_ERR(pinned_sb) ||
+		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			if (!IS_ERR_OR_NULL(pinned_sb))
+				deactivate_super(pinned_sb);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+
+		ret = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * No such thing, create a new one.  name= matching without subsys
+	 * specification is allowed for already existing hierarchies but we
+	 * can't create new one without subsys specification.
+	 */
+	if (!opts.subsys_mask && !opts.none) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Hierarchies may only be created in the initial cgroup namespace. */
+	if (ns != &init_cgroup_ns) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	init_cgroup_root(root, &opts);
+
+	ret = cgroup_setup_root(root, opts.subsys_mask);
+	if (ret)
+		cgroup_free_root(root);
+
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+				 CGROUP_SUPER_MAGIC, ns);
+
+	/*
+	 * If @pinned_sb, we're reusing an existing root and holding an
+	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 */
+	if (pinned_sb)
+		deactivate_super(pinned_sb);
+
+	return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+	/*
+	 * Used to destroy pidlists and separate to serve as flush domain.
+	 * Cap @max_active to 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+	struct cgroup_subsys *ss;
+	char *token;
+	int i;
+
+	while ((token = strsep(&str, ",")) != NULL) {
+		if (!*token)
+			continue;
+
+		if (!strcmp(token, "all")) {
+			cgroup_no_v1_mask = U16_MAX;
+			break;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			cgroup_no_v1_mask |= 1 << i;
+		}
+	}
+	return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = atomic_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
+
+		seq_printf(seq, "css_set %p\n", cset);
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+		continue;
+	overflow:
+		seq_puts(seq, "  ...\n");
+	}
+	spin_unlock_irq(&css_set_lock);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
+	.legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
index 53bbca7c4859..e8f87bf9840c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,13 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
 #include <linux/cred.h>
-#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
-#include <linux/list.h>
 #include <linux/magic.h>
-#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
@@ -47,16 +45,9 @@
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
 #include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
-#include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
@@ -67,14 +58,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
 
-/*
- * pidlists linger the following amount before being destroyed.  The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
-
 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
 					 MAX_CFTYPE_NAME + 2)
 
@@ -88,14 +71,12 @@
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
-#ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-/*
- * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
- * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 
-/*
- * pidlist destructions need to be flushed on cgroup destruction.  Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_visible;
 
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
 
 /* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
 static u64 css_serial_nr_next = 1;
 
 /*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
 static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
 	.root_cset	= &init_css_set,
 };
 
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
 static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  * is fine for individual subsystems but unsuitable for cgroup core.  This
  * is slower static_key_enabled() based test indexed by @ssid.
  */
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
 {
 	if (CGROUP_SUBSYS_COUNT == 0)
 		return false;
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
-static bool cgroup_ssid_no_v1(int ssid)
-{
-	return cgroup_no_v1_mask & (1 << ssid);
-}
-
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
  *
  * - debug: disallowed on the default hierarchy.
  */
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
 	return cgrp->root == &cgrp_dfl_root;
 }
@@ -481,12 +435,6 @@ out_unlock:
 	return css;
 }
 
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
-	return !(cgrp->self.flags & CSS_ONLINE);
-}
-
 static void cgroup_get(struct cgroup *cgrp)
 {
 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 }
 EXPORT_SYMBOL_GPL(of_css);
 
-static int notify_on_release(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
 /**
  * for_each_css - iterate all css's of a cgroup
  * @css: the iteration cursor
@@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 		else
 
 /**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid)					\
-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
-	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
  * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	}								\
 } while (false)
 
-/* iterate across the hierarchies */
-#define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
-
 /* iterate over child cgrps, lock should be held throughout iteration */
 #define cgroup_for_each_live_child(child, cgrp)				\
 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 			;						\
 		else
 
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies.  In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
-	/* the cgroup and css_set this link associates */
-	struct cgroup		*cgrp;
-	struct css_set		*cset;
-
-	/* list of cgrp_cset_links anchored at cgrp->cset_links */
-	struct list_head	cset_link;
-
-	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
-	struct list_head	cgrp_link;
-};
-
 /*
  * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
  */
 struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		check_for_release(cgrp);
+		cgroup1_check_for_release(cgrp);
 		cgroup_file_notify(&cgrp->events_file);
 
 		cgrp = cgroup_parent(cgrp);
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 	struct cgroup_subsys *ss;
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
 	kfree_rcu(cset, rcu_head);
 }
 
-static void put_css_set(struct css_set *cset)
-{
-	unsigned long flags;
-
-	/*
-	 * Ensure that the refcount doesn't hit zero while any readers
-	 * can see it. Similar to atomic_dec_and_lock(), but for an
-	 * rwlock
-	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
-		return;
-
-	spin_lock_irqsave(&css_set_lock, flags);
-	put_css_set_locked(cset);
-	spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
-	atomic_inc(&cset->refcount);
-}
-
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	}
 
 	atomic_set(&cset->refcount, 1);
-	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->mg_preload_node);
-	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
 
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
 		idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_lock held.
  */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
-					    struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+				     struct cgroup_root *root)
 {
 	/*
 	 * No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  */
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
 
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
  * inaccessible any time.  If the caller intends to continue to access the
  * cgroup, it should pin it before invoking this function.
  */
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
-					  bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
 {
 	struct cgroup *cgrp;
 
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
 	if (!css->ss) {
 		if (cgroup_on_dfl(cgrp))
-			cfts = cgroup_dfl_base_files;
+			cfts = cgroup_base_files;
 		else
-			cfts = cgroup_legacy_base_files;
+			cfts = cgroup1_base_files;
 
 		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
 	}
@@ -1559,7 +1439,7 @@ err:
 	return ret;
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
 	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
-			    struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+		     struct kernfs_root *kf_root)
 {
 	int len = 0;
 	char *buf = NULL;
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 }
 
-static int cgroup_show_options(struct seq_file *seq,
-			       struct kernfs_root *kf_root)
-{
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_subsys *ss;
-	int ssid;
-
-	if (root != &cgrp_dfl_root)
-		for_each_subsys(ss, ssid)
-			if (root->subsys_mask & (1 << ssid))
-				seq_show_option(seq, ss->legacy_name, NULL);
-	if (root->flags & CGRP_ROOT_NOPREFIX)
-		seq_puts(seq, ",noprefix");
-	if (root->flags & CGRP_ROOT_XATTR)
-		seq_puts(seq, ",xattr");
-
-	spin_lock(&release_agent_path_lock);
-	if (strlen(root->release_agent_path))
-		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-
-	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
-		seq_puts(seq, ",clone_children");
-	if (strlen(root->name))
-		seq_show_option(seq, "name", root->name);
-	return 0;
-}
-
-struct cgroup_sb_opts {
-	u16 subsys_mask;
-	unsigned int flags;
-	char *release_agent;
-	bool cpuset_clone_children;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
-	char *token, *o = data;
-	bool all_ss = false, one_ss = false;
-	u16 mask = U16_MAX;
-	struct cgroup_subsys *ss;
-	int nr_opts = 0;
-	int i;
-
-#ifdef CONFIG_CPUSETS
-	mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
-	memset(opts, 0, sizeof(*opts));
-
-	while ((token = strsep(&o, ",")) != NULL) {
-		nr_opts++;
-
-		if (!*token)
-			return -EINVAL;
-		if (!strcmp(token, "none")) {
-			/* Explicitly have no subsystems */
-			opts->none = true;
-			continue;
-		}
-		if (!strcmp(token, "all")) {
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (one_ss)
-				return -EINVAL;
-			all_ss = true;
-			continue;
-		}
-		if (!strcmp(token, "noprefix")) {
-			opts->flags |= CGRP_ROOT_NOPREFIX;
-			continue;
-		}
-		if (!strcmp(token, "clone_children")) {
-			opts->cpuset_clone_children = true;
-			continue;
-		}
-		if (!strcmp(token, "xattr")) {
-			opts->flags |= CGRP_ROOT_XATTR;
-			continue;
-		}
-		if (!strncmp(token, "release_agent=", 14)) {
-			/* Specifying two release agents is forbidden */
-			if (opts->release_agent)
-				return -EINVAL;
-			opts->release_agent =
-				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
-			if (!opts->release_agent)
-				return -ENOMEM;
-			continue;
-		}
-		if (!strncmp(token, "name=", 5)) {
-			const char *name = token + 5;
-			/* Can't specify an empty name */
-			if (!strlen(name))
-				return -EINVAL;
-			/* Must match [\w.-]+ */
-			for (i = 0; i < strlen(name); i++) {
-				char c = name[i];
-				if (isalnum(c))
-					continue;
-				if ((c == '.') || (c == '-') || (c == '_'))
-					continue;
-				return -EINVAL;
-			}
-			/* Specifying two names is forbidden */
-			if (opts->name)
-				return -EINVAL;
-			opts->name = kstrndup(name,
-					      MAX_CGROUP_ROOT_NAMELEN - 1,
-					      GFP_KERNEL);
-			if (!opts->name)
-				return -ENOMEM;
-
-			continue;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->legacy_name))
-				continue;
-			if (!cgroup_ssid_enabled(i))
-				continue;
-			if (cgroup_ssid_no_v1(i))
-				continue;
-
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (all_ss)
-				return -EINVAL;
-			opts->subsys_mask |= (1 << i);
-			one_ss = true;
-
-			break;
-		}
-		if (i == CGROUP_SUBSYS_COUNT)
-			return -ENOENT;
-	}
-
-	/*
-	 * If the 'all' option was specified select all the subsystems,
-	 * otherwise if 'none', 'name=' and a subsystem name options were
-	 * not specified, let's default to 'all'
-	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name))
-		for_each_subsys(ss, i)
-			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
-				opts->subsys_mask |= (1 << i);
-
-	/*
-	 * We either have to specify by name or by subsystems. (So all
-	 * empty hierarchies must have a name).
-	 */
-	if (!opts->subsys_mask && !opts->name)
-		return -EINVAL;
-
-	/*
-	 * Option noprefix was introduced just for backward compatibility
-	 * with the old cpuset, so we allow noprefix only if mounting just
-	 * the cpuset subsystem.
-	 */
-	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
-		return -EINVAL;
-
-	/* Can't specify "none" and some subsystems */
-	if (opts->subsys_mask && opts->none)
-		return -EINVAL;
-
-	return 0;
-}
-
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-	int ret = 0;
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_sb_opts opts;
-	u16 added_mask, removed_mask;
-
-	if (root == &cgrp_dfl_root) {
-		pr_err("remount is not allowed\n");
-		return -EINVAL;
-	}
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* See what subsystems are wanted */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
-		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
-			task_tgid_nr(current), current->comm);
-
-	added_mask = opts.subsys_mask & ~root->subsys_mask;
-	removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
-	/* Don't allow flags or name to change at remount */
-	if ((opts.flags ^ root->flags) ||
-	    (opts.name && strcmp(opts.name, root->name))) {
-		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-		       opts.flags, opts.name ?: "", root->flags, root->name);
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* remounting is not allowed for populated hierarchies */
-	if (!list_empty(&root->cgrp.self.children)) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-
-	ret = rebind_subsystems(root, added_mask);
-	if (ret)
-		goto out_unlock;
-
-	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
-	if (opts.release_agent) {
-		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, opts.release_agent);
-		spin_unlock(&release_agent_path_lock);
-	}
-
-	trace_cgroup_remount(root);
-
- out_unlock:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
+	pr_err("remount is not allowed\n");
+	return -EINVAL;
 }
 
 /*
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
-static void init_cgroup_root(struct cgroup_root *root,
-			     struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 {
 	struct cgroup *cgrp = &root->cgrp;
 
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
+	struct kernfs_syscall_ops *kf_sops;
 	struct css_set *cset;
 	int i, ret;
 
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	if (ret)
 		goto cancel_ref;
 
-	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+	kf_sops = root == &cgrp_dfl_root ?
+		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+	root->kf_root = kernfs_create_root(kf_sops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1736,18 @@ out:
 	return ret;
 }
 
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
-			 int flags, const char *unused_dev_name,
-			 void *data)
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+			       struct cgroup_root *root, unsigned long magic,
+			       struct cgroup_namespace *ns)
 {
-	bool is_v2 = fs_type == &cgroup2_fs_type;
-	struct super_block *pinned_sb = NULL;
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-	struct cgroup_subsys *ss;
-	struct cgroup_root *root;
-	struct cgroup_sb_opts opts;
 	struct dentry *dentry;
-	int ret;
-	int i;
 	bool new_sb;
 
-	get_cgroup_ns(ns);
-
-	/* Check if the caller has permission to mount. */
-	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
-		put_cgroup_ns(ns);
-		return ERR_PTR(-EPERM);
-	}
-
-	/*
-	 * The first time anyone tries to mount a cgroup, enable the list
-	 * linking each css_set to its tasks and fix up all existing tasks.
-	 */
-	if (!use_task_css_set_links)
-		cgroup_enable_task_cg_lists();
-
-	if (is_v2) {
-		if (data) {
-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
-			put_cgroup_ns(ns);
-			return ERR_PTR(-EINVAL);
-		}
-		cgrp_dfl_visible = true;
-		root = &cgrp_dfl_root;
-		cgroup_get(&root->cgrp);
-		goto out_mount;
-	}
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* First find the desired set of subsystems */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	/*
-	 * Destruction of cgroup root is asynchronous, so subsystems may
-	 * still be dying after the previous unmount.  Let's drain the
-	 * dying subsystems.  We just need to ensure that the ones
-	 * unmounted previously finish dying and don't care about new ones
-	 * starting.  Testing ref liveliness is good enough.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts.subsys_mask & (1 << i)) ||
-		    ss->root == &cgrp_dfl_root)
-			continue;
-
-		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-		cgroup_put(&ss->root->cgrp);
-	}
-
-	for_each_root(root) {
-		bool name_match = false;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		/*
-		 * If we asked for a name then it must match.  Also, if
-		 * name matches but sybsys_mask doesn't, we should fail.
-		 * Remember whether name matched.
-		 */
-		if (opts.name) {
-			if (strcmp(opts.name, root->name))
-				continue;
-			name_match = true;
-		}
-
-		/*
-		 * If we asked for subsystems (or explicitly for no
-		 * subsystems) then they must match.
-		 */
-		if ((opts.subsys_mask || opts.none) &&
-		    (opts.subsys_mask != root->subsys_mask)) {
-			if (!name_match)
-				continue;
-			ret = -EBUSY;
-			goto out_unlock;
-		}
-
-		if (root->flags ^ opts.flags)
-			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
-		/*
-		 * We want to reuse @root whose lifetime is governed by its
-		 * ->cgrp.  Let's check whether @root is alive and keep it
-		 * that way.  As cgroup_kill_sb() can happen anytime, we
-		 * want to block it by pinning the sb so that @root doesn't
-		 * get killed before mount is complete.
-		 *
-		 * With the sb pinned, tryget_live can reliably indicate
-		 * whether @root can be reused.  If it's being killed,
-		 * drain it.  We can use wait_queue for the wait but this
-		 * path is super cold.  Let's just sleep a bit and retry.
-		 */
-		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR(pinned_sb) ||
-		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			if (!IS_ERR_OR_NULL(pinned_sb))
-				deactivate_super(pinned_sb);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-
-		ret = 0;
-		goto out_unlock;
-	}
+	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
 
 	/*
-	 * No such thing, create a new one.  name= matching without subsys
-	 * specification is allowed for already existing hierarchies but we
-	 * can't create new one without subsys specification.
-	 */
-	if (!opts.subsys_mask && !opts.none) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Hierarchies may only be created in the initial cgroup namespace. */
-	if (ns != &init_cgroup_ns) {
-		ret = -EPERM;
-		goto out_unlock;
-	}
-
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root) {
-		ret = -ENOMEM;
-		goto out_unlock;
-	}
-
-	init_cgroup_root(root, &opts);
-
-	ret = cgroup_setup_root(root, opts.subsys_mask);
-	if (ret)
-		cgroup_free_root(root);
-
-out_unlock:
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-
-	if (ret) {
-		put_cgroup_ns(ns);
-		return ERR_PTR(ret);
-	}
-out_mount:
-	dentry = kernfs_mount(fs_type, flags, root->kf_root,
-			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
-			      &new_sb);
-
-	/*
-	 * In non-init cgroup namespace, instead of root cgroup's
-	 * dentry, we return the dentry corresponding to the
-	 * cgroupns->root_cgrp.
+	 * In non-init cgroup namespace, instead of root cgroup's dentry,
+	 * we return the dentry corresponding to the cgroupns->root_cgrp.
 	 */
 	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
 		struct dentry *nsdentry;
@@ -2277,13 +1769,45 @@ out_mount:
 	if (IS_ERR(dentry) || !new_sb)
 		cgroup_put(&root->cgrp);
 
+	return dentry;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name,
+			 void *data)
+{
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct dentry *dentry;
+
+	get_cgroup_ns(ns);
+
+	/* Check if the caller has permission to mount. */
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+		put_cgroup_ns(ns);
+		return ERR_PTR(-EPERM);
+	}
+
 	/*
-	 * If @pinned_sb, we're reusing an existing root and holding an
-	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 * The first time anyone tries to mount a cgroup, enable the list
+	 * linking each css_set to its tasks and fix up all existing tasks.
 	 */
-	if (pinned_sb) {
-		WARN_ON(new_sb);
-		deactivate_super(pinned_sb);
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
+	if (fs_type == &cgroup2_fs_type) {
+		if (data) {
+			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+			put_cgroup_ns(ns);
+			return ERR_PTR(-EINVAL);
+		}
+		cgrp_dfl_visible = true;
+		cgroup_get(&cgrp_dfl_root.cgrp);
+
+		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+					 CGROUP2_SUPER_MAGIC, ns);
+	} else {
+		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+				       CGROUP_SUPER_MAGIC, ns);
 	}
 
 	put_cgroup_ns(ns);
@@ -2311,7 +1835,7 @@ static void cgroup_kill_sb(struct super_block *sb)
 	kernfs_kill_sb(sb);
 }
 
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
@@ -2325,8 +1849,8 @@ static struct file_system_type cgroup2_fs_type = {
 	.fs_flags = FS_USERNS_MOUNT,
 };
 
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-				 struct cgroup_namespace *ns)
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+			  struct cgroup_namespace *ns)
 {
 	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
 
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
-	/* the src and dst cset list running through cset->mg_node */
-	struct list_head	src_csets;
-	struct list_head	dst_csets;
-
-	/* the subsys currently being processed */
-	int			ssid;
-
-	/*
-	 * Fields for cgroup_taskset_*() iteration.
-	 *
-	 * Before migration is committed, the target migration tasks are on
-	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
-	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
-	 * or ->dst_csets depending on whether migration is committed.
-	 *
-	 * ->cur_csets and ->cur_task point to the current task position
-	 * during iteration.
-	 */
-	struct list_head	*csets;
-	struct css_set		*cur_cset;
-	struct task_struct	*cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
-	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
-	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
-	.csets			= &tset.src_csets,			\
-}
-
 /**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
  *
- * Add @task, which is a migration target, to @tset.  This function becomes
- * noop if @task doesn't need to be migrated.  @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset.  This function
+ * becomes noop if @task doesn't need to be migrated.  @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to mg_tasks one.
  */
-static void cgroup_taskset_add(struct task_struct *task,
-			       struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+				    struct cgroup_mgctx *mgctx)
 {
 	struct css_set *cset;
 
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
 
 	list_move_tail(&task->cg_list, &cset->mg_tasks);
 	if (list_empty(&cset->mg_node))
-		list_add_tail(&cset->mg_node, &tset->src_csets);
+		list_add_tail(&cset->mg_node,
+			      &mgctx->tset.src_csets);
 	if (list_empty(&cset->mg_dst_cset->mg_node))
-		list_move_tail(&cset->mg_dst_cset->mg_node,
-			       &tset->dst_csets);
+		list_add_tail(&cset->mg_dst_cset->mg_node,
+			      &mgctx->tset.dst_csets);
 }
 
 /**
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 /**
  * cgroup_taskset_migrate - migrate a taskset
- * @tset: taget taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
  *
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
  * This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
  */
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
-				  struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 {
+	struct cgroup_taskset *tset = &mgctx->tset;
 	struct cgroup_subsys *ss;
 	struct task_struct *task, *tmp_task;
 	struct css_set *cset, *tmp_cset;
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 		return 0;
 
 	/* check that we can legitimately attach to the cgroup */
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ss->can_attach) {
 			tset->ssid = ssid;
 			ret = ss->can_attach(tset);
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 */
 	tset->csets = &tset->dst_csets;
 
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ss->attach) {
 			tset->ssid = ssid;
 			ss->attach(tset);
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	goto out_release_tset;
 
 out_cancel_attach:
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ssid == failed_ssid)
 			break;
 		if (ss->cancel_attach) {
@@ -2616,7 +2109,7 @@ out_release_tset:
  * zero for migration destination cgroups with tasks so that child cgroups
  * don't compete against tasks.
  */
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 {
 	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
 		!dst_cgrp->subtree_control;
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 
 /**
  * cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
  * those functions for details.
  */
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
 {
+	LIST_HEAD(preloaded);
 	struct css_set *cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
+
 	spin_unlock_irq(&css_set_lock);
 }
 
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * cgroup_migrate_add_src - add a migration source css_set
  * @src_cset: the source css_set to add
  * @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
  * This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * into play and the preloaded css_sets are guaranteed to cover all
  * migrations.
  */
-static void cgroup_migrate_add_src(struct css_set *src_cset,
-				   struct cgroup *dst_cgrp,
-				   struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+			    struct cgroup *dst_cgrp,
+			    struct cgroup_mgctx *mgctx)
 {
 	struct cgroup *src_cgrp;
 
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 	src_cset->mg_src_cgrp = src_cgrp;
 	src_cset->mg_dst_cgrp = dst_cgrp;
 	get_css_set(src_cset);
-	list_add(&src_cset->mg_preload_node, preloaded_csets);
+	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
  *
  * Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets.  This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @mgctx->preloaded_dst_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
  */
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 {
-	LIST_HEAD(csets);
 	struct css_set *src_cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/* look up the dst cset for each src cset and link it to src */
-	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+				 mg_preload_node) {
 		struct css_set *dst_cset;
+		struct cgroup_subsys *ss;
+		int ssid;
 
 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
 		if (!dst_cset)
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 		src_cset->mg_dst_cset = dst_cset;
 
 		if (list_empty(&dst_cset->mg_preload_node))
-			list_add(&dst_cset->mg_preload_node, &csets);
+			list_add_tail(&dst_cset->mg_preload_node,
+				      &mgctx->preloaded_dst_csets);
 		else
 			put_css_set(dst_cset);
+
+		for_each_subsys(ss, ssid)
+			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+				mgctx->ss_mask |= 1 << ssid;
 	}
 
-	list_splice_tail(&csets, preloaded_csets);
 	return 0;
 err:
-	cgroup_migrate_finish(&csets);
+	cgroup_migrate_finish(mgctx);
 	return -ENOMEM;
 }
 
@@ -2759,7 +2264,7 @@ err:
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
  *
  * Migrate a process or task denoted by @leader.  If migrating a process,
  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
@@ -2773,10 +2278,9 @@ err:
  * decided for all targets by invoking group_migrate_prepare_dst() before
  * actually starting migrating.
  */
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-			  struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+		   struct cgroup_mgctx *mgctx)
 {
-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct task_struct *task;
 
 	/*
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_taskset_add(task, &tset);
+		cgroup_migrate_add_task(task, mgctx);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 
-	return cgroup_taskset_migrate(&tset, root);
+	return cgroup_migrate_execute(mgctx);
 }
 
 /**
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
  *
  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
-			      struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+		       bool threadgroup)
 {
-	LIST_HEAD(preloaded_csets);
+	DEFINE_CGROUP_MGCTX(mgctx);
 	struct task_struct *task;
 	int ret;
 
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
-				       &preloaded_csets);
+		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&mgctx);
 	if (!ret)
-		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+		ret = cgroup_migrate(leader, threadgroup, &mgctx);
 
-	cgroup_migrate_finish(&preloaded_csets);
+	cgroup_migrate_finish(&mgctx);
 
 	if (!ret)
 		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct cgroup *dst_cgrp,
 					 struct kernfs_open_file *of)
 {
-	const struct cred *cred = current_cred();
-	const struct cred *tcred = get_task_cred(task);
 	int ret = 0;
 
-	/*
-	 * even if we're attaching all tasks in the thread group, we only
-	 * need to check permissions on one of them.
-	 */
-	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-	    !uid_eq(cred->euid, tcred->uid) &&
-	    !uid_eq(cred->euid, tcred->suid))
-		ret = -EACCES;
-
-	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+	if (cgroup_on_dfl(dst_cgrp)) {
 		struct super_block *sb = of->file->f_path.dentry->d_sb;
 		struct cgroup *cgrp;
 		struct inode *inode;
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 			ret = inode_permission(inode, MAY_WRITE);
 			iput(inode);
 		}
+	} else {
+		const struct cred *cred = current_cred();
+		const struct cred *tcred = get_task_cred(task);
+
+		/*
+		 * even if we're attaching all tasks in the thread group,
+		 * we only need to check permissions on one of them.
+		 */
+		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+		    !uid_eq(cred->euid, tcred->uid) &&
+		    !uid_eq(cred->euid, tcred->suid))
+			ret = -EACCES;
+		put_cred(tcred);
 	}
 
-	put_cred(tcred);
 	return ret;
 }
 
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
  * function to attach either it or all tasks in its threadgroup. Will lock
  * cgroup_mutex and threadgroup.
  */
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-				    size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
 	struct cgroup_subsys *ss;
@@ -2950,86 +2454,12 @@ out_unlock_threadgroup:
 	return ret ?: nbytes;
 }
 
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-	struct cgroup_root *root;
-	int retval = 0;
-
-	mutex_lock(&cgroup_mutex);
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-	for_each_root(root) {
-		struct cgroup *from_cgrp;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		spin_lock_irq(&css_set_lock);
-		from_cgrp = task_cgroup_from_root(from, root);
-		spin_unlock_irq(&css_set_lock);
-
-		retval = cgroup_attach_task(from_cgrp, tsk, false);
-		if (retval)
-			break;
-	}
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
-				  char *buf, size_t nbytes, loff_t off)
-{
-	return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
-				  char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+			   loff_t off)
 {
 	return __cgroup_procs_write(of, buf, nbytes, off, true);
 }
 
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-	spin_lock(&release_agent_path_lock);
-	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
-		sizeof(cgrp->root->release_agent_path));
-	spin_unlock(&release_agent_path_lock);
-	cgroup_kn_unlock(of->kn);
-	return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	spin_lock(&release_agent_path_lock);
-	seq_puts(seq, cgrp->root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-	seq_putc(seq, '\n');
-	return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
-	seq_puts(seq, "0\n");
-	return 0;
-}
-
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
 	struct cgroup_subsys *ss;
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
-	LIST_HEAD(preloaded_csets);
-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+	DEFINE_CGROUP_MGCTX(mgctx);
 	struct cgroup_subsys_state *d_css;
 	struct cgroup *dsct;
 	struct css_set *src_cset;
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		struct cgrp_cset_link *link;
 
 		list_for_each_entry(link, &dsct->cset_links, cset_link)
-			cgroup_migrate_add_src(link->cset, dsct,
-					       &preloaded_csets);
+			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
 	}
 	spin_unlock_irq(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&mgctx);
 	if (ret)
 		goto out_finish;
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
-		/* src_csets precede dst_csets, break on the first dst_cset */
-		if (!src_cset->mg_src_cgrp)
-			break;
-
 		/* all tasks in src_csets need to be migrated */
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
-			cgroup_taskset_add(task, &tset);
+			cgroup_migrate_add_task(task, &mgctx);
 	}
 	spin_unlock_irq(&css_set_lock);
 
-	ret = cgroup_taskset_migrate(&tset, cgrp->root);
+	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
-	cgroup_migrate_finish(&preloaded_csets);
+	cgroup_migrate_finish(&mgctx);
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
@@ -3131,7 +2555,7 @@ out_finish:
  * controller while the previous css is still around.  This function grabs
  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
  */
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
 	__acquires(&cgroup_mutex)
 {
 	struct cgroup *dsct;
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->open)
+		return cft->open(of);
+	return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->release)
+		cft->release(of);
+}
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	seq_cft(seq)->seq_stop(seq, v);
+	if (seq_cft(seq)->seq_stop)
+		seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
 static struct kernfs_ops cgroup_kf_single_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_show		= cgroup_seqfile_show,
 };
 
 static struct kernfs_ops cgroup_kf_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_start		= cgroup_seqfile_start,
 	.seq_next		= cgroup_seqfile_next,
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
 	.seq_show		= cgroup_seqfile_show,
 };
 
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
-			 const char *new_name_str)
-{
-	struct cgroup *cgrp = kn->priv;
-	int ret;
-
-	if (kernfs_type(kn) != KERNFS_DIR)
-		return -ENOTDIR;
-	if (kn->parent != new_parent)
-		return -EIO;
-
-	/*
-	 * This isn't a proper migration and its usefulness is very
-	 * limited.  Disallow on the default hierarchy.
-	 */
-	if (cgroup_on_dfl(cgrp))
-		return -EPERM;
-
-	/*
-	 * We're gonna grab cgroup_mutex which nests outside kernfs
-	 * active_ref.  kernfs_rename() doesn't require active_ref
-	 * protection.  Break them before grabbing cgroup_mutex.
-	 */
-	kernfs_break_active_protection(new_parent);
-	kernfs_break_active_protection(kn);
-
-	mutex_lock(&cgroup_mutex);
-
-	ret = kernfs_rename(kn, new_parent, new_name_str);
-	if (!ret)
-		trace_cgroup_rename(cgrp);
-
-	mutex_unlock(&cgroup_mutex);
-
-	kernfs_unbreak_active_protection(kn);
-	kernfs_unbreak_active_protection(new_parent);
-	return ret;
-}
-
 /* set uid and gid of cgroup dirs and files to that of the creator */
 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 {
@@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 }
 
 /**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
-	int count = 0;
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += atomic_read(&link->cset->refcount);
-	spin_unlock_irq(&css_set_lock);
-	return count;
-}
-
-/**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
  * @parent: css whose children to walk
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
 		put_task_struct(it->cur_task);
 }
 
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup.  No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
-	LIST_HEAD(preloaded_csets);
-	struct cgrp_cset_link *link;
-	struct css_task_iter it;
-	struct task_struct *task;
-	int ret;
-
-	if (!cgroup_may_migrate_to(to))
-		return -EBUSY;
-
-	mutex_lock(&cgroup_mutex);
-
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
-	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &from->cset_links, cset_link)
-		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	spin_unlock_irq(&css_set_lock);
-
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
-	if (ret)
-		goto out_err;
-
-	/*
-	 * Migrate tasks one-by-one until @from is empty.  This fails iff
-	 * ->can_attach() fails.
-	 */
-	do {
-		css_task_iter_start(&from->self, &it);
-		task = css_task_iter_next(&it);
-		if (task)
-			get_task_struct(task);
-		css_task_iter_end(&it);
-
-		if (task) {
-			ret = cgroup_migrate(task, false, to->root);
-			if (!ret)
-				trace_cgroup_transfer_tasks(to, task, false);
-			put_task_struct(task);
-		}
-	} while (task && !ret);
-out_err:
-	cgroup_migrate_finish(&preloaded_csets);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
-	CGROUP_FILE_PROCS,
-	CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
-	/*
-	 * used to find which pidlist is wanted. doesn't change as long as
-	 * this particular list stays in the list.
-	*/
-	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
-	/* array of xids */
-	pid_t *list;
-	/* how many elements the above list has */
-	int length;
-	/* each of these stored in a list by its cgroup */
-	struct list_head links;
-	/* pointer to the cgroup we belong to, for list removal purposes */
-	struct cgroup *owner;
-	/* for delayed destruction */
-	struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
-	if (PIDLIST_TOO_LARGE(count))
-		return vmalloc(count * sizeof(pid_t));
-	else
-		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
-	kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer.  None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
-	struct cgroup_pidlist *l, *tmp_l;
-
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
-	mutex_unlock(&cgrp->pidlist_mutex);
-
-	flush_workqueue(cgroup_pidlist_destroy_wq);
-	BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
-						destroy_dwork);
-	struct cgroup_pidlist *tofree = NULL;
-
-	mutex_lock(&l->owner->pidlist_mutex);
-
-	/*
-	 * Destroy iff we didn't get queued again.  The state won't change
-	 * as destroy_dwork can only be queued while locked.
-	 */
-	if (!delayed_work_pending(dwork)) {
-		list_del(&l->links);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		tofree = l;
-	}
-
-	mutex_unlock(&l->owner->pidlist_mutex);
-	kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
-	int src, dest = 1;
-
-	/*
-	 * we presume the 0th element is unique, so i starts at 1. trivial
-	 * edge cases first; no work needs to be done for either
-	 */
-	if (length == 0 || length == 1)
-		return length;
-	/* src and dest walk down the list; dest counts unique elements */
-	for (src = 1; src < length; src++) {
-		/* find next unique element */
-		while (list[src] == list[src-1]) {
-			src++;
-			if (src == length)
-				goto after;
-		}
-		/* dest always points to where the next unique element goes */
-		list[dest] = list[src];
-		dest++;
-	}
-after:
-	return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco.  As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer.  As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property.  In the long term, we
- * want to do away with it.  Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
+static void cgroup_procs_release(struct kernfs_open_file *of)
 {
-	unsigned a = pid & 0x55555555;
-	unsigned b = pid & 0xAAAAAAAA;
-
-	return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
-	if (cgroup_on_dfl(cgrp))
-		return pid_fry(pid);
-	else
-		return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
-	return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
-	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	list_for_each_entry(l, &cgrp->pidlists, links)
-		if (l->key.type == type && l->key.ns == ns)
-			return l;
-	return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
-						enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	l = cgroup_pidlist_find(cgrp, type);
-	if (l)
-		return l;
-
-	/* entry not found; create a new one */
-	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l)
-		return l;
-
-	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
-	l->key.type = type;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	l->key.ns = get_pid_ns(task_active_pid_ns(current));
-	l->owner = cgrp;
-	list_add(&l->links, &cgrp->pidlists);
-	return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
-			      struct cgroup_pidlist **lp)
-{
-	pid_t *array;
-	int length;
-	int pid, n = 0; /* used for populating the array */
-	struct css_task_iter it;
-	struct task_struct *tsk;
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough.  This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	length = cgroup_task_count(cgrp);
-	array = pidlist_allocate(length);
-	if (!array)
-		return -ENOMEM;
-	/* now, populate the array */
-	css_task_iter_start(&cgrp->self, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		if (unlikely(n == length))
-			break;
-		/* get tgid or pid for procs or tasks file respectively */
-		if (type == CGROUP_FILE_PROCS)
-			pid = task_tgid_vnr(tsk);
-		else
-			pid = task_pid_vnr(tsk);
-		if (pid > 0) /* make sure to only use valid results */
-			array[n++] = pid;
-	}
-	css_task_iter_end(&it);
-	length = n;
-	/* now sort & (if procs) strip out duplicates */
-	if (cgroup_on_dfl(cgrp))
-		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
-	else
-		sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(array, length);
-
-	l = cgroup_pidlist_find_create(cgrp, type);
-	if (!l) {
-		pidlist_free(array);
-		return -ENOMEM;
+	if (of->priv) {
+		css_task_iter_end(of->priv);
+		kfree(of->priv);
 	}
-
-	/* store array, freeing old if necessary */
-	pidlist_free(l->list);
-	l->list = array;
-	l->length = length;
-	*lp = l;
-	return 0;
 }
 
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
-	struct cgroup *cgrp;
-	struct css_task_iter it;
-	struct task_struct *tsk;
-
-	/* it should be kernfs_node belonging to cgroupfs and is a directory */
-	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-	    kernfs_type(kn) != KERNFS_DIR)
-		return -EINVAL;
-
-	mutex_lock(&cgroup_mutex);
-
-	/*
-	 * We aren't being called from kernfs and there's no guarantee on
-	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
-	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
-	 */
-	rcu_read_lock();
-	cgrp = rcu_dereference(kn->priv);
-	if (!cgrp || cgroup_is_dead(cgrp)) {
-		rcu_read_unlock();
-		mutex_unlock(&cgroup_mutex);
-		return -ENOENT;
-	}
-	rcu_read_unlock();
+	struct kernfs_open_file *of = s->private;
+	struct css_task_iter *it = of->priv;
+	struct task_struct *task;
 
-	css_task_iter_start(&cgrp->self, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		switch (tsk->state) {
-		case TASK_RUNNING:
-			stats->nr_running++;
-			break;
-		case TASK_INTERRUPTIBLE:
-			stats->nr_sleeping++;
-			break;
-		case TASK_UNINTERRUPTIBLE:
-			stats->nr_uninterruptible++;
-			break;
-		case TASK_STOPPED:
-			stats->nr_stopped++;
-			break;
-		default:
-			if (delayacct_is_task_waiting_on_io(tsk))
-				stats->nr_io_wait++;
-			break;
-		}
-	}
-	css_task_iter_end(&it);
+	do {
+		task = css_task_iter_next(it);
+	} while (task && !thread_group_leader(task));
 
-	mutex_unlock(&cgroup_mutex);
-	return 0;
+	return task;
 }
 
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
 {
-	/*
-	 * Initially we receive a position value that corresponds to
-	 * one more than the last pid shown (or 0 on the first call or
-	 * after a seek to the start). Use a binary-search to find the
-	 * next pid to display, if any
-	 */
 	struct kernfs_open_file *of = s->private;
 	struct cgroup *cgrp = seq_css(s)->cgroup;
-	struct cgroup_pidlist *l;
-	enum cgroup_filetype type = seq_cft(s)->private;
-	int index = 0, pid = *pos;
-	int *iter, ret;
-
-	mutex_lock(&cgrp->pidlist_mutex);
+	struct css_task_iter *it = of->priv;
 
 	/*
-	 * !NULL @of->priv indicates that this isn't the first start()
-	 * after open.  If the matching pidlist is around, we can use that.
-	 * Look for it.  Note that @of->priv can't be used directly.  It
-	 * could already have been destroyed.
+	 * When a seq_file is seeked, it's always traversed sequentially
+	 * from position 0, so we can simply keep iterating on !0 *pos.
 	 */
-	if (of->priv)
-		of->priv = cgroup_pidlist_find(cgrp, type);
-
-	/*
-	 * Either this is the first start() after open or the matching
-	 * pidlist has been destroyed inbetween.  Create a new one.
-	 */
-	if (!of->priv) {
-		ret = pidlist_array_load(cgrp, type,
-					 (struct cgroup_pidlist **)&of->priv);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-	l = of->priv;
-
-	if (pid) {
-		int end = l->length;
-
-		while (index < end) {
-			int mid = (index + end) / 2;
-			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
-				index = mid;
-				break;
-			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
-				index = mid + 1;
-			else
-				end = mid;
-		}
-	}
-	/* If we're off the end of the array, we're done */
-	if (index >= l->length)
-		return NULL;
-	/* Update the abstract position to be the actual pid that we found */
-	iter = l->list + index;
-	*pos = cgroup_pid_fry(cgrp, *iter);
-	return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_pidlist *l = of->priv;
-
-	if (l)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
-				 CGROUP_PIDLIST_DESTROY_DELAY);
-	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
+	if (!it) {
+		if (WARN_ON_ONCE((*pos)++))
+			return ERR_PTR(-EINVAL);
 
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_pidlist *l = of->priv;
-	pid_t *p = v;
-	pid_t *end = l->list + l->length;
-	/*
-	 * Advance to the next pid in the array. If this goes off the
-	 * end, we're done
-	 */
-	p++;
-	if (p >= end) {
-		return NULL;
-	} else {
-		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
-		return p;
+		it = kzalloc(sizeof(*it), GFP_KERNEL);
+		if (!it)
+			return ERR_PTR(-ENOMEM);
+		of->priv = it;
+		css_task_iter_start(&cgrp->self, it);
+	} else if (!(*pos)++) {
+		css_task_iter_end(it);
+		css_task_iter_start(&cgrp->self, it);
 	}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
-	seq_printf(s, "%d\n", *(int *)v);
 
-	return 0;
+	return cgroup_procs_next(s, NULL, NULL);
 }
 
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
+static int cgroup_procs_show(struct seq_file *s, void *v)
 {
-	return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
-					  struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
-				      struct cftype *cft)
-{
-	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
-				       struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	seq_printf(s, "%d\n", task_tgid_vnr(v));
 	return 0;
 }
 
 /* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.procs",
 		.file_offset = offsetof(struct cgroup, procs_file),
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
+		.release = cgroup_procs_release,
+		.seq_start = cgroup_procs_start,
+		.seq_next = cgroup_procs_next,
+		.seq_show = cgroup_procs_show,
 		.write = cgroup_procs_write,
 	},
 	{
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
 	{ }	/* terminate */
 };
 
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
-	{
-		.name = "cgroup.procs",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
-		.write = cgroup_procs_write,
-	},
-	{
-		.name = "cgroup.clone_children",
-		.read_u64 = cgroup_clone_children_read,
-		.write_u64 = cgroup_clone_children_write,
-	},
-	{
-		.name = "cgroup.sane_behavior",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_sane_behavior_show,
-	},
-	{
-		.name = "tasks",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_TASKS,
-		.write = cgroup_tasks_write,
-	},
-	{
-		.name = "notify_on_release",
-		.read_u64 = cgroup_read_notify_on_release,
-		.write_u64 = cgroup_write_notify_on_release,
-	},
-	{
-		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_release_agent_show,
-		.write = cgroup_release_agent_write,
-		.max_write_len = PATH_MAX - 1,
-	},
-	{ }	/* terminate */
-};
-
 /*
  * css destruction is four-stage process.
  *
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-		cgroup_pidlist_destroy_all(cgrp);
+		cgroup1_pidlist_destroy_all(cgrp);
 		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
@@ -5302,8 +4150,7 @@ out_free_cgrp:
 	return ERR_PTR(ret);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-			umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 {
 	struct cgroup *parent, *cgrp;
 	struct kernfs_node *kn;
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 */
 	kernfs_remove(cgrp->kn);
 
-	check_for_release(cgroup_parent(cgrp));
+	cgroup1_check_for_release(cgroup_parent(cgrp));
 
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	return 0;
 };
 
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 	int ret = 0;
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.remount_fs		= cgroup_remount,
-	.show_options		= cgroup_show_options,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
-	.rename			= cgroup_rename,
 	.show_path		= cgroup_show_path,
 };
 
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
 			continue;
 		}
 
-		if (cgroup_ssid_no_v1(ssid))
+		if (cgroup1_ssid_disabled(ssid))
 			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
 			       ss->name);
 
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
-
-	/*
-	 * Used to destroy pidlists and separate to serve as flush domain.
-	 * Cap @max_active to 1 too.
-	 */
-	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-						    0, 1);
-	BUG_ON(!cgroup_pidlist_destroy_wq);
-
 	return 0;
 }
 core_initcall(cgroup_wq_init);
@@ -5835,42 +4671,6 @@ out:
 	return retval;
 }
 
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
-	/*
-	 * ideally we don't want subsystems moving around while we do this.
-	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
-	 * subsys/hierarchy state.
-	 */
-	mutex_lock(&cgroup_mutex);
-
-	for_each_subsys(ss, i)
-		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps),
-			   cgroup_ssid_enabled(i));
-
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
-	.open = cgroupstats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
 	put_css_set(cset);
 }
 
-static void check_for_release(struct cgroup *cgrp)
-{
-	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
-	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-		schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence.  Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d.  The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task.  We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
-	struct cgroup *cgrp =
-		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL;
-	char *argv[3], *envp[3];
-	int ret;
-
-	mutex_lock(&cgroup_mutex);
-
-	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-	if (!pathbuf || !agentbuf)
-		goto out;
-
-	spin_lock_irq(&css_set_lock);
-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_irq(&css_set_lock);
-	if (ret < 0 || ret >= PATH_MAX)
-		goto out;
-
-	argv[0] = agentbuf;
-	argv[1] = pathbuf;
-	argv[2] = NULL;
-
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
-
-	mutex_unlock(&cgroup_mutex);
-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-	goto out_free;
-out:
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(agentbuf);
-	kfree(pathbuf);
-}
-
 static int __init cgroup_disable(char *str)
 {
 	struct cgroup_subsys *ss;
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_no_v1(char *str)
-{
-	struct cgroup_subsys *ss;
-	char *token;
-	int i;
-
-	while ((token = strsep(&str, ",")) != NULL) {
-		if (!*token)
-			continue;
-
-		if (!strcmp(token, "all")) {
-			cgroup_no_v1_mask = U16_MAX;
-			break;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name) &&
-			    strcmp(token, ss->legacy_name))
-				continue;
-
-			cgroup_no_v1_mask |= 1 << i;
-		}
-	}
-	return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 	 * have been or be removed at any point.  @kn->priv is RCU
 	 * protected for this access.  See css_release_work_fn() for details.
 	 */
-	cgrp = rcu_dereference(kn->priv);
+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
 	if (cgrp)
 		css = cgroup_css(cgrp, ss);
 
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
-	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
-	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
-	struct cgroup_namespace *new_ns;
-	int ret;
-
-	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
-	if (!new_ns)
-		return ERR_PTR(-ENOMEM);
-	ret = ns_alloc_inum(&new_ns->ns);
-	if (ret) {
-		kfree(new_ns);
-		return ERR_PTR(ret);
-	}
-	atomic_set(&new_ns->count, 1);
-	new_ns->ns.ops = &cgroupns_operations;
-	return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
-	put_css_set(ns->root_cset);
-	dec_cgroup_namespaces(ns->ucounts);
-	put_user_ns(ns->user_ns);
-	ns_free_inum(&ns->ns);
-	kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
-					struct user_namespace *user_ns,
-					struct cgroup_namespace *old_ns)
-{
-	struct cgroup_namespace *new_ns;
-	struct ucounts *ucounts;
-	struct css_set *cset;
-
-	BUG_ON(!old_ns);
-
-	if (!(flags & CLONE_NEWCGROUP)) {
-		get_cgroup_ns(old_ns);
-		return old_ns;
-	}
-
-	/* Allow only sysadmin to create cgroup namespace. */
-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	ucounts = inc_cgroup_namespaces(user_ns);
-	if (!ucounts)
-		return ERR_PTR(-ENOSPC);
-
-	/* It is not safe to take cgroup_mutex here */
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	spin_unlock_irq(&css_set_lock);
-
-	new_ns = alloc_cgroup_ns();
-	if (IS_ERR(new_ns)) {
-		put_css_set(cset);
-		dec_cgroup_namespaces(ucounts);
-		return new_ns;
-	}
-
-	new_ns->user_ns = get_user_ns(user_ns);
-	new_ns->ucounts = ucounts;
-	new_ns->root_cset = cset;
-
-	return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
-	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
-	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
-	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/* Don't need to do anything if we are attaching to our own cgroupns. */
-	if (cgroup_ns == nsproxy->cgroup_ns)
-		return 0;
-
-	get_cgroup_ns(cgroup_ns);
-	put_cgroup_ns(nsproxy->cgroup_ns);
-	nsproxy->cgroup_ns = cgroup_ns;
-
-	return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
-	struct cgroup_namespace *ns = NULL;
-	struct nsproxy *nsproxy;
-
-	task_lock(task);
-	nsproxy = task->nsproxy;
-	if (nsproxy) {
-		ns = nsproxy->cgroup_ns;
-		get_cgroup_ns(ns);
-	}
-	task_unlock(task);
-
-	return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
-	put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
-	return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
-	.name		= "cgroup",
-	.type		= CLONE_NEWCGROUP,
-	.get		= cgroupns_get,
-	.put		= cgroupns_put,
-	.install	= cgroupns_install,
-	.owner		= cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
 #ifdef CONFIG_CGROUP_BPF
 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = atomic_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-
-		seq_printf(seq, "css_set %p\n", cset);
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
-	}
-	spin_unlock_irq(&css_set_lock);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..b3088886cd37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cgroup/cpuset.c
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..cff7ea62c38f
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+	struct cgroup_namespace *new_ns;
+	int ret;
+
+	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+	if (!new_ns)
+		return ERR_PTR(-ENOMEM);
+	ret = ns_alloc_inum(&new_ns->ns);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
+	atomic_set(&new_ns->count, 1);
+	new_ns->ns.ops = &cgroupns_operations;
+	return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+	put_css_set(ns->root_cset);
+	dec_cgroup_namespaces(ns->ucounts);
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+	kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+					struct user_namespace *user_ns,
+					struct cgroup_namespace *old_ns)
+{
+	struct cgroup_namespace *new_ns;
+	struct ucounts *ucounts;
+	struct css_set *cset;
+
+	BUG_ON(!old_ns);
+
+	if (!(flags & CLONE_NEWCGROUP)) {
+		get_cgroup_ns(old_ns);
+		return old_ns;
+	}
+
+	/* Allow only sysadmin to create cgroup namespace. */
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	ucounts = inc_cgroup_namespaces(user_ns);
+	if (!ucounts)
+		return ERR_PTR(-ENOSPC);
+
+	/* It is not safe to take cgroup_mutex here */
+	spin_lock_irq(&css_set_lock);
+	cset = task_css_set(current);
+	get_css_set(cset);
+	spin_unlock_irq(&css_set_lock);
+
+	new_ns = alloc_cgroup_ns();
+	if (IS_ERR(new_ns)) {
+		put_css_set(cset);
+		dec_cgroup_namespaces(ucounts);
+		return new_ns;
+	}
+
+	new_ns->user_ns = get_user_ns(user_ns);
+	new_ns->ucounts = ucounts;
+	new_ns->root_cset = cset;
+
+	return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+	return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Don't need to do anything if we are attaching to our own cgroupns. */
+	if (cgroup_ns == nsproxy->cgroup_ns)
+		return 0;
+
+	get_cgroup_ns(cgroup_ns);
+	put_cgroup_ns(nsproxy->cgroup_ns);
+	nsproxy->cgroup_ns = cgroup_ns;
+
+	return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+	struct cgroup_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = nsproxy->cgroup_ns;
+		get_cgroup_ns(ns);
+	}
+	task_unlock(task);
+
+	return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+	put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+	return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+	.name		= "cgroup",
+	.type		= CLONE_NEWCGROUP,
+	.get		= cgroupns_get,
+	.put		= cgroupns_put,
+	.install	= cgroupns_install,
+	.owner		= cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+	return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup/pids.c
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+	RDMACG_RESOURCE_TYPE_MAX,
+	RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
+	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+	int max;
+	int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+	struct rdmacg_device	*device;
+	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
+
+	struct list_head	cg_node;
+	struct list_head	dev_node;
+
+	/* count active user tasks of this pool */
+	u64			usage_sum;
+	/* total number counts which are set to max */
+	int			num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+	return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+			       int index, int new_max)
+{
+	if (new_max == S32_MAX) {
+		if (rpool->resources[index].max != S32_MAX)
+			rpool->num_max_cnt++;
+	} else {
+		if (rpool->resources[index].max == S32_MAX)
+			rpool->num_max_cnt--;
+	}
+	rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+	int i;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+		set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_del(&rpool->cg_node);
+	list_del(&rpool->dev_node);
+	kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device)
+
+{
+	struct rdmacg_resource_pool *pool;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(pool, &cg->rpools, cg_node)
+		if (pool->device == device)
+			return pool;
+
+	return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+	if (rpool)
+		return rpool;
+
+	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+	if (!rpool)
+		return ERR_PTR(-ENOMEM);
+
+	rpool->device = device;
+	set_all_resource_max_limit(rpool);
+
+	INIT_LIST_HEAD(&rpool->cg_node);
+	INIT_LIST_HEAD(&rpool->dev_node);
+	list_add_tail(&rpool->cg_node, &cg->rpools);
+	list_add_tail(&rpool->dev_node, &device->rpools);
+	return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+		   struct rdmacg_device *device,
+		   enum rdmacg_resource_type index)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+
+	/*
+	 * rpool cannot be null at this stage. Let kernel operate in case
+	 * if there a bug in IB stack or rdma controller, instead of crashing
+	 * the system.
+	 */
+	if (unlikely(!rpool)) {
+		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+		return;
+	}
+
+	rpool->resources[index].usage--;
+
+	/*
+	 * A negative count (or overflow) is invalid,
+	 * it indicates a bug in the rdma controller.
+	 */
+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
+	rpool->usage_sum--;
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ *           stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+				     struct rdmacg_device *device,
+				     struct rdma_cgroup *stop_cg,
+				     enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *p;
+
+	mutex_lock(&rdmacg_mutex);
+
+	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+		uncharge_cg_locked(p, device, index);
+
+	mutex_unlock(&rdmacg_mutex);
+
+	css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index)
+{
+	if (index >= RDMACG_RESOURCE_MAX)
+		return;
+
+	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *cg, *p;
+	struct rdmacg_resource_pool *rpool;
+	s64 new;
+	int ret = 0;
+
+	if (index >= RDMACG_RESOURCE_MAX)
+		return -EINVAL;
+
+	/*
+	 * hold on to css, as cgroup can be removed but resource
+	 * accounting happens on css.
+	 */
+	cg = get_current_rdmacg();
+
+	mutex_lock(&rdmacg_mutex);
+	for (p = cg; p; p = parent_rdmacg(p)) {
+		rpool = get_cg_rpool_locked(p, device);
+		if (IS_ERR(rpool)) {
+			ret = PTR_ERR(rpool);
+			goto err;
+		} else {
+			new = rpool->resources[index].usage + 1;
+			if (new > rpool->resources[index].max) {
+				ret = -EAGAIN;
+				goto err;
+			} else {
+				rpool->resources[index].usage = new;
+				rpool->usage_sum++;
+			}
+		}
+	}
+	mutex_unlock(&rdmacg_mutex);
+
+	*rdmacg = cg;
+	return 0;
+
+err:
+	mutex_unlock(&rdmacg_mutex);
+	rdmacg_uncharge_hierarchy(cg, device, p, index);
+	return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+	INIT_LIST_HEAD(&device->dev_node);
+	INIT_LIST_HEAD(&device->rpools);
+
+	mutex_lock(&rdmacg_mutex);
+	list_add_tail(&device->dev_node, &rdmacg_devices);
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool, *tmp;
+
+	/*
+	 * Synchronize with any active resource settings,
+	 * usage query happening via configfs.
+	 */
+	mutex_lock(&rdmacg_mutex);
+	list_del_init(&device->dev_node);
+
+	/*
+	 * Now that this device is off the cgroup list, its safe to free
+	 * all the rpool resources.
+	 */
+	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+		free_cg_rpool_locked(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+	substring_t argstr;
+	const char **table = &rdmacg_resource_names[0];
+	char *name, *value = c;
+	size_t len;
+	int ret, i = 0;
+
+	name = strsep(&value, "=");
+	if (!name || !value)
+		return -EINVAL;
+
+	len = strlen(value);
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		if (strcmp(table[i], name))
+			continue;
+
+		argstr.from = value;
+		argstr.to = value + len;
+
+		ret = match_int(&argstr, intval);
+		if (ret >= 0) {
+			if (*intval < 0)
+				break;
+			return i;
+		}
+		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+			*intval = S32_MAX;
+			return i;
+		}
+		break;
+	}
+	return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+			       int *new_limits, unsigned long *enables)
+{
+	char *c;
+	int err = -EINVAL;
+
+	/* parse resource options */
+	while ((c = strsep(&options, " ")) != NULL) {
+		int index, intval;
+
+		index = parse_resource(c, &intval);
+		if (index < 0)
+			goto err;
+
+		new_limits[index] = intval;
+		*enables |= BIT(index);
+	}
+	return 0;
+
+err:
+	return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+	struct rdmacg_device *device;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node)
+		if (!strcmp(name, device->name))
+			return device;
+
+	return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+	const char *dev_name;
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	char *options = strstrip(buf);
+	int *new_limits;
+	unsigned long enables = 0;
+	int i = 0, ret = 0;
+
+	/* extract the device name first */
+	dev_name = strsep(&options, " ");
+	if (!dev_name) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+	if (!new_limits) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdmacg_parse_limits(options, new_limits, &enables);
+	if (ret)
+		goto parse_err;
+
+	/* acquire lock to synchronize with hot plug devices */
+	mutex_lock(&rdmacg_mutex);
+
+	device = rdmacg_get_device_locked(dev_name);
+	if (!device) {
+		ret = -ENODEV;
+		goto dev_err;
+	}
+
+	rpool = get_cg_rpool_locked(cg, device);
+	if (IS_ERR(rpool)) {
+		ret = PTR_ERR(rpool);
+		goto dev_err;
+	}
+
+	/* now set the new limits of the rpool */
+	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+		set_resource_limit(rpool, i, new_limits[i]);
+
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+
+dev_err:
+	mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+	kfree(new_limits);
+
+err:
+	return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+			       struct rdmacg_resource_pool *rpool)
+{
+	enum rdmacg_file_type sf_type;
+	int i;
+	u32 value;
+
+	sf_type = seq_cft(sf)->private;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		seq_puts(sf, rdmacg_resource_names[i]);
+		seq_putc(sf, '=');
+		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+			if (rpool)
+				value = rpool->resources[i].max;
+			else
+				value = S32_MAX;
+		} else {
+			if (rpool)
+				value = rpool->resources[i].usage;
+			else
+				value = 0;
+		}
+
+		if (value == S32_MAX)
+			seq_puts(sf, RDMACG_MAX_STR);
+		else
+			seq_printf(sf, "%d", value);
+		seq_putc(sf, ' ');
+	}
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+	struct rdmacg_device *device;
+	struct rdmacg_resource_pool *rpool;
+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
+		seq_printf(sf, "%s ", device->name);
+
+		rpool = find_cg_rpool_locked(cg, device);
+		print_rpool_values(sf, rpool);
+
+		seq_putc(sf, '\n');
+	}
+
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+	{
+		.name = "max",
+		.write = rdmacg_resource_set_max,
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_MAX,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_STAT,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct rdma_cgroup *cg;
+
+	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+	if (!cg)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cg->rpools);
+	return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+
+	kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+	struct rdmacg_resource_pool *rpool;
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(rpool, &cg->rpools, cg_node)
+		set_all_resource_max_limit(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+	.css_alloc	= rdmacg_css_alloc,
+	.css_free	= rdmacg_css_free,
+	.css_offline	= rdmacg_css_offline,
+	.legacy_cftypes	= rdmacg_files,
+	.dfl_cftypes	= rdmacg_files,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2eb3542e829..5b4e0b98f4eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10959,5 +10959,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
 	.attach		= perf_cgroup_attach,
+	/*
+	 * Implicitly enable on dfl hierarchy so that perf events can
+	 * always be filtered by cgroup2 path as long as perf_event
+	 * controller is not mounted on a legacy hierarchy.
+	 */
+	.implicit_on_dfl = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 8fdee24725a7..eafbf11442b2 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
 	FILE *fp;
 	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
+	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
 	char *token, *saved_ptr = NULL;
-	int found = 0;
 
 	fp = fopen("/proc/mounts", "r");
 	if (!fp)
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 	 * and inspect every cgroupfs mount point to find one that has
 	 * perf_event subsystem
 	 */
+	path_v1[0] = '\0';
+	path_v2[0] = '\0';
+
 	while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
 				STR(PATH_MAX)"s %*d %*d\n",
 				mountpoint, type, tokens) == 3) {
 
-		if (!strcmp(type, "cgroup")) {
+		if (!path_v1[0] && !strcmp(type, "cgroup")) {
 
 			token = strtok_r(tokens, ",", &saved_ptr);
 
 			while (token != NULL) {
 				if (!strcmp(token, "perf_event")) {
-					found = 1;
+					strcpy(path_v1, mountpoint);
 					break;
 				}
 				token = strtok_r(NULL, ",", &saved_ptr);
 			}
 		}
-		if (found)
+
+		if (!path_v2[0] && !strcmp(type, "cgroup2"))
+			strcpy(path_v2, mountpoint);
+
+		if (path_v1[0] && path_v2[0])
 			break;
 	}
 	fclose(fp);
-	if (!found)
+
+	if (path_v1[0])
+		path = path_v1;
+	else if (path_v2[0])
+		path = path_v2;
+	else
 		return -1;
 
-	if (strlen(mountpoint) < maxlen) {
-		strcpy(buf, mountpoint);
+	if (strlen(path) < maxlen) {
+		strcpy(buf, path);
 		return 0;
 	}
 	return -1;